Question

CUDA 8.0 中，cudaMemcpy() 是一次性复制整个内存块，还是逐字节复制？

我想估算复制时间的上限，但在文档中找不到任何说明 cudaMemcpy() 是线性时间操作还是恒定时间操作的内容。

Answer 1

同步内存传输不是恒定时间操作，它的耗时由两部分组成：一个固定的延迟部分，以及一个与传输大小成正比的部分。传输量较小时，延迟占主导地位；传输量较大时，传输速度受内存带宽或总线带宽的限制。

考虑以下简单的基准测试程序：

#include <algorithm>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <string>

/**
 * Fill x[0..n) so that every element holds the threadIdx.x of the thread
 * that wrote it.
 *
 * Each thread starts at its flat global index and advances by the total
 * number of launched threads, so any 1D grid/block configuration covers
 * the whole array. No shared memory is used.
 *
 * @param x  device pointer to at least n ints
 * @param n  number of elements to write; nothing is written when n <= 0
 */
__global__ void memsetkernel(int *x, int n)
{
    // 64-bit index and stride: with plain int, `idx += stride` overflows
    // (undefined behaviour) once n gets close to INT_MAX.
    const long long stride = (long long)blockDim.x * gridDim.x;
    long long idx = threadIdx.x + (long long)blockIdx.x * blockDim.x;
    for (; idx < n; idx += stride) {
        x[idx] = threadIdx.x;
    }
}

// Abort with a readable message when a CUDA runtime call fails. Without
// this, a failed call is silently ignored and the benchmark prints
// meaningless numbers.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            std::cerr << "CUDA error at line " << __LINE__ << ": "          \
                      << cudaGetErrorString(status_) << std::endl;          \
            std::exit(EXIT_FAILURE);                                        \
        }                                                                   \
    } while (0)

// Record `stop`, wait for it, and return the milliseconds elapsed since `start`.
static float elapsedMs(cudaEvent_t start, cudaEvent_t stop)
{
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    return ms;
}

// Print the average bandwidth of `reps` operations that each moved `bytes`
// bytes and together took `totalMs` milliseconds.
static void reportBandwidth(const char* label, size_t bytes, float totalMs, int reps)
{
    const float milliseconds = totalMs / float(reps); // average of reps
    const float kilobytes = float(bytes) / 1e3f;
    // kB per ms is numerically MB per s (megabytes, not megabits).
    std::cout << label << ": " << kilobytes / milliseconds << " MB/s" << std::endl;
}

/**
 * Benchmark three ways of moving n ints: a device-side fill kernel, a
 * device-to-host cudaMemcpy and a host-to-device cudaMemcpy. Each is run
 * once as a warm-up and then timed over nreps repetitions with CUDA events.
 *
 * Usage: prog [element_count]   (default 100)
 *
 * @return EXIT_SUCCESS on success, EXIT_FAILURE on bad arguments; any CUDA
 *         failure terminates the process through CUDA_CHECK.
 */
int main(int argc, char* argv[])
{
    // size
    int n = 100;
    const int nreps = 10;

    if (argc > 1) {
        try {
            n = std::stoi(std::string(argv[1]));
        } catch (const std::exception&) {
            // std::stoi throws on non-numeric or out-of-range input.
            std::cerr << "usage: " << argv[0] << " [element_count]" << std::endl;
            return EXIT_FAILURE;
        }
    }
    if (n < 1) {
        std::cerr << "element count must be positive" << std::endl;
        return EXIT_FAILURE;
    }

    const size_t sz = sizeof(int) * size_t(n);

    // host array; first written by the device-to-host copy below, so it is
    // initialised before it is ever used as a copy source
    int* host = new int[n];

    // allocate n ints on device
    int* device = nullptr;
    CUDA_CHECK(cudaMalloc(&device, sz));
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    {
        const int nthreads = 1024;
        // At most 13*2 blocks (NOTE(review): presumably 2 blocks per SM on a
        // 13-SM device -- confirm for your GPU), at least 1.
        const int nblocks = std::max(1, std::min(13*2, n / nthreads));

        // Warm-up launch, excluded from the timing.
        memsetkernel<<<nblocks, nthreads>>>(device, n);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaDeviceSynchronize());

        CUDA_CHECK(cudaEventRecord(start));
        for (int i = 0; i < nreps; i++) {
            memsetkernel<<<nblocks, nthreads>>>(device, n);
        }
        const float total = elapsedMs(start, stop);
        CUDA_CHECK(cudaGetLastError()); // launch errors are not reported by <<<>>> itself
        reportBandwidth("kernel assignment", sz, total, nreps);
    }

    {
        // Warm-up copy, excluded from the timing.
        CUDA_CHECK(cudaMemcpy(host, device, sz, cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaEventRecord(start));
        for (int i = 0; i < nreps; i++) {
            CUDA_CHECK(cudaMemcpy(host, device, sz, cudaMemcpyDeviceToHost));
        }
        reportBandwidth("DTOH", sz, elapsedMs(start, stop), nreps);
    }

    {
        // Warm-up copy, excluded from the timing.
        CUDA_CHECK(cudaMemcpy(device, host, sz, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaEventRecord(start));
        for (int i = 0; i < nreps; i++) {
            CUDA_CHECK(cudaMemcpy(device, host, sz, cudaMemcpyHostToDevice));
        }
        reportBandwidth("HTOD", sz, elapsedMs(start, stop), nreps);
    }

    // release everything that was acquired above
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(device));
    delete[] host;

    // reset device
    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}

以不同的数据大小运行此操作会显示以下行为：

设备到主机和主机到设备的传输速度都渐近地接近所测机器 PCI-e 总线带宽的 60% 左右（约 6.5 GB/s，使用固定（pinned）主机内存可以达到更高的值），而内核达到了 GPU 主存带宽的 70% 左右（约 150 GB/s，理论最大带宽约为 224 GB/s）。

NVIDIA 提供了一个用于测量传输带宽的示例（CUDA Samples 中的 bandwidthTest），您可以阅读其说明，并用它来自己探索硬件的性能。

CUDA 8.0 - cudaMemcpy（） - 线性或恒定时间操作？

1 个答案: