最小化cudaDeviceSynchronize启动开销

时间:2016-08-01 11:21:42

标签: parallel-processing cuda tegra

我目前正在使用CUDA进行一个项目,其中管道刷新每1ms有200-10000个新事件。每次,我想调用一个(/两个)内核来计算一小部分输出;然后将这些输出提供给管道的下一个元素。

理论流程是:

  1. std::vector
  2. 接收数据
  3. cudaMemcpy向量的向量
  4. 处理
  5. 生成小型输出列表
  6. cudaMemcpy到输出std::vector
但是,当我在一个没有任何处理的1 block / 1 thread空内核上调用cudaDeviceSynchronize时,它平均需要0.7到1.4ms,这已经高于我的1ms时间帧。

    我最终可以更改管道的时间范围,以便每5ms接收一次事件,但每次更多5次。虽然这不太理想。

    最小化cudaDeviceSynchronize开销的最佳方法是什么?CUDA流(streams)在这种情况下会有所帮助吗?或者是否有另一种能高效运行该管道的解决方案?

    (Jetson TK1,计算能力3.2)

    这是应用程序的nvprof日志:

    ==8285== NVPROF is profiling process 8285, command: python player.py test.rec
    ==8285== Profiling application: python player.py test.rec
    ==8285== Profiling result:
    Time(%)      Time     Calls       Avg       Min       Max  Name
     94.92%  47.697ms      5005  9.5290us  1.7500us  13.083us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
      5.08%  2.5538ms         8  319.23us  99.750us  413.42us  [CUDA memset]
    
    ==8285== API calls:
    Time(%)      Time     Calls       Avg       Min       Max  Name
     75.00%  5.03966s      5005  1.0069ms  25.083us  11.143ms  cudaDeviceSynchronize
     17.44%  1.17181s      5005  234.13us  83.750us  3.1391ms  cudaLaunch
      4.71%  316.62ms         9  35.180ms  23.083us  314.99ms  cudaMalloc
      2.30%  154.31ms     50050  3.0830us  1.0000us  2.6866ms  cudaSetupArgument
      0.52%  34.857ms      5005  6.9640us  2.5000us  464.67us  cudaConfigureCall
      0.02%  1.2048ms         8  150.60us  71.917us  183.33us  cudaMemset
      0.01%  643.25us        83  7.7490us  1.3330us  287.42us  cuDeviceGetAttribute
      0.00%  12.916us         2  6.4580us  2.0000us  10.916us  cuDeviceGetCount
      0.00%  5.3330us         1  5.3330us  5.3330us  5.3330us  cuDeviceTotalMem
      0.00%  4.0830us         1  4.0830us  4.0830us  4.0830us  cuDeviceGetName
      0.00%  3.4160us         2  1.7080us  1.5830us  1.8330us  cuDeviceGet
    

    下面是程序的一个小型重构(nvprof日志在末尾)——由于某种原因,cudaDeviceSynchronize的平均耗时降低了4倍,但对于一个空的1 block / 1 thread内核来说,它仍然很高:

    /* Compile with `nvcc test.cu -I.`
     * with -I pointing to "helper_cuda.h" and "helper_string.h" from CUDA samples
     **/
    #include <iostream>
    #include <cuda.h>
    #include <helper_cuda.h>
    
    #define MAX_INPUT_BUFFER_SIZE 131072
    
    // One input event as consumed by reset_timesurface.
    // NOTE(review): field semantics are not visible in this file — x/y look
    // like pixel coordinates within the width_ x height_ frame and b like a
    // timestamp (the kernel also takes a long long ts); confirm with the
    // producer of these events before relying on this.
    typedef struct {
        unsigned short x;
        unsigned short y;
        short a;
        long long b;
    } Event;
    
    // Device-side buffers; all are allocated (and, except d_data_, zeroed)
    // in main() and passed verbatim to the reset_timesurface kernel.
    long long *d_a_[2], *d_b_[2];  // each: width_*height_*2 long longs (pairs of buffers)
    float *d_as_, *d_bs_;          // each: width_*height_*2 floats
    bool *d_some_bool_[2];         // each: width_*height_*2 bools
    Event *d_data_;                // MAX_INPUT_BUFFER_SIZE events; never memset
    int width_ = 320;              // frame dimensions used to size the buffers
    int height_ = 240;
    
    // Deliberately empty kernel used to isolate the launch + synchronize
    // overhead being measured in this question. The parameter list presumably
    // mirrors the real kernel's signature so the launch cost is realistic
    // (argument setup shows up as 10 cudaSetupArgument calls per launch in
    // the nvprof log above).
    __global__ void reset_timesurface(long long ts,
            long long *d_a_0, long long *d_a_1,
            long long *d_b_0, long long *d_b_1,
            float *d_as, float *d_bs,
            bool *d_some_bool_0, bool *d_some_bool_1, Event *d_data) {
        // intentionally empty — any measured time is pure launch/sync overhead
    }
    // Host wrapper: launches the (empty) reset_timesurface kernel once and
    // blocks until it has completed, so each call measures one full
    // launch + cudaDeviceSynchronize cycle.
    // @param ts  timestamp forwarded verbatim to the kernel.
    void reset_errors(long long ts) {
        static const int n  = 1024;
        // Launch configuration of the real (non-benchmark) kernel: one thread
        // per element of the width_ x height_ frame, rounded up to whole
        // blocks. Intentionally NOT used below — the benchmark deliberately
        // launches <<<1, 1>>> to show the overhead is config-independent.
        static const dim3 grid_size(width_ * height_ / n
                + (width_ * height_ % n != 0), 1, 1);
        static const dim3 block_dim(n, 1, 1);

        reset_timesurface<<<1, 1>>>(ts, d_a_[0], d_a_[1],
                d_b_[0], d_b_[1],
                d_as_, d_bs_,
                d_some_bool_[0], d_some_bool_[1], d_data_);
        // A kernel launch returns no status directly; configuration errors
        // only surface via cudaGetLastError(). Check it, consistent with the
        // checkCudaErrors() convention used for every other call in main().
        checkCudaErrors(cudaGetLastError());
        // Execution errors (and completion) surface here; this per-call sync
        // is exactly the cost being measured/minimized in this question.
        checkCudaErrors(cudaDeviceSynchronize());
        //  static long long *h_holder = (long long*)malloc(sizeof(long long) * 2000);
        //  cudaMemcpy(h_holder, d_a_[0], 0, cudaMemcpyDeviceToHost);
    }
    
    // Allocates a device buffer of 'bytes' bytes and zero-fills it, aborting
    // via checkCudaErrors (helper_cuda.h) on any CUDA failure.
    static void alloc_zeroed(void **ptr, size_t bytes) {
        checkCudaErrors(cudaMalloc(ptr, bytes));
        checkCudaErrors(cudaMemset(*ptr, 0, bytes));
    }

    // Benchmark driver: allocate every device buffer, run the empty-kernel
    // launch/sync cycle 5005 times, then release everything.
    int main(void) {
        const size_t ll_bytes    = sizeof(long long) * width_ * height_ * 2;
        const size_t float_bytes = sizeof(float) * width_ * height_ * 2;
        const size_t bool_bytes  = sizeof(bool) * width_ * height_ * 2;

        alloc_zeroed((void **)&d_a_[0], ll_bytes);
        alloc_zeroed((void **)&d_a_[1], ll_bytes);
        alloc_zeroed((void **)&d_b_[0], ll_bytes);
        alloc_zeroed((void **)&d_b_[1], ll_bytes);
        alloc_zeroed((void **)&d_as_, float_bytes);
        alloc_zeroed((void **)&d_bs_, float_bytes);
        alloc_zeroed((void **)&d_some_bool_[0], bool_bytes);
        alloc_zeroed((void **)&d_some_bool_[1], bool_bytes);
        // The event buffer is allocated but — unlike the others — never zeroed.
        checkCudaErrors(cudaMalloc(&d_data_, sizeof(Event) * MAX_INPUT_BUFFER_SIZE));

        for (int i = 0; i < 5005; ++i)
            reset_errors(16487L);

        // Free every device allocation, in the same order as the allocations.
        void *buffers[] = { d_a_[0], d_a_[1], d_b_[0], d_b_[1],
                            d_as_, d_bs_,
                            d_some_bool_[0], d_some_bool_[1], d_data_ };
        for (size_t i = 0; i < sizeof(buffers) / sizeof(buffers[0]); ++i)
            cudaFree(buffers[i]);
        cudaDeviceReset();
    }
    
    /* nvprof ./a.out
    ==9258== NVPROF is profiling process 9258, command: ./a.out
    ==9258== Profiling application: ./a.out
    ==9258== Profiling result:
    Time(%)      Time     Calls       Avg       Min       Max  Name
     92.64%  48.161ms      5005  9.6220us  6.4160us  13.250us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
      7.36%  3.8239ms         8  477.99us  148.92us  620.17us  [CUDA memset]
    
    ==9258== API calls:
    Time(%)      Time     Calls       Avg       Min       Max  Name
     53.12%  1.22036s      5005  243.83us  9.6670us  8.5762ms  cudaDeviceSynchronize
     25.10%  576.78ms      5005  115.24us  44.250us  11.888ms  cudaLaunch
      9.13%  209.77ms         9  23.308ms  16.667us  208.54ms  cudaMalloc
      6.56%  150.65ms         1  150.65ms  150.65ms  150.65ms  cudaDeviceReset
      5.33%  122.39ms     50050  2.4450us     833ns  6.1167ms  cudaSetupArgument
      0.60%  13.808ms      5005  2.7580us  1.0830us  104.25us  cudaConfigureCall
      0.10%  2.3845ms         9  264.94us  22.333us  537.75us  cudaFree
      0.04%  938.75us         8  117.34us  58.917us  169.08us  cudaMemset
      0.02%  461.33us        83  5.5580us  1.4160us  197.58us  cuDeviceGetAttribute
      0.00%  15.500us         2  7.7500us  3.6670us  11.833us  cuDeviceGetCount
      0.00%  7.6670us         1  7.6670us  7.6670us  7.6670us  cuDeviceTotalMem
      0.00%  4.8340us         1  4.8340us  4.8340us  4.8340us  cuDeviceGetName
      0.00%  3.6670us         2  1.8330us  1.6670us  2.0000us  cuDeviceGet
    */
    

1 个答案:

答案 0 :(得分:1)

正如在原始消息的评论中详细说明的那样,我的问题与我正在使用的GPU(Tegra K1)密切相关。这是我针对这个特定问题找到的答案;它也可能对其他GPU有用。我的Jetson TK1上cudaDeviceSynchronize的平均耗时从250us降低到了10us。

Tegra的GPU时钟频率默认为72000kHz,我们必须使用以下命令将其设置为852000kHz:

$ echo 852000000 > /sys/kernel/debug/clock/override.gbus/rate
$ echo 1 > /sys/kernel/debug/clock/override.gbus/state

我们可以使用以下命令找到可用频率列表:

$ cat /sys/kernel/debug/clock/gbus/possible_rates
72000 108000 180000 252000 324000 396000 468000 540000 612000 648000 684000 708000 756000 804000 852000 (kHz)

还可以在CPU和GPU上获得更多性能(同样,以更高的功耗为代价);请查看此链接以获取更多信息。