最小化cudaDeviceSynchronize启动开销

时间:2016-08-01 11:21:42

标签: parallel-processing cuda tegra

我目前正在使用CUDA进行一个项目,其中管道刷新每1ms有200-10000个新事件。每次,我想调用一个(/两个)内核来计算一小部分输出;然后将这些输出提供给管道的下一个元素。

理论流程是:

  1. std::vector
  2. 接收数据
  3. cudaMemcpy向量的向量
  4. 处理
  5. 生成小型输出列表
  6. cudaMemcpy到输出std::vector
但是,当我在一个没有任何处理的1 block / 1 thread空内核上调用cudaDeviceSynchronize时,它平均需要0.7到1.4ms,这已经高于我的1ms时间帧。

    我最终可以更改管道的时间范围,以便每5ms接收一次事件,但每次更多5次。虽然这不太理想。

    最小化cudaDeviceSynchronize开销的最佳方法是什么?CUDA流(streams)在这种情况下会有所帮助吗?或者是否有另一种能高效运行该管道的解决方案?

    (Jetson TK1,计算能力3.2)

    这是应用程序的nvprof日志:

    ==8285== NVPROF is profiling process 8285, command: python player.py test.rec
    ==8285== Profiling application: python player.py test.rec
    ==8285== Profiling result:
    Time(%)      Time     Calls       Avg       Min       Max  Name
     94.92%  47.697ms      5005  9.5290us  1.7500us  13.083us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
      5.08%  2.5538ms         8  319.23us  99.750us  413.42us  [CUDA memset]
    
    ==8285== API calls:
    Time(%)      Time     Calls       Avg       Min       Max  Name
     75.00%  5.03966s      5005  1.0069ms  25.083us  11.143ms  cudaDeviceSynchronize
     17.44%  1.17181s      5005  234.13us  83.750us  3.1391ms  cudaLaunch
      4.71%  316.62ms         9  35.180ms  23.083us  314.99ms  cudaMalloc
      2.30%  154.31ms     50050  3.0830us  1.0000us  2.6866ms  cudaSetupArgument
      0.52%  34.857ms      5005  6.9640us  2.5000us  464.67us  cudaConfigureCall
      0.02%  1.2048ms         8  150.60us  71.917us  183.33us  cudaMemset
      0.01%  643.25us        83  7.7490us  1.3330us  287.42us  cuDeviceGetAttribute
      0.00%  12.916us         2  6.4580us  2.0000us  10.916us  cuDeviceGetCount
      0.00%  5.3330us         1  5.3330us  5.3330us  5.3330us  cuDeviceTotalMem
      0.00%  4.0830us         1  4.0830us  4.0830us  4.0830us  cuDeviceGetName
      0.00%  3.4160us         2  1.7080us  1.5830us  1.8330us  cuDeviceGet
    

    下面是程序的一个小型重构(nvprof日志在末尾)——由于某种原因,cudaDeviceSynchronize的平均耗时降低了4倍,但对于一个空的1 block / 1 thread内核来说,它仍然很高:

    /* Compile with `nvcc test.cu -I.`
     * with -I pointing to "helper_cuda.h" and "helper_string.h" from CUDA samples
     **/
    #include <iostream>
    #include <cuda.h>
    #include <helper_cuda.h>
    
    #define MAX_INPUT_BUFFER_SIZE 131072
    
    // One input event as consumed by reset_timesurface.
    // NOTE(review): field semantics are not visible in this file — x/y look
    // like pixel coordinates within the width_ x height_ frame and b like a
    // timestamp (the kernel also takes a long long ts); confirm with the
    // producer of these events before relying on this.
    typedef struct {
        unsigned short x;
        unsigned short y;
        short a;
        long long b;
    } Event;
    
    // Device-side buffers; all are allocated (and, except d_data_, zeroed)
    // in main() and passed verbatim to the reset_timesurface kernel.
    long long *d_a_[2], *d_b_[2];  // each: width_*height_*2 long longs (pairs of buffers)
    float *d_as_, *d_bs_;          // each: width_*height_*2 floats
    bool *d_some_bool_[2];         // each: width_*height_*2 bools
    Event *d_data_;                // MAX_INPUT_BUFFER_SIZE events; never memset
    int width_ = 320;              // frame dimensions used to size the buffers
    int height_ = 240;
    
    // Deliberately empty kernel used to isolate the launch + synchronize
    // overhead being measured in this question. The parameter list presumably
    // mirrors the real kernel's signature so the launch cost is realistic
    // (argument setup shows up as 10 cudaSetupArgument calls per launch in
    // the nvprof log above).
    __global__ void reset_timesurface(long long ts,
            long long *d_a_0, long long *d_a_1,
            long long *d_b_0, long long *d_b_1,
            float *d_as, float *d_bs,
            bool *d_some_bool_0, bool *d_some_bool_1, Event *d_data) {
        // intentionally empty — any measured time is pure launch/sync overhead
    }
    // Host wrapper: launches the (empty) reset_timesurface kernel once and
    // blocks until it has completed, so each call measures one full
    // launch + cudaDeviceSynchronize cycle.
    // @param ts  timestamp forwarded verbatim to the kernel.
    void reset_errors(long long ts) {
        static const int n  = 1024;
        // Launch configuration of the real (non-benchmark) kernel: one thread
        // per element of the width_ x height_ frame, rounded up to whole
        // blocks. Intentionally NOT used below — the benchmark deliberately
        // launches <<<1, 1>>> to show the overhead is config-independent.
        static const dim3 grid_size(width_ * height_ / n
                + (width_ * height_ % n != 0), 1, 1);
        static const dim3 block_dim(n, 1, 1);

        reset_timesurface<<<1, 1>>>(ts, d_a_[0], d_a_[1],
                d_b_[0], d_b_[1],
                d_as_, d_bs_,
                d_some_bool_[0], d_some_bool_[1], d_data_);
        // A kernel launch returns no status directly; configuration errors
        // only surface via cudaGetLastError(). Check it, consistent with the
        // checkCudaErrors() convention used for every other call in main().
        checkCudaErrors(cudaGetLastError());
        // Execution errors (and completion) surface here; this per-call sync
        // is exactly the cost being measured/minimized in this question.
        checkCudaErrors(cudaDeviceSynchronize());
        //  static long long *h_holder = (long long*)malloc(sizeof(long long) * 2000);
        //  cudaMemcpy(h_holder, d_a_[0], 0, cudaMemcpyDeviceToHost);
    }
    
    // Allocates a device buffer of 'bytes' bytes and zero-fills it, aborting
    // via checkCudaErrors (helper_cuda.h) on any CUDA failure.
    static void alloc_zeroed(void **ptr, size_t bytes) {
        checkCudaErrors(cudaMalloc(ptr, bytes));
        checkCudaErrors(cudaMemset(*ptr, 0, bytes));
    }

    // Benchmark driver: allocate every device buffer, run the empty-kernel
    // launch/sync cycle 5005 times, then release everything.
    int main(void) {
        const size_t ll_bytes    = sizeof(long long) * width_ * height_ * 2;
        const size_t float_bytes = sizeof(float) * width_ * height_ * 2;
        const size_t bool_bytes  = sizeof(bool) * width_ * height_ * 2;

        alloc_zeroed((void **)&d_a_[0], ll_bytes);
        alloc_zeroed((void **)&d_a_[1], ll_bytes);
        alloc_zeroed((void **)&d_b_[0], ll_bytes);
        alloc_zeroed((void **)&d_b_[1], ll_bytes);
        alloc_zeroed((void **)&d_as_, float_bytes);
        alloc_zeroed((void **)&d_bs_, float_bytes);
        alloc_zeroed((void **)&d_some_bool_[0], bool_bytes);
        alloc_zeroed((void **)&d_some_bool_[1], bool_bytes);
        // The event buffer is allocated but — unlike the others — never zeroed.
        checkCudaErrors(cudaMalloc(&d_data_, sizeof(Event) * MAX_INPUT_BUFFER_SIZE));

        for (int i = 0; i < 5005; ++i)
            reset_errors(16487L);

        // Free every device allocation, in the same order as the allocations.
        void *buffers[] = { d_a_[0], d_a_[1], d_b_[0], d_b_[1],
                            d_as_, d_bs_,
                            d_some_bool_[0], d_some_bool_[1], d_data_ };
        for (size_t i = 0; i < sizeof(buffers) / sizeof(buffers[0]); ++i)
            cudaFree(buffers[i]);
        cudaDeviceReset();
    }
    
    /* nvprof ./a.out
    ==9258== NVPROF is profiling process 9258, command: ./a.out
    ==9258== Profiling application: ./a.out
    ==9258== Profiling result:
    Time(%)      Time     Calls       Avg       Min       Max  Name
     92.64%  48.161ms      5005  9.6220us  6.4160us  13.250us  reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
      7.36%  3.8239ms         8  477.99us  148.92us  620.17us  [CUDA memset]
    
    ==9258== API calls:
    Time(%)      Time     Calls       Avg       Min       Max  Name
     53.12%  1.22036s      5005  243.83us  9.6670us  8.5762ms  cudaDeviceSynchronize
     25.10%  576.78ms      5005  115.24us  44.250us  11.888ms  cudaLaunch
      9.13%  209.77ms         9  23.308ms  16.667us  208.54ms  cudaMalloc
      6.56%  150.65ms         1  150.65ms  150.65ms  150.65ms  cudaDeviceReset
      5.33%  122.39ms     50050  2.4450us     833ns  6.1167ms  cudaSetupArgument
      0.60%  13.808ms      5005  2.7580us  1.0830us  104.25us  cudaConfigureCall
      0.10%  2.3845ms         9  264.94us  22.333us  537.75us  cudaFree
      0.04%  938.75us         8  117.34us  58.917us  169.08us  cudaMemset
      0.02%  461.33us        83  5.5580us  1.4160us  197.58us  cuDeviceGetAttribute
      0.00%  15.500us         2  7.7500us  3.6670us  11.833us  cuDeviceGetCount
      0.00%  7.6670us         1  7.6670us  7.6670us  7.6670us  cuDeviceTotalMem
      0.00%  4.8340us         1  4.8340us  4.8340us  4.8340us  cuDeviceGetName
      0.00%  3.6670us         2  1.8330us  1.6670us  2.0000us  cuDeviceGet
    */
    

1 个答案:

答案 0 :(得分:1)

正如在原始消息的评论中详细说明的那样,我的问题与我正在使用的GPU(Tegra K1)密切相关。这是我针对这个特定问题找到的答案;它也可能对其他GPU有用。我的Jetson TK1上cudaDeviceSynchronize的平均耗时从250us降低到了10us。

Tegra的GPU时钟频率默认为72000kHz,我们必须使用以下命令将其设置为852000kHz:

$ echo 852000000 > /sys/kernel/debug/clock/override.gbus/rate
$ echo 1 > /sys/kernel/debug/clock/override.gbus/state

我们可以使用以下命令找到可用频率列表:

$ cat /sys/kernel/debug/clock/gbus/possible_rates
72000 108000 180000 252000 324000 396000 468000 540000 612000 648000 684000 708000 756000 804000 852000 (kHz)

还可以在CPU和GPU上获得更多性能(同样,以更高的功耗为代价);请查看此链接以获取更多信息。