Getting different timing results when running the same CUDA code multiple times?

Asked: 2017-02-27 00:01:00

Tags: performance cuda

I was hoping to see the performance gain from kernel fusion by running this code, but I am getting different run times for the same piece of code.

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/transform_reduce.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

template <class T>
struct square
{
    __host__ __device__
    T operator()(const T &x) const
    {
        return x * x;
    }
};

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    const int numOfEle = 500;
    std::cout<<"profiling norm with " << numOfEle << " elements" << std::endl;
    thrust::device_vector<float> dv(numOfEle);
    thrust::sequence(dv.begin(), dv.end());
    float init = 0.0f;
    float norm = 0.0f;
    float miliseconds = 0.0f;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    // same code runs for multiple times
    cudaEventRecord(start);
    norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"transform_reduce: "<<"norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    cudaEventRecord(start);
    thrust::device_vector<float> dv2(numOfEle);
    thrust::transform(dv.begin(), dv.end(), dv2.begin(), square<float>());
    norm = thrust::reduce(dv2.begin(), dv2.end(), 0.0f, thrust::plus<float>());
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&miliseconds, start, stop);
    std::cout<<"naive implementation: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;

    return 0;
}

Here are the results I got:

profiling norm with 500 elements
transform_reduce: norm:4.15417e+07,miliseconds:0.323232
transform_reduce: norm:4.15417e+07,miliseconds:0.192128
transform_reduce: norm:4.15417e+07,miliseconds:0.186848
naive implementation: norm:4.15417e+07,miliseconds:0.211328

Why is the first run time (0.323232) so much larger? Am I missing something about how CUDA programs should be profiled here? Thanks!

1 Answer:

Answer 0 (score: 1)

The first execution is the slowest because it incurs some additional runtime API setup latency compared to the later calls. But your example is really only measuring latency rather than compute time, because the parallel work in your example is so small. Consider the following modification of your code:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <cuda_profiler_api.h>
template <class T>
struct square
{
    __host__ __device__ T operator()(const T &x) const { return x * x; }
};

void dorun(int numOfEle, int Nreps)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    std::cout<<"profiling norm with " << numOfEle << " elements" << std::endl;
    thrust::device_vector<float> dv(numOfEle);
    thrust::sequence(dv.begin(), dv.end());
    thrust::device_vector<float> dv2(numOfEle);
    cudaDeviceSynchronize();

    cudaProfilerStart();
    for(int i=0; i<Nreps; i++) {
        float norm = 0.0f, miliseconds = 0.0f;
        cudaEventRecord(start);
        thrust::transform(dv.begin(), dv.end(), dv2.begin(), square<float>());
        norm = thrust::reduce(dv2.begin(), dv2.end(), 0.0f, thrust::plus<float>());
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&miliseconds, start, stop);
        std::cout<<i<<" naive implementation: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;
    }

    for(int i=0; i<Nreps; i++) {
        float init = 0.0f, norm = 0.0f, miliseconds = 0.0f;
        cudaEventRecord(start);
        norm = thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), init, thrust::plus<float>());
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&miliseconds, start, stop);
        std::cout<<i<<" transform_reduce: norm:"<<norm<<",miliseconds:"<<miliseconds<<std::endl;
    }
    cudaProfilerStop();
}

int main()
{
    const int Nreps = 4;
    int numOfEle = 500;

    for(int i=0; i<7; i++, numOfEle *= 10) {
        dorun(numOfEle, Nreps);
        cudaDeviceReset();
    }
    return 0;
}

Here, both versions of the transformed reduction are each run several times at a number of different sizes, with the naïve version run first, just to confirm that this effect is not a property of transform_reduce:

$ nvcc -arch=sm_52 runtime.cu -o runtime
$ ./runtime
profiling norm with 500 elements
0 naive implementation: norm:4.15417e+07,miliseconds:0.345088
1 naive implementation: norm:4.15417e+07,miliseconds:0.219968
2 naive implementation: norm:4.15417e+07,miliseconds:0.215008
3 naive implementation: norm:4.15417e+07,miliseconds:0.212864
0 transform_reduce: norm:4.15417e+07,miliseconds:0.196704
1 transform_reduce: norm:4.15417e+07,miliseconds:0.194432
2 transform_reduce: norm:4.15417e+07,miliseconds:0.19328
3 transform_reduce: norm:4.15417e+07,miliseconds:0.192992
profiling norm with 5000 elements
0 naive implementation: norm:4.16542e+10,miliseconds:0.312928
1 naive implementation: norm:4.16542e+10,miliseconds:0.194784
2 naive implementation: norm:4.16542e+10,miliseconds:0.192032
3 naive implementation: norm:4.16542e+10,miliseconds:0.191008
0 transform_reduce: norm:4.16542e+10,miliseconds:0.179232
1 transform_reduce: norm:4.16542e+10,miliseconds:0.177568
2 transform_reduce: norm:4.16542e+10,miliseconds:0.177664
3 transform_reduce: norm:4.16542e+10,miliseconds:0.17664
profiling norm with 50000 elements
0 naive implementation: norm:4.16654e+13,miliseconds:0.288864
1 naive implementation: norm:4.16654e+13,miliseconds:0.189472
2 naive implementation: norm:4.16654e+13,miliseconds:0.186464
3 naive implementation: norm:4.16654e+13,miliseconds:0.18592
0 transform_reduce: norm:4.16654e+13,miliseconds:0.174848
1 transform_reduce: norm:4.16654e+13,miliseconds:0.190176
2 transform_reduce: norm:4.16654e+13,miliseconds:0.173216
3 transform_reduce: norm:4.16654e+13,miliseconds:0.187744
profiling norm with 500000 elements
0 naive implementation: norm:4.16665e+16,miliseconds:0.300192
1 naive implementation: norm:4.16665e+16,miliseconds:0.203936
2 naive implementation: norm:4.16665e+16,miliseconds:0.2008
3 naive implementation: norm:4.16665e+16,miliseconds:0.199232
0 transform_reduce: norm:4.16665e+16,miliseconds:0.197984
1 transform_reduce: norm:4.16665e+16,miliseconds:0.191776
2 transform_reduce: norm:4.16665e+16,miliseconds:0.192096
3 transform_reduce: norm:4.16665e+16,miliseconds:0.191264
profiling norm with 5000000 elements
0 naive implementation: norm:4.16667e+19,miliseconds:0.525504
1 naive implementation: norm:4.16667e+19,miliseconds:0.50608
2 naive implementation: norm:4.16667e+19,miliseconds:0.505216
3 naive implementation: norm:4.16667e+19,miliseconds:0.504896
0 transform_reduce: norm:4.16667e+19,miliseconds:0.345792
1 transform_reduce: norm:4.16667e+19,miliseconds:0.344736
2 transform_reduce: norm:4.16667e+19,miliseconds:0.344512
3 transform_reduce: norm:4.16667e+19,miliseconds:0.34384
profiling norm with 50000000 elements
0 naive implementation: norm:4.16667e+22,miliseconds:4.56586
1 naive implementation: norm:4.16667e+22,miliseconds:4.5408
2 naive implementation: norm:4.16667e+22,miliseconds:4.62774
3 naive implementation: norm:4.16667e+22,miliseconds:4.54912
0 transform_reduce: norm:4.16667e+22,miliseconds:1.68493
1 transform_reduce: norm:4.16667e+22,miliseconds:1.67744
2 transform_reduce: norm:4.16667e+22,miliseconds:1.76778
3 transform_reduce: norm:4.16667e+22,miliseconds:1.86694
profiling norm with 500000000 elements
0 naive implementation: norm:4.16667e+25,miliseconds:63.7808
1 naive implementation: norm:4.16667e+25,miliseconds:63.813
2 naive implementation: norm:4.16667e+25,miliseconds:62.8569
3 naive implementation: norm:4.16667e+25,miliseconds:61.5553
0 transform_reduce: norm:4.16667e+25,miliseconds:14.7033
1 transform_reduce: norm:4.16667e+25,miliseconds:14.6545
2 transform_reduce: norm:4.16667e+25,miliseconds:14.655
3 transform_reduce: norm:4.16667e+25,miliseconds:14.5933

Note how the execution times barely change with the sample size until we reach 5,000,000 elements, and that at 500,000,000 elements the first run is no longer the slowest. This is all down to fixed latency: once the actual parallel work is much larger than that fixed latency, the latency becomes irrelevant.
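
One common way to keep this fixed first-call latency out of the measurements is to do an untimed warm-up call before the timed loop. A minimal sketch along those lines (the warm-up step is an illustration added here, not something from the original code):

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

template <class T>
struct square
{
    __host__ __device__ T operator()(const T &x) const { return x * x; }
};

int main()
{
    thrust::device_vector<float> dv(500);
    thrust::sequence(dv.begin(), dv.end());

    // Untimed warm-up call: pays the one-off runtime/Thrust setup cost here,
    // so it is not attributed to the first timed iteration.
    thrust::transform_reduce(dv.begin(), dv.end(), square<float>(), 0.0f, thrust::plus<float>());
    cudaDeviceSynchronize();

    // ... cudaEventRecord()/cudaEventElapsedTime() timing of the real runs would follow here ...
    return 0;
}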

So let's look at some profiler output in detail. First, part of the API trace for the launch of the first kernel in the transform call at the small size:

240.66ms  2.6860us  cudaFuncGetAttributes
240.66ms  2.5910us  cudaFuncGetAttributes
240.66ms     527ns  cudaConfigureCall
240.66ms     401ns  cudaSetupArgument
240.67ms  1.7100ms  cudaLaunch (void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<

And then the second:

242.82ms  2.6440us  cudaFuncGetAttributes
242.83ms  2.6460us  cudaFuncGetAttributes
242.83ms     557ns  cudaConfigureCall
242.83ms     394ns  cudaSetupArgument
242.83ms  16.992us  cudaLaunch (void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<

The first asynchronous launch takes 1.7 ms, whereas the second takes 16 µs. But if we look at the GPU trace for the same run, we see this for the first call:

   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
229.58ms  2.0800us              (1 1 1)      (1024 1 1)        12       32B        0B         -           -  GeForce GTX 970         1         7  void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<thrust::system::cuda::detail::bulk_::agent<unsigned long=1>, unsigned long=0>, unsigned long=0>, thrust::system::cuda::detail::bulk_::detail::closure<thrust::system::cuda::detail::for_each_n_detail::for_each_kernel, thrust::tuple<thrust::system::cuda::detail::bulk_::detail::cursor<unsigned int=0>, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<thrust::detail::unary_transform_functor<square<float>>, void>, unsigned int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>>>(unsigned long=1) [163]

and this for the second:

230.03ms  2.1120us              (1 1 1)      (1024 1 1)        12       32B        0B         -           -  GeForce GTX 970         1         7  void thrust::system::cuda::detail::bulk_::detail::launch_by_value<unsigned int=0, thrust::system::cuda::detail::bulk_::detail::cuda_task<thrust::system::cuda::detail::bulk_::parallel_group<thrust::system::cuda::detail::bulk_::concurrent_group<thrust::system::cuda::detail::bulk_::agent<unsigned long=1>, unsigned long=0>, unsigned long=0>, thrust::system::cuda::detail::bulk_::detail::closure<thrust::system::cuda::detail::for_each_n_detail::for_each_kernel, thrust::tuple<thrust::system::cuda::detail::bulk_::detail::cursor<unsigned int=0>, thrust::zip_iterator<thrust::tuple<thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::detail::normal_iterator<thrust::device_ptr<float>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<thrust::detail::unary_transform_functor<square<float>>, void>, unsigned int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>>>(unsigned long=1) [196]

Both kernels run for a little over 2 µs, i.e. far less time than the API calls that launched them. So the extra API latency is what causes the timing difference, not any change in the performance of the code itself.
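
For reference, traces like the ones shown above can be collected with nvprof (which is what the cudaProfilerStart()/cudaProfilerStop() calls in the modified code are for); the exact command lines below are my own illustration rather than ones quoted from the original run:

$ nvprof --profile-from-start off --print-api-trace ./runtime
$ nvprof --profile-from-start off --print-gpu-trace ./runtime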
