Thrust complex inner product on the GPU is slower than an STL implementation on the CPU

Date: 2014-07-18 20:58:02

Tags: c++ stl cuda gpu thrust

I have the following two implementations of a complex inner product, one using the STL running on the CPU, and one using Thrust running on the GPU:

CPU implementation

#include <vector>
#include <numeric>
#include <complex>
#include <algorithm> // std::generate
#include <cstdlib>   // atoi, std::rand

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);

    std::vector< std::complex<float> > host_x( vec_size );
    std::generate(host_x.begin(), host_x.end(), std::rand);

    std::vector< std::complex<float> > host_y( vec_size );
    std::generate(host_y.begin(), host_y.end(), std::rand);

    std::complex<float> z = std::inner_product(host_x.begin(), host_x.end(), host_y.begin(), std::complex<float>(0.0f,0.0f) );

    return 0;
}

GPU implementation

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/generate.h> // thrust::generate
#include <thrust/complex.h>
#include <cstdlib>           // atoi, rand

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);

    thrust::host_vector< thrust::complex<float> > host_x( vec_size );
    thrust::generate(host_x.begin(), host_x.end(), rand);

    thrust::host_vector< thrust::complex<float> > host_y( vec_size );
    thrust::generate(host_y.begin(), host_y.end(), rand);

    thrust::device_vector< thrust::complex<float> > device_x = host_x;
    thrust::device_vector< thrust::complex<float> > device_y = host_y;

    thrust::complex<float> z = thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );

    return 0;
}

I compiled the CPU implementation with g++ and the GPU implementation with nvcc, both with -O3 optimization. I ran both implementations with 3,000,000 elements in each vector and got the following timing results:
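For reference, the builds and runs looked roughly like the following (the source file names here are illustrative; the timings below come from the shell's time builtin):

$ g++ -O3 cpu_inner_product.cpp -o cpu_ip
$ time ./cpu_ip 3000000
$ nvcc -O3 gpu_inner_product.cu -o gpu_ip
$ time ./gpu_ip 3000000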

CPU:
real    0m0.159s
user    0m0.100s
sys     0m0.048s

GPU:
real    0m0.284s
user    0m0.190s
sys     0m0.083s

I am using the following software:

$ gcc -v
Configured with: --prefix=/Applications/Xcode.app/Contents/Developer/usr --with-gxx-include-dir=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/usr/include/c++/4.2.1
Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn)
Target: x86_64-apple-darwin13.3.0
Thread model: posix

$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2013 NVIDIA Corporation
Built on Thu_Sep__5_10:17:14_PDT_2013
Cuda compilation tools, release 5.5, V5.5.0

together with the latest version of Thrust from the GitHub repository.

My CPU is a 2.4 GHz Intel Core 2 Duo, and my GPU is an NVIDIA GeForce 320M (256 MB).

Question: I'm new to using Thrust, but shouldn't my GPU implementation be significantly faster than my CPU implementation? I realize there are memory transfer costs with a GPU, but I'm trying to figure out whether I'm using Thrust correctly to perform the inner product on the GPU, because the timing results are unexpectedly reversed.

Edit: Following everyone's suggestions, I made the number of iterations configurable and changed the granularity of the timing, as follows:

#include <stdio.h>
#include <stdlib.h> // atoi, rand

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/generate.h> // thrust::generate
#include <thrust/complex.h>
#include <thrust/execution_policy.h>

int main(int argc, char **argv)
{
    int vec_size = atoi(argv[1]);
    int iterations = atoi(argv[2]);

    float milliseconds = 0;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    thrust::host_vector< thrust::complex<float> > host_x( vec_size );
    thrust::generate(host_x.begin(), host_x.end(), rand);

    thrust::host_vector< thrust::complex<float> > host_y( vec_size );
    thrust::generate(host_y.begin(), host_y.end(), rand);

    printf("vector size = %lu bytes\n", vec_size * sizeof(thrust::complex<float>)); 

    cudaEventRecord(start);

    thrust::device_vector< thrust::complex<float> > device_x = host_x;
    thrust::device_vector< thrust::complex<float> > device_y = host_y;
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("copy (device)\t\t%f ms\n", milliseconds);

    cudaEventRecord(start);

    for(int i = 0; i < iterations; ++i)
    {
        thrust::inner_product(thrust::cuda::par, device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );
    }

    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("inner_product (device)\t%f ms\n", milliseconds/iterations); 

    cudaEventRecord(start);

    for(int i = 0; i < iterations; ++i)
    {
        // NB: the result is discarded and this path is pure host code, so an
        // optimizing compiler may elide the computation entirely; that would
        // explain the near-zero host timings below.
        thrust::inner_product(thrust::host, host_x.begin(), host_x.end(), host_y.begin(), thrust::complex<float>(0.0f,0.0f) );
    }

    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("inner_product (host)\t%f ms\n", milliseconds/iterations);   

    return 0;
}

On a Tegra K1, I got the following:

$ nvcc complex_inner_product.cu -O3 -arch=sm_32 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device)       45.741653 ms
inner_product (device)  10.595121 ms
inner_product (host)    1.807912 ms

On a 2.4 GHz Intel Core 2 Duo with a GeForce 320M, I got the following results:

$ nvcc complex_inner_product.cu -O3 -arch=sm_12 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device)       227.765213 ms
inner_product (device)  42.180416 ms
inner_product (host)    0.000018 ms

On a 3.3 GHz Intel Core i5 with a GeForce GT 755M:

$ nvcc complex_inner_product.cu -O3 -arch=sm_30 -o cip
$ ./cip 3100000 1000
vector size = 24800000 bytes
copy (device)       22.930016 ms
inner_product (device)  6.249663 ms
inner_product (host)    0.000003 ms

So no matter what compute capability or hardware I use, the host processor is at least 10x faster than the GPU. Any ideas?

1 Answer:

Answer 0 (score: 4)

There are a lot of things to consider in your benchmarking methodology. I'm not arguing about whether your results are valid; that's a matter of opinion, depending on what you consider important. But some things to consider:

  1. CUDA start-up time is included in your measurement.
  2. Data transfer time is included in your measurement.
  3. You are only doing a single measurement pass.
  4. You are using a very low-end GPU.
  5. The operation you chose to test is not very compute-intensive (only a couple of flops per float quantity).
  6. If you time just the computation portion, I expect you'll find that the GPU looks a bit better. Here is a fully worked example:

    $ cat t489.cu
    #include <vector>
    #include <numeric>
    #include <complex>
    #include <algorithm> // std::generate
    #include <cstdlib>   // atoi, std::rand
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/inner_product.h>
    #include <thrust/complex.h>
    #include <time.h>
    #include <sys/time.h>
    #include <iostream>
    
    int main(int argc, char **argv)
    {
        timeval tv1, tv2;
        int vec_size = atoi(argv[1]);
    
        std::vector< std::complex<float> > cpu_x( vec_size );
        std::generate(cpu_x.begin(), cpu_x.end(), std::rand);
    
        std::vector< std::complex<float> > cpu_y( vec_size );
        std::generate(cpu_y.begin(), cpu_y.end(), std::rand);
    
        gettimeofday(&tv1, 0);
        std::complex<float> cpu_z = std::inner_product(cpu_x.begin(), cpu_x.end(), cpu_y.begin(), std::complex<float>(0.0f,0.0f) );
        gettimeofday(&tv2, 0);
        std::cout <<"CPU result: " << cpu_z.real() << "," << cpu_z.imag() << std::endl;
        // Use 64-bit arithmetic so tv_sec*1000000 cannot overflow.
        unsigned long long t2 = (tv2.tv_sec*1000000ULL) + tv2.tv_usec;
        unsigned long long t1 = (tv1.tv_sec*1000000ULL) + tv1.tv_usec;
        float et = (t2-t1)/(float) 1000;
        std::cout << "CPU elapsed time: " << et << "ms" << std::endl;
        thrust::host_vector< thrust::complex<float> > host_x( vec_size );
        thrust::generate(host_x.begin(), host_x.end(), rand);
    
        thrust::host_vector< thrust::complex<float> > host_y( vec_size );
        thrust::generate(host_y.begin(), host_y.end(), rand);
    
        thrust::device_vector< thrust::complex<float> > device_x = host_x;
        thrust::device_vector< thrust::complex<float> > device_y = host_y;
    
        gettimeofday(&tv1, 0);
        thrust::complex<float> z = thrust::inner_product(device_x.begin(), device_x.end(), device_y.begin(), thrust::complex<float>(0.0f,0.0f) );
        gettimeofday(&tv2, 0);
        std::cout <<"GPU result: " << z.real() << "," << z.imag() << std::endl;
        t2 = (tv2.tv_sec*1000000) + tv2.tv_usec;
        t1 = (tv1.tv_sec*1000000) + tv1.tv_usec;
        et = (t2-t1)/(float) 1000;
        std::cout << "GPU elapsed time: " << et << "ms" << std::endl;
    
        return 0;
    }
    $ nvcc -arch=sm_20 -O3 -o t489 t489.cu
    $ ./t489 3000000
    CPU result: 3.45238e+24,0
    CPU elapsed time: 19.294ms
    GPU result: 3.46041e+24,0
    GPU elapsed time: 3.426ms
    $
    

    This was run on a Quadro5000 GPU (considerably more powerful than the GT 320M), RHEL 5.5, CUDA 6.5RC, and Thrust 1.8 (master branch).

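    Regarding point 1 above: a minimal sketch of how the one-time CUDA start-up cost could be kept out of a wall-clock measurement (my addition, not part of the worked example) is to force context creation with a throwaway runtime call before starting any timer:

        #include <cuda_runtime.h>

        int main()
        {
            // Force CUDA context creation up front. cudaFree(0) is a common
            // warm-up idiom; without it, the first CUDA call in the program
            // absorbs the (often large) start-up cost.
            cudaFree(0);

            // ... timed GPU work would go here ...
            return 0;
        }
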
    So which numbers matter? That's up to you. If all you ever intend to do on the GPU is this single inner product, with no other computation or activity on the GPU, it would be pointless to use the GPU at all. But in the context of a larger problem, where the inner product is only one piece, the GPU may well be faster than the CPU.
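
    As an illustration of that point, here is a minimal sketch (my addition; the repeated inner product stands in for a longer device-side pipeline) of paying the host-to-device copy once and amortizing it over many operations:

        #include <cstdlib>
        #include <thrust/host_vector.h>
        #include <thrust/device_vector.h>
        #include <thrust/inner_product.h>
        #include <thrust/generate.h>
        #include <thrust/complex.h>

        int main()
        {
            thrust::host_vector< thrust::complex<float> > host_x(1 << 20);
            thrust::generate(host_x.begin(), host_x.end(), rand);

            // Pay the host-to-device transfer once ...
            thrust::device_vector< thrust::complex<float> > dx = host_x;

            // ... then keep the data resident on the GPU and run many
            // operations against it, so the copy cost is amortized.
            thrust::complex<float> acc(0.0f, 0.0f);
            for (int step = 0; step < 100; ++step)
                acc += thrust::inner_product(dx.begin(), dx.end(), dx.begin(),
                                             thrust::complex<float>(0.0f, 0.0f));
            return 0;
        }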

    (The results don't match because the program generates different starting values in each case.)
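
    On that last point, a minimal sketch of my own for making the two results comparable, assuming both paths keep using the C library rand as the generator: re-seed it with the same value before each fill, so the std:: and thrust:: containers receive identical data.

        #include <cstdlib>
        #include <algorithm>
        #include <vector>
        #include <complex>
        #include <thrust/host_vector.h>
        #include <thrust/generate.h>
        #include <thrust/complex.h>

        int main()
        {
            const int n = 1000;

            std::vector< std::complex<float> > cpu_x(n);
            srand(12345);    // fixed, arbitrary seed
            std::generate(cpu_x.begin(), cpu_x.end(), std::rand);

            thrust::host_vector< thrust::complex<float> > host_x(n);
            srand(12345);    // same seed => same sequence of values
            thrust::generate(host_x.begin(), host_x.end(), rand);

            // cpu_x and host_x now hold identical data, so the CPU and GPU
            // inner products can be compared value-for-value.
            return 0;
        }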
