与奇怪的CUB交互的ModernGPU ReduceByKey错误

时间:2014-11-11 21:48:42

标签: cuda thrust cub

我试图在一段代码中比较 Thrust、CUB 和 ModernGPU(MGPU)库中的 reduce_by_key,思路沿用之前仅涉及 Thrust 和 CUB 的帖子(previous post)。目的不是做基准测试,而是确保我能正确地使用这些库。

thrust::reduce_by_key 和 cub::DeviceReduce::ReduceByKey 这两个调用可以很好地协同工作,thrust::reduce_by_key 和 ModernGPU 的 ReduceByKey 调用也可以很好地协同工作,但是当我在调用 MGPU 之后再调用 CUB 时,CUB 就不工作了。cuda-memcheck 说我的 MGPU 代码有错误,但很难定位,因为我只有一个 MGPU 函数调用,该错误是非致命的,而且 MGPU 调用仍然继续得到正确的结果!

完整的可编译代码和 cuda-memcheck 输出附在下文,但先做几点说明:

如果语句按以下顺序排列,所有结果都是正确的(尽管 cuda-memcheck 会报错):

test_thrust(Nkey,Nseg,x,key,output,keysout);                                                                                                                                
test_cub(Nkey,Nseg,x,key,output,keysout);                                                                                                                                   
test_mgpu(argc,argv,Nkey,Nseg,x,key,output,keysout);                                                                                                                        

但如果语句按下面这个顺序排列,CUB 的结果就是错的:

test_thrust(Nkey,Nseg,x,key,output,keysout);                                                                                                                                
test_mgpu(argc,argv,Nkey,Nseg,x,key,output,keysout);
test_cub(Nkey,Nseg,x,key,output,keysout);                                                                                                                                                                                                                                                        

其中“CUB结果错误”意味着cub::DeviceReduce::ReduceByKey例程根本不更新其输出参数。

环境是 CUDA 6.5、CUB 1.3.2,以及通过 github 上的“download zip”按钮下载的最新 ModernGPU(大概是 1.1 版)。安装好这些库后,代码可以这样编译:

nvcc $MGPU/src/mgpucontext.cu $MGPU/src/mmio.cpp $MGPU/src/mgpuutil.cpp test_reduce_thrust_MGPU.cu -I $CUB/ -I $MGPU/include

以下是代码:

#include <iostream>
#include <string>

// THRUST:
#include <thrust/sort.h>
#include <thrust/device_vector.h>

// CUB:
#include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>

// Modern GPU (MGPU)
#include "kernels/reducebykey.cuh"

//========================================
// for CUB:
struct CustomSum
{
    // Binary reduction functor handed to cub::DeviceReduce::ReduceByKey.
    // Plain addition; commutative, so argument order is irrelevant.
    template <typename T>
    CUB_RUNTIME_FUNCTION __device__ __forceinline__
    T operator()(const T &a, const T &b) const {
        return a + b;
    }
};
//========================================

// Print one test's vectors: the raw values x, their keys, the reduced
// keys and the reduced sums.  When `check` is set, also print the total
// of the reduced output against the total of the raw input so a broken
// reduction is immediately visible.
void show_vecs(const std::string& title,
           int Nkey,int Nseg,
           const thrust::device_vector<float>& x,
           const thrust::device_vector<int>& key,
           thrust::device_vector<float>& output,
           thrust::device_vector<int>& keysout,
           bool check){
    std::cout << title << std::endl;
    for (int i = 0; i < Nkey; ++i) std::cout << x[i] << " ";
    std::cout << std::endl;
    for (int i = 0; i < Nkey; ++i) std::cout << key[i] << " ";
    std::cout << std::endl;
    for (int i = 0; i < Nkey; ++i) std::cout << keysout[i] << " ";
    std::cout << std::endl;
    // Only Nseg slots of output are meaningful (one per segment).
    for (int i = 0; i < Nseg; ++i) std::cout << output[i] << " ";
    std::cout << std::endl;
    if (check) {
        std::cout << "total="      << thrust::reduce(output.begin(),output.end())
                  <<", should be " << thrust::reduce(x.begin(),x.end()) << std::endl;
    }
}
// Run thrust::reduce_by_key on the sorted (key, x) pairs and print the
// inputs and results.  Output buffers are zeroed first so a stale,
// correct-looking answer from an earlier test cannot slip through.
void test_thrust(int Nkey,int Nseg,
         const thrust::device_vector<float>& x,
         const thrust::device_vector<int>& key,
         thrust::device_vector<float>& output,
         thrust::device_vector<int>& keysout){

    std::cout<<"=================================================================="<<std::endl
         <<" THRUST reduce_by_key:"<<std::endl
         <<"=================================================================="<<std::endl;

    // Clear the result vectors before running.
    thrust::fill(output.begin(), output.end(), 0.0f);
    thrust::fill(keysout.begin(), keysout.end(), 0);

    show_vecs("Thrust input",Nkey,Nseg,x,key,output,keysout,0);

    // Segmented sum: runs of equal keys collapse to one output element.
    thrust::reduce_by_key(key.begin(), key.end(), x.begin(),
                          keysout.begin(), output.begin());

    show_vecs("Thrust output",Nkey,Nseg,x,key,output,keysout,1);
}

// Run cub::DeviceReduce::ReduceByKey on the sorted (key, x) pairs and
// print the inputs and results.
//
// Fixes relative to the original:
//  - d_num_segments is now cudaFree'd (it was leaked on every call),
//  - CUDA/CUB return codes are checked and reported, so a sticky error
//    left behind by an earlier library call is surfaced here instead of
//    silently producing unchanged output.
void test_cub(int Nkey,int Nseg,
          thrust::device_vector<float>& x,
          thrust::device_vector<int>& key,
          thrust::device_vector<float>& output,
          thrust::device_vector<int>& keysout){
    std::cout<<"=================================================================="<<std::endl
         <<" CUB ReduceByKey:"<<std::endl
         <<"=================================================================="<<std::endl;

    // reset output:
    thrust::fill(keysout.begin(), keysout.end(), 0);
    thrust::fill(output.begin(), output.end(),  0.0f);

    show_vecs("CUB input",Nkey,Nseg,x,key,output,keysout,0);

    // CUB operates on raw device pointers.
    int   *cub_keys_in  = thrust::raw_pointer_cast(&key[0]);
    int   *cub_keys_out = thrust::raw_pointer_cast(&keysout[0]);
    float *cub_val_in   = thrust::raw_pointer_cast(&x[0]);
    float *cub_val_out  = thrust::raw_pointer_cast(&output[0]);

    // CUB writes the number of segments it found to device memory.
    int *d_num_segments = NULL;
    cudaError_t err = cudaMalloc(&d_num_segments, sizeof(int));
    if (err != cudaSuccess) {
        std::cerr << "test_cub: cudaMalloc(d_num_segments) failed: "
                  << cudaGetErrorString(err) << std::endl;
        return;
    }
    // Pre-seed with the expected value (not required; CUB overwrites it).
    cudaMemcpy(d_num_segments, &Nseg, sizeof(int), cudaMemcpyHostToDevice);

    CustomSum reduction_op;

    // Pass 1: with d_temp_storage == NULL, CUB only reports the
    // temporary-storage size it needs into temp_storage_bytes.
    void     *d_temp_storage    = NULL;
    size_t   temp_storage_bytes = 0;
    err = cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                   cub_keys_in, cub_keys_out,
                   cub_val_in, cub_val_out,
                   d_num_segments, reduction_op, Nkey);
    if (err != cudaSuccess)
        std::cerr << "test_cub: size query failed: "
                  << cudaGetErrorString(err) << std::endl;
    cudaDeviceSynchronize();

    err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
    if (err != cudaSuccess) {
        std::cerr << "test_cub: cudaMalloc(temp storage) failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(d_num_segments);
        return;
    }
    std::cout << "temp_storage_bytes = " << temp_storage_bytes << std::endl;

    // Pass 2: the actual segmented reduction.
    err = cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                   cub_keys_in, cub_keys_out,
                   cub_val_in, cub_val_out,
                   d_num_segments, reduction_op, Nkey);
    if (err != cudaSuccess)
        std::cerr << "test_cub: ReduceByKey failed: "
                  << cudaGetErrorString(err) << std::endl;
    cudaDeviceSynchronize();

    cudaFree(d_temp_storage);
    cudaFree(d_num_segments);   // BUGFIX: was leaked in the original

    show_vecs("CUB output",Nkey,Nseg,x,key,output,keysout,1);
}


// Run ModernGPU's ReduceByKey on the sorted (key, x) pairs and print
// the inputs and results.  numSegments/numOut are host out-parameters
// in the MGPU API.
// NOTE: the original author reported that replacing these host
// out-parameters with explicit device allocations made the
// cuda-memcheck errors worse, not better.
void test_mgpu(int argc, char** argv,
           int Nkey,int Nseg,
           thrust::device_vector<float>& x,
           thrust::device_vector<int>& key,
           thrust::device_vector<float>& output,
           thrust::device_vector<int>& keysout){
    std::cout<<"=================================================================="<<std::endl
         <<" MGPU ReduceByKey:"<<std::endl
         <<"=================================================================="<<std::endl;

    // Clear the result vectors before running.
    thrust::fill(output.begin(), output.end(), 0.0f);
    thrust::fill(keysout.begin(), keysout.end(), 0);

    show_vecs("MGPU input",Nkey,Nseg,x,key,output,keysout,0);

    // MGPU routines require an explicit context object.
    mgpu::ContextPtr context = mgpu::CreateCudaDevice(argc, argv, false);

    int   *keys_in  = thrust::raw_pointer_cast(&key[0]);
    int   *keys_out = thrust::raw_pointer_cast(&keysout[0]);
    float *vals_in  = thrust::raw_pointer_cast(&x[0]);
    float *vals_out = thrust::raw_pointer_cast(&output[0]);

    const float identity = 0.0f;
    int numSegments = 0;
    int numOut      = 0;
    ReduceByKey(keys_in, vals_in, (int)key.size(), identity,
                mgpu::plus<float>(), mgpu::equal_to<int>(),
                keys_out,
                vals_out,
                &numSegments,
                &numOut, *context);

    cudaDeviceSynchronize();

    show_vecs("MGPU output",Nkey,Nseg,x,key,output,keysout,1);
}

// Build a small sorted (key, value) data set and run the three
// reduce_by_key implementations, calling MGPU before CUB — the order
// that reproduces the reported failure.
//
// Returns 0 on success.  BUGFIX: the original returned 1, which the
// shell (and cuda-memcheck wrappers) interpret as program failure.
int main(int argc, char** argv){

    // 20 entries drawn from 9 distinct key values {0..8}.
    const int Nkey = 20;
    const int Nseg = 9;   // number of distinct keys == output segments
    int ikey[Nkey] = {0, 0, 0, 6, 8, 0, 2, 4, 6, 8, 1, 3, 5, 7, 8, 1, 3, 5, 7, 8};

    thrust::device_vector<int> key(ikey, ikey + Nkey);
    thrust::device_vector<int> keysout(Nkey);

    // Values to reduce: key + 0.1f, so each segment's sum is easy to
    // eyeball in the printed output.
    float xval[Nkey];
    for (int i = 0; i < Nkey; i++) xval[i] = ikey[i] + 0.1f;
    thrust::device_vector<float> x(xval, xval + Nkey);

    // Output buffer, one slot per segment.
    thrust::device_vector<float> output(Nseg, 0.0f);

    // Bring equal keys together: every reduce_by_key variant here
    // assumes segments are contiguous.
    thrust::sort_by_key(key.begin(), key.end(), x.begin());

    // Run the tests in the failing order (MGPU before CUB).
    test_mgpu(argc, argv, Nkey, Nseg, x, key, output, keysout);
    test_thrust(Nkey, Nseg, x, key, output, keysout);
    test_cub(Nkey, Nseg, x, key, output, keysout);
    test_mgpu(argc, argv, Nkey, Nseg, x, key, output, keysout);

    return 0;  // BUGFIX: was 'return 1'
}

cuda-memcheck输出的前几行是:

========= CUDA-MEMCHECK
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpyAsync. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x2ef673]
=========     Host Frame:./a.out [0x74793]
=========     Host Frame:./a.out [0x1c87c]
=========     Host Frame:./a.out [0x1ae7e]
=========     Host Frame:./a.out [0x190b0]
=========     Host Frame:./a.out [0x10974]
=========     Host Frame:./a.out [0x10c8c]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
=========     Host Frame:./a.out [0x43b9]

这对我来说都很神秘。

最终我希望尝试使用 MGPU 的 ReduceByKeyPreprocess / ReduceByKeyApply 这对函数,但看起来连基本的 ReduceByKey 调用都有某种问题。

非常感谢任何帮助,谢谢!

0 个答案:

没有答案