Multiplication of two cudaArrays in a kernel? (using texture memory)

Date: 2014-12-03 11:13:10

Tags: cuda

I have two cudaArrays, a1 and a2 (of the same size), which represent two matrices.

Using texture memory, I want to multiply these two cudaArrays and then copy the result back into an ordinary host array, let's call it *a1_h.

The fact is, I just don't know how to do it. I have managed to define and allocate my two cudaArrays and to fill them with floats.

Now I want to write a kernel that performs the multiplication.

Can somebody help me?

ROOM_X and ROOM_Y are ints that define the width and height of the matrices. The textures mytex_M1 and mytex_M2 are declared as: texture<float, 2, cudaReadModeElementType>
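For reference, a minimal sketch of what those file-scope declarations could look like (the actual matrix dimensions are not given in the question, so the values below are hypothetical):

#define ROOM_X 64   // width of the matrices (hypothetical value)
#define ROOM_Y 64   // height of the matrices (hypothetical value)

// texture references must live at file scope (global variables)
texture<float, 2, cudaReadModeElementType> mytex_M1;
texture<float, 2, cudaReadModeElementType> mytex_M2;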

Here is my main:

int main(int argc, char * argv[]) {

    int size = ROOM_X * ROOM_Y * sizeof(float);

    // creation of arrays on host. Will be useful for filling the cudaArrays
    float *M1_h, *M2_h;

    // allocating memory on host
    M1_h = (float *)malloc(size);
    M2_h = (float *)malloc(size);

    // creation of channel descriptions for the 2D textures
    cudaChannelFormatDesc channelDesc_M1 = cudaCreateChannelDesc<float>();
    cudaChannelFormatDesc channelDesc_M2 = cudaCreateChannelDesc<float>();

    // creation of 2 cudaArray *
    cudaArray *M1_array, *M2_array;

    // bind arrays and channels in order to allocate space
    cudaMallocArray(&M1_array, &channelDesc_M1, ROOM_X, ROOM_Y);
    cudaMallocArray(&M2_array, &channelDesc_M2, ROOM_X, ROOM_Y);

    // filling the matrices on host
    Matrix(M1_h);
    Matrix(M2_h);

    // copy from host to device (putting the initial values of M1 and M2 into the arrays)
    cudaMemcpyToArray(M1_array, 0, 0, M1_h, size, cudaMemcpyHostToDevice);
    cudaMemcpyToArray(M2_array, 0, 0, M2_h, size, cudaMemcpyHostToDevice);

    // set texture parameters
    mytex_M1.addressMode[0] = cudaAddressModeWrap;
    mytex_M1.addressMode[1] = cudaAddressModeWrap;
    mytex_M1.filterMode = cudaFilterModeLinear;
    mytex_M1.normalized = true; // NB coordinates in [0,1]

    mytex_M2.addressMode[0] = cudaAddressModeWrap;
    mytex_M2.addressMode[1] = cudaAddressModeWrap;
    mytex_M2.filterMode = cudaFilterModeLinear;
    mytex_M2.normalized = true; // NB coordinates in [0,1]

    // bind the arrays to the textures
    cudaBindTextureToArray(mytex_M1, M1_array);
    cudaBindTextureToArray(mytex_M2, M2_array);

    // allocate device memory for the result
    float* M1_d;
    cudaMalloc((void**)&M1_d, size);

    // dimensions of grid and blocks
    dim3 dimGrid(ROOM_X, ROOM_Y);
    dim3 dimBlock(1, 1);

    // execution of the kernel. The result of the multiplication has to be put in M1_d
    mul_texture<<<dimGrid, dimBlock>>>(M1_d);

    // copy result from device to host
    cudaMemcpy(M1_h, M1_d, size, cudaMemcpyDeviceToHost);

    // free memory on device
    cudaFreeArray(M1_array);
    cudaFreeArray(M2_array);
    cudaFree(M1_d);

    // free memory on host
    free(M1_h);
    free(M2_h);

    return 0;
}
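The Matrix() helper used above is not shown in the question; a minimal sketch of what such a fill function might look like (purely illustrative, the real initialization is unknown):

void Matrix(float *M)
{
    // fill the ROOM_X x ROOM_Y matrix in row-major order with placeholder values
    for (int y = 0; y < ROOM_Y; ++y)
        for (int x = 0; x < ROOM_X; ++x)
            M[x + y * ROOM_X] = 1.0f;   // hypothetical value
}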

1 Answer:

Answer 0 (score: 1)

When you declare a texture, keep in mind that

"A texture reference can only be declared as a static global variable and cannot be passed as an argument to a function."
http://docs.nvidia.com/cuda/cuda-c-programming-guide/#texture-reference-api

So, if you have successfully defined the texture references, initialized the arrays, copied them to texture space and prepared the output buffer (which, judging from your code, seems to be done), what you need to do is implement the kernel. For example:

__global__ void
mul_texture(float* M1_d, int w, int h)
{
    // map from threadIdx/blockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // guard against threads outside the image bounds; it's good practice
    if ( x < w && y < h )
    {
        // the output M1_d is actually represented as a 1D array,
        // so the offset of each value is related to its (x,y) position
        // in a row-major order
        int gid = x + y * w;

        // As textures are declared at global scope,
        // we can access their contents from any kernel
        float M1_value = tex2D(mytex_M1, x, y);
        float M2_value = tex2D(mytex_M2, x, y);

        // The final result is the pointwise multiplication
        M1_d[ gid ] = M1_value * M2_value;
    }
}

You need to change the kernel call to include the w and h values, which correspond to the width (number of columns of the matrix) and the height (number of rows of the matrix).

mul_texture<<<dimGrid, dimBlock >>>(M1_d, ROOM_X, ROOM_Y);
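One detail worth double-checking (not addressed in the original answer): the question configures both textures with normalized = true and cudaFilterModeLinear, while the kernel above samples with unnormalized integer coordinates. A sketch of one consistent combination, assuming you want exact element fetches rather than interpolated values:

// in main(), before binding the textures: fetch exact texels, no interpolation
mytex_M1.addressMode[0] = cudaAddressModeClamp;  // wrap mode requires normalized coordinates
mytex_M1.addressMode[1] = cudaAddressModeClamp;
mytex_M1.filterMode = cudaFilterModePoint;
mytex_M1.normalized = false;   // coordinates in [0, width) x [0, height)
// (same settings for mytex_M2)

// in the kernel: sample at the texel centre
float M1_value = tex2D(mytex_M1, x + 0.5f, y + 0.5f);
float M2_value = tex2D(mytex_M2, x + 0.5f, y + 0.5f);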

Note that you are not doing any error checking, which would help you a lot both now and in the future. I have not checked whether the kernel provided in this answer works, since your code did not compile.
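Since the answer mentions error checking: a commonly used minimal pattern (a sketch, not part of the original answer) wraps every CUDA runtime call and checks the kernel launch afterwards:

#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err = (call);                                         \
        if (err != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err), __FILE__, __LINE__);         \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// usage:
// CUDA_CHECK(cudaMemcpy(M1_h, M1_d, size, cudaMemcpyDeviceToHost));
// mul_texture<<<dimGrid, dimBlock>>>(M1_d, ROOM_X, ROOM_Y);
// CUDA_CHECK(cudaGetLastError());       // catches launch configuration errors
// CUDA_CHECK(cudaDeviceSynchronize());  // catches errors during kernel execution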
