Question

我在CUDA上有一个很大的、内部包含数组的结构体，它在运行期间保持不变，对我的应用程序来说是只读的。一个非常简单的例子是：

// One graph vertex: a list of positions whose length may differ per node.
// Declared before Graph because Graph refers to it.
struct Node{
    int* pos;   // buffer holding nPos integers
    int nPos;   // element count of pos
};

// The read-only graph that the kernels traverse.
struct Graph{
    Node * nodes;  // buffer holding nNode nodes
    int nNode;     // element count of nodes
};

我的内核需要遍历该图并对其进行查询。如您所知，用cudaMalloc和cudaMemcpy把这种结构复制到GPU内存需要写大量代码，而统一内存（Unified Memory）本应消除这种需求。

在我的代码中，我在CPU上生成了这个图，然后为了进行测试，编写了以下内核：

// Smoke-test kernel: copies the node count out of the by-value Graph argument
// into the first element of d_res.
__global__ void testKernel(const Graph graph,int * d_res){
    *d_res = graph.nNode;
}

调用方式如下：

// Result buffers are allocated by hand (malloc / cudaMalloc) for this test,
// so that the harness itself is known to be correct.
int *h_res = (int*)malloc(sizeof(int));
int *d_res;
cudaMalloc((void **)&d_res, sizeof(int));

testKernel<<<1, 1>>>(graph, d_res);

// Check the launch, then copy the single result back to the host.
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));

其中的错误检查代码（gpuErrchk）来自此处（原帖中的链接）。

当我按上面所示使用testKernel时，它可以正常工作；但是如果我将内核更改为：

// Variant that follows the nodes pointer inside the kernel: writes the first
// node's position count to d_res[0]. graph.nodes is dereferenced on the GPU here.
__global__ void testKernel(const Graph graph,int * d_res){
    *d_res = graph.nodes->nPos;
}

我收到非法的内存访问错误。

这是因为统一内存无法正确处理此类数据吗？有没有办法让我不必手动编写所有到GPU内存的显式拷贝？

完整MCVE：

#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>
// A graph vertex together with its list of positions.
typedef struct node
{
    int *pos;   // buffer holding nPos integers
    int  nPos;  // element count of pos
} Node;

// Top-level container; the kernel receives it by value.
typedef struct Graph
{
    Node *nodes;  // buffer holding nNode nodes
    int   nNode;  // element count of nodes
} Graph;


// Wraps a CUDA runtime call: on failure the error string is printed together
// with the call site's file and line, and the process exits.
// The do { } while (0) wrapper makes the macro expand to a single statement,
// so `if (c) gpuErrchk(x); else ...` parses correctly.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call on stderr.
//   code  - status returned by the CUDA call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default), exit with `code` as the exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

// Writes the graph's node count into d_res[0].
__global__ void testKernel(const Graph graph, int * d_res){
    *d_res = graph.nNode;
    // Variant reported as not working when enabled instead:
    // d_res[0]=graph.nodes[0].nPos;
}



// Builds a two-node graph with ordinary host allocations, launches testKernel
// on it and prints the single int the kernel wrote back.
int main(void){

    // fake data, this comes from another process
    Graph graph;
    graph.nNode = 2;
    // The node array and every pos buffer below come from plain host malloc.
    graph.nodes = (Node*)malloc(graph.nNode * sizeof(Node));
    for (int n = 0; n < graph.nNode; ++n){
        Node *node = &graph.nodes[n];

        // They can have different sizes in the original code
        node->nPos = 3;
        node->pos = (int*)malloc(node->nPos * sizeof(int));
        for (int k = 0; k < node->nPos; ++k)
            node->pos[k] = k;
    }

    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison

    // Single-int result buffers: one on the device, one on the host.
    int *h_res = (int*)malloc(sizeof(int));
    int *d_res;
    cudaMalloc((void **)&d_res, sizeof(int));

    // graph is passed to the kernel by value.
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));

    printf("%d", h_res[0]);
    return 0;
}

Answer 1

您的代码未使用CUDA统一内存。 UM绝不是“自动”的。它需要特定的编程步骤才能利用它，并且具有特定的系统要求。

编程指南中的统一内存（UM）章节涵盖了所有这些内容。

有没有办法让我不必手动编写所有到GPU内存的显式拷贝？

正确使用UM应该可以做到这一点。下面是一个完整的示例。我唯一做的改动，就是把您主机代码中的malloc操作机械地替换为等效的cudaMallocManaged操作。

$ cat t1389.cu
#include <algorithm>
#include <stdio.h>

// A graph vertex together with its list of positions.
typedef struct node
{
    int *pos;   // buffer holding nPos integers
    int  nPos;  // element count of pos
} Node;

// Top-level container; the kernel receives it by value.
typedef struct Graph
{
    Node *nodes;  // buffer holding nNode nodes
    int   nNode;  // element count of nodes
} Graph;


// Wraps a CUDA runtime call: on failure the error string is printed together
// with the call site's file and line, and the process exits.
// The do { } while (0) wrapper makes the macro expand to a single statement,
// so `if (c) gpuErrchk(x); else ...` parses correctly.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call on stderr.
//   code  - status returned by the CUDA call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default), exit with `code` as the exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

// Stores the node count in d_res[0], then overwrites it with the first node's
// position count. The second statement dereferences graph.nodes on the device.
__global__ void testKernel(const Graph graph, int * d_res){
    *d_res = graph.nNode;
    *d_res = graph.nodes->nPos;  // value left in d_res[0] when the kernel ends
}



// Builds a two-node graph in managed memory, launches testKernel on it and
// prints the single int the kernel wrote back.
int main(void){

    // fake data, this comes from another process
    Graph graph;
    graph.nNode = 2;
    // The node array and every pos buffer are managed allocations; they are
    // written by the host below and read by the kernel.
    cudaMallocManaged(&(graph.nodes), graph.nNode * sizeof(Node));
    for (int n = 0; n < graph.nNode; ++n){
        Node *node = &graph.nodes[n];

        // They can have different sizes in the original code
        node->nPos = 3;
        cudaMallocManaged(&(node->pos), node->nPos * sizeof(int));
        for (int k = 0; k < node->nPos; ++k)
            node->pos[k] = k;
    }

    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison

    // Single-int result buffers: one on the device, one on the host.
    int *h_res = (int*)malloc(sizeof(int));
    int *d_res;
    cudaMalloc((void **)&d_res, sizeof(int));

    // graph is passed to the kernel by value.
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));

    printf("%d", h_res[0]);
    return 0;
}
$ nvcc t1389.cu -o t1389
$ cuda-memcheck ./t1389
========= CUDA-MEMCHECK
2
3========= ERROR SUMMARY: 0 errors
$

UM有许多文档中已说明的系统要求，我不在这里一一列举。最主要的是，您需要计算能力（cc）3.0或更高的GPU。您的MCVE不包含任何标准错误检查，因此我也没有尝试添加。但是，如果您在运行此代码时仍然遇到问题，请务必使用正确的CUDA错误检查，并用cuda-memcheck运行它。

如果整个数据结构（包括其中内嵌的指针）是用普通的主机分配器分配的，而您又无法控制这一点，那么不经过某种繁琐的复制，您就无法在UM机制中直接使用它。唯一的例外是上面链接的编程指南章节中K.1.6节提到的IBM Power9系统。

在尝试与UM一起使用主机分配器（例如malloc）之前，您应该先测试pageableMemoryAccessUsesHostPageTables属性，如本节所述。

除了正确配置的IBM Power9系统之外，目前任何系统都不会设置该属性；目前也没有任何x86系统设置或支持此属性。

统一内存和数组结构

1 个答案: