Question

我正在尝试在设备上填充多个链接列表，然后将这些列表返回给主机。

根据我的理解，我需要为我的struct Element分配内存，但我不知道如何去做，因为我将有许多链表，每个链表都有未知数量的元素。我尝试了几种不同的东西，但它仍然没有用。所以我回到了起点。这是我的代码：

//NODE CLASS
class Node{
public:
    int x,y;
    Node *parent;
    __device__ __host__ Node(){}
    __device__ __host__ Node(int cX, int cY){x = cX; y = cY;}
    __device__ __host__ int get_row() { return x; }
    __device__ __host__ int get_col() { return y; }
};

//LINKED LIST
class LinkedList{
public:
    __device__ __host__ struct Element{
            Node n1;
            Element *next;
    };

    __device__ __host__ LinkedList(){
        head = NULL;
    }

    __device__ __host__ void addNode(Node n){
        Element *el = new Element();
        el->n1 = n;

        el->next = head;
        head = el;
    }

    __device__ __host__ Node popFirstNode(){
        Element *cur = head;
        Node n;

        if(cur != NULL){
            n = cur -> n1;
            head = head -> next;
        }

        delete cur;
        return n;
    }
    __device__ __host__ bool isEmpty(){
        Element *cur = head;

        if(cur == NULL){
            return true;
        }

        return false;
    }

    Element *head;
};

//LISTS
__global__ void listsKernel(LinkedList* d_Results, int numLists){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    Node n(1,1);

    if(idx < numLists){
        d_Results[idx].addNode(n);
        d_Results[idx].addNode(n);
        d_Results[idx].addNode(n);
        d_Results[idx].addNode(n);
    }
}

int main(){
    int numLists = 10;
    size_t size = numLists * sizeof(LinkedList); 
    LinkedList curList;

    LinkedList* h_Results = (LinkedList*)malloc(size);
    LinkedList* d_Results;
    cudaMalloc((void**)&d_Results, size);

    listsKernel<<<256,256>>>(d_Results, numLists);
    cudaMemcpy(h_Results, d_Results, sizeof(LinkedList)*numLists, cudaMemcpyDeviceToHost);

    for(int i = 0; i < numLists; i++){
        curList = h_Results[i];
        while(curList.isEmpty() == false){
            Node n = curList.popFirstNode();
            std::cout << "x: " << n.get_row() << " y: " << n.get_col();
        }
    }
}

正如您所看到的，我正在尝试在设备上填充10个链接列表，然后将它们返回给主机，但上面的代码会导致未处理的异常 - 访问冲突读取位置。我假设它没有处理来自设备的指针。

任何帮助都会很棒。

Answer 1

只要注意代码，看起来你有一个基本的误解：有无法从设备访问的主机内存，以及无法从主机访问的设备内存。因此，当您在设备内存中创建链接列表节点并将指针复制回主机时，主机无法取消引用这些指针，因为它们指向设备内存。

如果您真的想在主机和设备之间来回传递链接列表，最好的办法是将条目复制到数组中，执行memcpy，然后将数组复制回链表。其他事情也可以做，具体取决于您的用例。

（可以从设备中分配一个可从主机和访问的内存区域，但是它有一些尴尬，我没有使用它的经验）

CUDA从设备到主机复制链接列表

1 个答案: