How do I properly copy memory from the device back to the host in CUDA?

Time: 2014-10-01 16:23:26

Tags: cuda copy gpu

I am trying to simply increment a few matrix values in parallel in CUDA and then copy them back to host memory. But when I print them out after the kernel returns, the values are still the same. I even tried running the program with a single thread, but had no luck. Any help would be greatly appreciated.

My code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>

#define BLOCK_SIZE 1024
#define MAX_N       100000000
#define MAX_THREADS     1024

int num_threads;
int count;              // Count of threads that have updated their partition
int size;
//int increment; // VS
int * inc2;
//int my_start;


//Host data
int * thread_ids;

//nvcc -arch=sm_20 -o nbody.exe nbody.cu (compilation)

__global__ void pcyc_red(float * a, float * b, float * c, float * D, float * X, 
                    float * a2, float * b2, float * c2, float * D2,
                    int * inc2_dev, int * size_dev, int * num_threads_dev){

//__threadfence();
int thread_id = threadIdx.x + (blockIdx.x * blockDim.x);
float k1;
float k2;
int i;

int start = 0;
//int end = size_dev-1;
//int inc2_dev = inc2_dev1[0];
//int inc_dev = *inc_dev1;
//int size_dev = size_dev1[0];
int nthreads = num_threads_dev[0];
//Thread work assignment
int chunk_size = size_dev[0]/nthreads;
int my_start = thread_id*(chunk_size);
int my_end = start + ((thread_id + 1)*chunk_size - 1);
//__threadfence();
__syncthreads();
//Forward Reduction
for(i = my_start; i <= my_end; ++i){
    a[i] = a[i]++;
    b[i] = b[i]++;
    c[i] = c[i]++;
    D[i] = D[i]++;
    X[i] = X[i]++;
}

__threadfence();
//__syncthreads();
}//Device Function


float* init_vector(int size){
float* output;
output = (float*) calloc(size, sizeof(float));
int i;
for(i = 0; i < size; ++i){
    output[i] = 2.0;
}
return output;
}

float* init_vector_ac(int s){
//s will be used for size-1 not to be confused for size.
float* output;
output = (float*) calloc(s, sizeof(float));
int i;
for(i = 0; i < s; ++i){
    output[i] = -1.0;
}
return output;
}

// Main program 
int main(int argc, char *argv[]) {

//num_threads -> atoi(argv[argc-1]); 
//struct timeval start, stop; 
float total_time;
int i;

///Host structures
float* a;
float* b;
float* c;
float* D;
float* X;

//increment = 2; // VS
inc2 = (int*) malloc(sizeof(int));
inc2[0] = 1;
//size = (int*) malloc(sizeof(int));
//num_threads = (int*) malloc(sizeof(int));
//my_start = 0;
//wait_flag = false;

///Device Data
//SYSTEM * sys_dev;
float * a_dev;
float * b_dev;
float * c_dev;
float * D_dev;
float * X_dev;

float * a2_dev;
float * b2_dev;
float * c2_dev;
float * D2_dev;
//float * X2_dev;

//int * inc_dev;
int * inc2_dev;
//int * mstart_dev;
int * size_dev;
int * num_threads_dev;
int result_var;

//int final_inc2;

cudaEvent_t start, stop;    // GPU timing variables
//struct timeval cpu_start, cpu_stop; // CPU timing variables
   // float time_array[10]; 

// Timing initializations
cudaEventCreate(&start);
cudaEventCreate(&stop);

if (argc != 3) 
{
    printf("Use: <executable_name> <size> <num_threads>\n"); 
    exit(0);
}
if ((size = atoi(argv[argc-2])) > MAX_N) 
{
    printf("Maximum number of nodes allowed: %d\n", MAX_N);
    exit(0);
}; 

if ((num_threads = atoi(argv[argc-1])) > MAX_THREADS) 
{
    printf("Maximum number of threads allowed: %d.\n", MAX_THREADS);
    exit(0);
}; 

int size_array = (size) * sizeof(float);
int size_array2 = (size - 1) * sizeof(float);

// Initialize host tridiagonal matrix
a = init_vector_ac(size-1); // a[i] = -1.0
b = init_vector(size);      // b[i] = 2.0
c = init_vector_ac(size-1); // c[i] = -1.0
D = init_vector(size);      // D[i] = 2.0
X = init_vector(size);      // X[i] = 2.0

//xs = init_vector_err(size);   

// Shift elements of a by 1
for(i = size-1; i > 0; i--) a[i] = a[i-1];
a[0] = 0.0;


thread_ids = (int*) calloc(num_threads, sizeof(int));

count = 0;

for(i = 0; i < num_threads; ++i){
    thread_ids[i] = i;
}
//Cuda Operation

cudaEventRecord( start, 0);

cudaMalloc((void **) &a_dev, size);
cudaMalloc((void **) &b_dev, size);
cudaMalloc((void **) &c_dev, size);
cudaMalloc((void **) &D_dev, size);
cudaMalloc((void **) &X_dev, size);
cudaMalloc((void **) &a2_dev, size);
cudaMalloc((void **) &b2_dev, size);
cudaMalloc((void **) &c2_dev, size);
cudaMalloc((void **) &D2_dev, size);
//cudaMalloc((void**)&inc_dev, sizeof(int));
cudaMalloc((void**)&inc2_dev, sizeof(int));
//cudaMalloc((void**)&mstart_dev, sizeof(int));
cudaMalloc((void**)&size_dev, sizeof(int));
cudaMalloc((void**)&num_threads_dev, sizeof(int));


cudaMemcpy(a_dev, a, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(c_dev, c, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(D_dev, D, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(X_dev, X, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(a2_dev, a, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(b2_dev, b, size_array, cudaMemcpyHostToDevice);
cudaMemcpy(c2_dev, c, size_array2, cudaMemcpyHostToDevice);
cudaMemcpy(D2_dev, D, size_array, cudaMemcpyHostToDevice);

//cudaMemcpy(inc_dev, &increment, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(inc2_dev, inc2, sizeof(int), cudaMemcpyHostToDevice);
//cudaMemcpy(mstart_dev, &my_start, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(size_dev, &size, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(num_threads_dev, &num_threads, sizeof(int), cudaMemcpyHostToDevice);

cudaDeviceSynchronize();
pcyc_red<<<1, num_threads>>>(a_dev, b_dev, c_dev, D_dev, X_dev,
                            a2_dev, b2_dev, c2_dev, D2_dev,
                            inc2_dev, size_dev, num_threads_dev);
cudaDeviceSynchronize();

cudaMemcpy(X, X_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(a, a_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(b, b_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(c, c_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(D, D_dev, size_array, cudaMemcpyDeviceToHost);
cudaMemcpy(inc2, inc2_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&result_var, num_threads_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&total_time, start, stop);

printf("Final Var: %d\n\n", inc2[0]);
printf("Num Threads Var: %d\n\n", result_var);

for(i = 0; i < size; ++i){
    printf("a=%8.4f \n", a[i]); 
    printf("b=%8.4f \n", b[i]); 
    printf("c=%8.4f \n", c[i]); 
    printf("D=%8.4f \n", D[i]); 
    printf("X=%8.4f \n", X[i]); 
}

printf("Threads = %d, matrix_size = %d, time = %f\n", 
    num_threads, size, total_time);

cudaFree(a_dev);
cudaFree(b_dev);
cudaFree(c_dev);
cudaFree(D_dev);
cudaFree(X_dev);
//cudaFree(inc_dev);
cudaFree(inc2_dev);
//cudaFree(mstart_dev);
//cudaFree(size_dev);
//cudaFree(num_threads_dev);

}//end of main

1 Answer:

Answer 0 (score: 4)

Add proper cuda error checking to your code.
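
For reference, a minimal sketch of such an error-checking wrapper (the macro name cudaCheck is an illustrative choice, not something from the linked answer):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Wrap every CUDA runtime call so a failure is reported immediately,
// with the file and line where it happened.
#define cudaCheck(call)                                               \
    do {                                                              \
        cudaError_t err__ = (call);                                   \
        if (err__ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",           \
                    cudaGetErrorString(err__), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Example usage with calls from the question:
//   cudaCheck(cudaMalloc((void **) &b_dev, size_array));
//   cudaCheck(cudaMemcpy(X, X_dev, size_array, cudaMemcpyDeviceToHost));
// A kernel launch returns no status directly, so check both the launch
// and the execution afterwards:
//   pcyc_red<<<1, num_threads>>>(...);
//   cudaCheck(cudaGetLastError());
//   cudaCheck(cudaDeviceSynchronize());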

One problem I can see is that your allocation sizes do not match your array sizes. To pick just a couple of examples:

int size_array = (size) * sizeof(float);
...
cudaMalloc((void **) &b_dev, size);  // size should probably be size_array here
...                          ^^^^
cudaMemcpy(b_dev, b, size_array, cudaMemcpyHostToDevice);  // this won't work, will throw error
                     ^^^^^^^^^^

The above is definitely an error, and there are several of this type in the code. You may also have a machine configuration problem (CUDA not installed correctly, etc.), which error checking would also indicate.
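
As a sketch of the kind of fix this implies, reusing the question's own size_array and size_array2 variables and assuming a and c are meant to hold size-1 floats as allocated on the host, the device allocations and the copies back would use matching byte counts:

// Allocate device buffers with byte counts matching the host arrays.
cudaMalloc((void **) &a_dev, size_array2);   // a: (size-1) floats
cudaMalloc((void **) &b_dev, size_array);    // b: size floats
cudaMalloc((void **) &c_dev, size_array2);   // c: (size-1) floats
cudaMalloc((void **) &D_dev, size_array);
cudaMalloc((void **) &X_dev, size_array);

// ...host-to-device copies and kernel launch as before...

// Copy back with the same byte counts used for the allocations.
cudaMemcpy(a, a_dev, size_array2, cudaMemcpyDeviceToHost);
cudaMemcpy(b, b_dev, size_array,  cudaMemcpyDeviceToHost);
cudaMemcpy(c, c_dev, size_array2, cudaMemcpyDeviceToHost);
cudaMemcpy(D, D_dev, size_array,  cudaMemcpyDeviceToHost);
cudaMemcpy(X, X_dev, size_array,  cudaMemcpyDeviceToHost);

Each of these calls would ideally be wrapped in the error-checking macro sketched above, so a size mismatch shows up as an explicit error rather than as silently unchanged data.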