When copying one array to another array in shared memory, I tried six different approaches (see the comments in the program). After discussion and testing, my conclusions are: (1) memcpy is not any faster than an element-by-element copy of the array; (2) for small arrays, approach 3 is best, while for larger arrays, approach 6 is best. (A block-stride sketch of approach 6 appears after the listing below.)
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
const int NUM_OF_BLOCKS = 1;
const int NUM_OF_THREADS_PER_BLOCK = 8;
const int NUM_OF_DATA = 6000;
const int NUM_OF_COPIES= 1000;
//const int NUM_OF_COPIES= 1000000;
cudaError_t cuda_status;
__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int src[NUM_OF_DATA];
__shared__ int dest[NUM_OF_DATA];
using namespace std;
__device__ void init(){
    unsigned int num_of_data_per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
    unsigned int extra_data = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;
    int size[NUM_OF_THREADS_PER_BLOCK];
    start_index[threadIdx.x] = threadIdx.x * num_of_data_per_thread;
    if (threadIdx.x < extra_data){
        start_index[threadIdx.x] = start_index[threadIdx.x] + threadIdx.x;
        size[threadIdx.x] = num_of_data_per_thread + 1;
    }else{
        start_index[threadIdx.x] = start_index[threadIdx.x] + extra_data;
        size[threadIdx.x] = num_of_data_per_thread;
    }
    end_index[threadIdx.x] = start_index[threadIdx.x] + size[threadIdx.x] - 1;
    //printf("start_index[%d] = %d, end_index[%d] = %d\n", threadIdx.x, start_index[threadIdx.x], threadIdx.x, end_index[threadIdx.x]);
}
__device__ void inc_src_data(int* src){
    int i;
    for (i = 0; i < NUM_OF_DATA; i++, src++){
        *src += 1;
    }
    //__threadfence_block();
}
template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
    int i;
    switch (sel){
    case 1:
        // Approach 1: every thread executes memcpy
        memcpy(dest, src, NUM_OF_DATA * sizeof(int));
        break;
    case 2:
        // Approach 2: one thread executes memcpy and then threadfence
        if (threadIdx.x == 0){
            memcpy(dest, src, NUM_OF_DATA * sizeof(int));
            __threadfence_block();
        }
        break;
    case 3:
        // Approach 3: every thread copies each element individually
        for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
            *dest = *src;
        //__threadfence_block(); // added this line to demonstrate timing difference
        break;
    case 4:
        // Approach 4: one thread copies each element individually and then threadfence
        if (threadIdx.x == 0)
            for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
                *dest = *src;
        __threadfence_block();
        break;
    case 5:
        // Approach 5: every thread executes memcpy and then threadfence
        memcpy(dest + start_index[threadIdx.x], src + start_index[threadIdx.x], (end_index[threadIdx.x] - start_index[threadIdx.x] + 1) * sizeof(int));
        __threadfence_block();
        break;
    case 6:
        // Approach 6: every thread copies each element individually and then threadfence
        for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
            dest[i] = src[i];
        }
        __threadfence_block();
        break;
    default:
        assert(0);
        break;
    }
}
template <int sel>
__global__ void copy_data_test(int* data){
    init();
    copy_to_dest_array<sel>(data, src);
    for (int i = 0; i < NUM_OF_COPIES; i++){
        inc_src_data(src);
        copy_to_dest_array<sel>(&src[0], &dest[0]);
    }
    copy_to_dest_array<sel>(dest, data);
}
template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice);
    cudaEventRecord(start);
    copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
    cudaEventRecord(stop);
    cout << "kernel error: " << cudaGetErrorString(cudaPeekAtLastError()) << "---" << cudaGetErrorString(cudaDeviceSynchronize()) << endl;
    cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);
    float et;
    cudaEventElapsedTime(&et, start, stop);
    cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
    /*
    cout << "after kernel processing" << endl;
    for (int i = 0; i < NUM_OF_DATA; i++)
        cout << rdata[i] << " ";
    cout << endl;
    */
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}
int main(int argc, char **argv){
    int h_data[NUM_OF_DATA];
    int r_data[NUM_OF_DATA];
    int* d_data;
    int i;
    cudaSetDevice(0);
    srand(time(NULL));
    /*
    cout << "before kernel processing" << endl;
    for (i = 0; i < NUM_OF_DATA; i++){
        h_data[i] = rand()%100;
        cout << h_data[i] << " ";
    }
    cout << endl;
    */
    cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);
    run_test<1>(r_data, h_data, d_data);
    run_test<2>(r_data, h_data, d_data);
    run_test<3>(r_data, h_data, d_data);
    run_test<4>(r_data, h_data, d_data);
    run_test<5>(r_data, h_data, d_data);
    run_test<6>(r_data, h_data, d_data);
    return 0;
}
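For comparison, approach 6 can also be written with the common block-stride loop idiom, which splits the work across the block for any array length without precomputing per-thread start/end indices. This is only a sketch under the same single-block setup as above; block_stride_copy is a hypothetical helper (not part of the program), and it uses __syncthreads() rather than __threadfence_block(), since a cooperative copy normally needs a block-wide barrier rather than just a fence.

__device__ void block_stride_copy(const int* src, int* dest, int n){
    // Each thread handles elements threadIdx.x, threadIdx.x + blockDim.x, ...
    for (int i = threadIdx.x; i < n; i += blockDim.x){
        dest[i] = src[i];
    }
    // Block-wide barrier: every element has been written and is visible to
    // all threads in the block once this returns.
    __syncthreads();
}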
Answer 0 (score: 1)
It seems fairly clear that __threadfence_block() is an expensive operation. The 4 longest test cases all use __threadfence_block(); the two shortest ones do not.

If I add __threadfence_block() to the 3rd (i.e. shortest) test case, the time (for me) goes from ~2 seconds to ~17 seconds.
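To see the cost of the fence in isolation, a small standalone kernel along these lines could be timed (this is only a sketch; fence_cost_test is a hypothetical kernel that borrows the constants from the code below, not something in the original program):

template <bool USE_FENCE>
__global__ void fence_cost_test(int iterations, int *out){
    // volatile keeps the compiler from collapsing the repeated shared-memory
    // writes, so the only difference between the <true> and <false>
    // instantiations is the __threadfence_block() call itself.
    __shared__ volatile int buf[NUM_OF_THREADS_PER_BLOCK];
    for (int it = 0; it < iterations; it++){
        buf[threadIdx.x] = it;
        if (USE_FENCE) __threadfence_block();
    }
    out[threadIdx.x] = buf[threadIdx.x];  // store a result so the loop is not dead code
}
// e.g. fence_cost_test<true><<<1, NUM_OF_THREADS_PER_BLOCK>>>(NUM_OF_COPIES, d_data);
//      fence_cost_test<false><<<1, NUM_OF_THREADS_PER_BLOCK>>>(NUM_OF_COPIES, d_data);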
Note that your test cases are not all doing exactly the same thing, as the differences in the output results demonstrate. I made some modifications to the code to demonstrate this more clearly:
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <assert.h>
const int NUM_OF_BLOCKS = 1;
const int NUM_OF_THREADS_PER_BLOCK = 8;
const int NUM_OF_DATA = 50;
const int NUM_OF_COPIES= 10000000;
cudaError_t cuda_status;
__shared__ int start_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int end_index[NUM_OF_THREADS_PER_BLOCK];
__shared__ int src[NUM_OF_DATA];
__shared__ int dest[NUM_OF_DATA];
using namespace std;
__device__ void init(){
    unsigned int num_of_data_per_thread = NUM_OF_DATA / NUM_OF_THREADS_PER_BLOCK;
    unsigned int extra_data = NUM_OF_DATA % NUM_OF_THREADS_PER_BLOCK;
    int size[NUM_OF_THREADS_PER_BLOCK];
    start_index[threadIdx.x] = threadIdx.x * num_of_data_per_thread;
    if (threadIdx.x < extra_data){
        start_index[threadIdx.x] = start_index[threadIdx.x] + threadIdx.x;
        size[threadIdx.x] = num_of_data_per_thread + 1;
    }else{
        start_index[threadIdx.x] = start_index[threadIdx.x] + extra_data;
        size[threadIdx.x] = num_of_data_per_thread;
    }
    end_index[threadIdx.x] = start_index[threadIdx.x] + size[threadIdx.x] - 1;
}
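// For example, with NUM_OF_DATA = 50 and NUM_OF_THREADS_PER_BLOCK = 8, init() above
// gives threads 0-1 seven elements each (indices 0-6 and 7-13) and threads 2-7 six
// elements each (indices 14-19, ..., 44-49).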
__device__ void inc_src_data(int* src){
    int i;
    for (i = 0; i < NUM_OF_DATA; i++, src++){
        *src += 1;
    }
    //__threadfence_block();
}
template <int sel>
__device__ void copy_to_dest_array(int* src, int* dest){
    int i;
    switch (sel){
    case 1:
        // Approach 1: every thread executes memcpy
        memcpy(dest, src, NUM_OF_DATA);
        break;
    case 2:
        // Approach 2: one thread executes memcpy and then threadfence
        if (threadIdx.x == 0){
            memcpy(dest, src, NUM_OF_DATA);
            __threadfence_block();
        }
        break;
    case 3:
        // Approach 3: every thread copies each element individually
        for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
            *dest = *src;
        __threadfence_block(); // added this line to demonstrate timing difference
        break;
    case 4:
        // Approach 4: one thread copies each element individually and then threadfence
        if (threadIdx.x == 0)
            for (i = 0; i < NUM_OF_DATA; i++, dest++, src++)
                *dest = *src;
        __threadfence_block();
        break;
    case 5:
        // Approach 5: every thread executes memcpy and then threadfence
        memcpy(dest + start_index[threadIdx.x], src + start_index[threadIdx.x], end_index[threadIdx.x] - start_index[threadIdx.x] + 1);
        __threadfence_block();
        break;
    case 6:
        // Approach 6: every thread copies each element individually and then threadfence
        for (i = start_index[threadIdx.x]; i <= end_index[threadIdx.x]; i++){
            *(dest + i) = *(src + i);
        }
        __threadfence_block();
        break;
    default:
        assert(0);
        break;
    }
}
template <int sel>
__global__ void copy_data_test(int* data){
    init();
    copy_to_dest_array<sel>(data, src);
    for (int i = 0; i < NUM_OF_COPIES; i++){
        inc_src_data(src);
        copy_to_dest_array<sel>(&src[0], &dest[0]);
    }
    copy_to_dest_array<sel>(dest, data);
}
template <int sel>
void run_test(int *rdata, int *hdata, int *ddata){
    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaMemcpy(ddata, hdata, NUM_OF_DATA * sizeof(int), cudaMemcpyHostToDevice);
    cudaEventRecord(start);
    copy_data_test<sel><<<NUM_OF_BLOCKS, NUM_OF_THREADS_PER_BLOCK>>>(ddata);
    cudaEventRecord(stop);
    cout << "kernel error: " << cudaGetErrorString(cudaPeekAtLastError()) << "---" << cudaGetErrorString(cudaDeviceSynchronize()) << endl;
    cudaMemcpy(rdata, ddata, NUM_OF_DATA * sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);
    float et;
    cudaEventElapsedTime(&et, start, stop);
    cout << "Trial " << sel << " elapsed time: " << et << "ms" << endl;
    cout << "after kernel processing" << endl;
    for (int i = 0; i < NUM_OF_DATA; i++)
        cout << rdata[i] << " ";
    cout << endl;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}
int main(int argc, char **argv){
    int h_data[NUM_OF_DATA];
    int r_data[NUM_OF_DATA];
    int* d_data;
    int i;
    cudaSetDevice(0);
    srand(time(NULL));
    cout << "before kernel processing" << endl;
    for (i = 0; i < NUM_OF_DATA; i++){
        h_data[i] = rand()%100;
        cout << h_data[i] << " ";
    }
    cout << endl;
    cudaMalloc(&d_data, sizeof(int) * NUM_OF_DATA);
    run_test<1>(r_data, h_data, d_data);
    run_test<2>(r_data, h_data, d_data);
    run_test<3>(r_data, h_data, d_data);
    run_test<4>(r_data, h_data, d_data);
    run_test<5>(r_data, h_data, d_data);
    run_test<6>(r_data, h_data, d_data);
    return 0;
}
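For reference, both versions should build with something like nvcc -arch=sm_20 -o copy_test copy_test.cu (the file name is just an example); the in-kernel printf and assert they rely on require a device of compute capability 2.0 or higher.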