Loop unrolling with dynamic parallelism decreases time performance

Date: 2014-09-23 05:19:36

Tags: cuda parallel-processing

I have a simple program that squares the elements of an array, with the loop unrolling done as follows.

Loop unrolling

#include <stdio.h>
#include <time.h>
#include <cuda.h>

// Kernel that executes on the CUDA device:
// each thread squares two consecutive elements (unroll factor 2)
__global__ void first(float *arr, int N)
{
  int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
  for (int q = 0; q < 2; q++)
  {
    if (idx + q < N)
      arr[idx + q] = arr[idx + q] * arr[idx + q];
  }
}



// main routine that executes on the host
int main(void)
{
  clock_t start = clock(), diff;
  float *a_h, *a_d;                    // pointers to host & device arrays
  const int N = 1000;                  // number of elements in the arrays
  size_t size = N * sizeof(float);
  a_h = (float *)malloc(size);         // allocate array on host
  cudaMalloc((void **) &a_d, size);    // allocate array on device
  // Initialize host array and copy it to the CUDA device
  for (int i = 0; i < N; i++) a_h[i] = (float)i;
  cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
  // Do calculation on device: 4 blocks x 128 threads, 2 elements per thread
  first <<< 4, 128 >>> (a_d, N);
  // Retrieve result from device and store it in the host array
  // (this copy also synchronizes with the kernel)
  cudaMemcpy(a_h, a_d, sizeof(float) * N, cudaMemcpyDeviceToHost);
  // Print results
  for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);
  // Cleanup
  free(a_h); cudaFree(a_d);
  diff = clock() - start;
  int msec = diff * 1000 / CLOCKS_PER_SEC;
  printf("Time taken %d seconds %d milliseconds\n", msec / 1000, msec % 1000);
  return 0;
}

Then I realised that this loop computation could be minimised with dynamic parallelism, implemented as follows.

Unrolling with dynamic parallelism

#include <stdio.h>
#include <time.h>
#include <cuda.h>

__global__ void square(float *a, int N, int idx);

// Parent kernel: each thread launches a two-thread child grid
// instead of unrolling the loop itself
__global__ void first(float *arr, int N)
{
  int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
  square <<< 1, 2 >>> (arr, N, idx);
}

// Child kernel: each of the two threads squares one element
__global__ void square(float *a, int N, int idx)
{
  int tdx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx + tdx < N)
    a[idx + tdx] = a[idx + tdx] * a[idx + tdx];
}

// main routine that executes on the host
int main(void)
{
  clock_t start = clock(), diff;
  float *a_h, *a_d;                    // pointers to host & device arrays
  const int N = 1000;                  // number of elements in the arrays
  size_t size = N * sizeof(float);
  a_h = (float *)malloc(size);         // allocate array on host
  cudaMalloc((void **) &a_d, size);    // allocate array on device
  // Initialize host array and copy it to the CUDA device
  for (int i = 0; i < N; i++) a_h[i] = (float)i;
  cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
  // Do calculation on device: each parent thread launches a child grid
  first <<< 4, 128 >>> (a_d, N);
  // Retrieve result from device and store it in the host array
  // (this copy also synchronizes with the kernels)
  cudaMemcpy(a_h, a_d, sizeof(float) * N, cudaMemcpyDeviceToHost);
  // Print results
  for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);
  // Cleanup
  free(a_h); cudaFree(a_d);
  diff = clock() - start;
  int msec = diff * 1000 / CLOCKS_PER_SEC;
  printf("Time taken %d seconds %d milliseconds\n", msec / 1000, msec % 1000);
  return 0;
}

The implementation with dynamic parallelism plus unrolling takes more time to execute than unrolling alone. Aren't we supposed to improve execution time with dynamic parallelism in a case like this?

1 answer:

Answer 0 (score: 3)

Dynamic parallelism is mainly useful in cases where the parallelism is dynamic, that is, where you do not know how much parallelism you will need until you have done some computation. Instead of transferring data back to the host only to feed it straight into the launch parameters of another kernel, you launch that kernel from within the device. In this pattern the memcpys between kernel launches are avoided, and that is where you see speedups.
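A minimal sketch of that pattern, with hypothetical names (parent, child, work_count) that are not part of the question's code: the parent reads the amount of work that was computed on the device and launches the child grid directly, so nothing has to travel back to the host between launches.

// Hypothetical sketch of the dynamic-parallelism pattern described above.
// The amount of work is only known on the device, so the parent kernel
// sizes and launches the child grid itself instead of copying the count
// back to the host.
__global__ void child(float *data, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] = data[i] * data[i];
}

__global__ void parent(float *data, int *work_count)
{
  if (blockIdx.x == 0 && threadIdx.x == 0)
  {
    int n = *work_count;                     // result of an earlier device-side computation
    int threads = 128;
    int blocks  = (n + threads - 1) / threads;
    child <<< blocks, threads >>> (data, n); // no cudaMemcpy back to the host
  }
}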

In the example above, that is not the case: you could simply have launched twice as many threads from the host. Nothing dynamic is needed, because the amount of parallelism was already known at the time of the first kernel launch.
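For comparison, a minimal sketch of doing the same work without dynamic parallelism (square_all is a hypothetical name, not from the question): launch one thread per element directly from the host.

// Hypothetical equivalent without dynamic parallelism: launch enough
// threads from the host so that each one squares a single element.
__global__ void square_all(float *arr, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N)
    arr[idx] = arr[idx] * arr[idx];
}

// host side, covering all N = 1000 elements with 8 blocks of 128 threads:
//   square_all <<< 8, 128 >>> (a_d, N);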

Furthermore, kernels launched with dynamic parallelism have performance requirements similar to kernels launched from the host: you must launch a reasonable amount of work, or the launch latency will dominate your computation time.
