初学者帮助CUDA代码性能

时间:2015-04-07 08:54:25

标签: cuda

我刚刚开始学习cuda,我想知道CUDA代码与cpu代码在简单字符串搜索上的表现。

搜索方法:如果数据字符串的前x个字符与关键字完全相同,则返回true。(x为关键字的长度)

有100个关键字和10000个数据字符串。我在这里想要完成的是以并发方式进行比较并比较经过的时间。我写了4个不同的内核和一个cpu代码。然而,我得到的结果相当令人困惑。

searchKeywordKernel:创建4 * 32个线程。每个线程获取一个关键字并将其与10000个数据字符串进行比较,然后将结果写入bool数组。耗时2650ms。

searchKeywordKernel2:创建10 * 1024个线程。每个线程获取一个数据字符串并将其与100个关键字进行比较,然后将结果写入bool数组。花了1397ms。

searchKeywordKernel3:创建1 * 1线程。它的行为类似于cpu代码,生成结果需要279ms。

searchKeywordKernel4:创建977 * 1024个线程。每个线程进行一次字符串比较,花费1334ms。

CPU:进行1000000次字符串比较。花了265毫秒。

我想问几个问题:

为什么searchKeywordKernel3在与cpu代码相似的时间内生成结果?我仔细检查了代码,但没有发现任何问题。

为什么除了searchKeywordKernel3之外,cpu代码比其他所有内核都快?是因为读取操作还是数据大小?

硬件信息:显卡:NVidia GT730,处理器:Intel i5-4460。

用于生成结果的代码是:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <chrono>
#include <cstdlib>
#include <iostream>

#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 10000
#define STRINGSIZE 250

using namespace std;

// Kernel 1: one thread per keyword.
//
// Each thread with a global index below SEARCHTERMSIZE takes the keyword at
// that index and tests it against every one of the SEARCHITEMSIZE data
// strings, writing result[keyword * SEARCHITEMSIZE + dataString] = true when
// the data string starts with the keyword, false otherwise.
//
// Layout read by the code: keyword and data are arrays of NUL-terminated
// strings placed at a fixed stride of STRINGSIZE chars.
// Launch: 1D grid with at least SEARCHTERMSIZE threads in total; surplus
// threads exit immediately.
__global__ void searchKeywordKernel(bool* result, char* data, char* keyword)
{
    const int keywordStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
    if (keywordStringIndex >= SEARCHTERMSIZE)
        return;

    // Loop-invariant: this thread's keyword and its row of the result matrix.
    const char* const myKeyword = keyword + keywordStringIndex * STRINGSIZE;
    bool* const myResults = result + keywordStringIndex * SEARCHITEMSIZE;

    for (int dataStringIndex = 0; dataStringIndex < SEARCHITEMSIZE; ++dataStringIndex)
    {
        const char* const candidate = data + dataStringIndex * STRINGSIZE;

        // Accumulate the verdict in a register and store it once, instead of
        // storing true to global memory and then overwriting it with false.
        bool isPrefix = true;
        for (int i = 0; myKeyword[i] != '\0'; ++i)
        {
            // myKeyword[i] is never '\0' here, so a data string that ends
            // early is rejected by this same inequality.
            if (myKeyword[i] != candidate[i])
            {
                isPrefix = false;
                break;
            }
        }
        myResults[dataStringIndex] = isPrefix;
    }
}
// Kernel 2: one thread per data string.
//
// Each thread with a global index below SEARCHITEMSIZE takes the data string
// at that index and tests every one of the SEARCHTERMSIZE keywords against
// it, writing result[keyword * SEARCHITEMSIZE + dataString] = true when the
// data string starts with the keyword, false otherwise.
//
// Layout read by the code: keyword and data are arrays of NUL-terminated
// strings placed at a fixed stride of STRINGSIZE chars.
// Launch: 1D grid with at least SEARCHITEMSIZE threads in total; surplus
// threads exit immediately.
__global__ void searchKeywordKernel2(bool* result, char* data, char* keyword)
{
    const int dataStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
    if (dataStringIndex >= SEARCHITEMSIZE)
        return;

    // Loop-invariant: the one data string this thread is responsible for.
    const char* const myData = data + dataStringIndex * STRINGSIZE;

    for (int keywordStringIndex = 0; keywordStringIndex < SEARCHTERMSIZE; ++keywordStringIndex)
    {
        const char* const candidate = keyword + keywordStringIndex * STRINGSIZE;

        // Keep the verdict in a register; a single global store per result.
        bool isPrefix = true;
        for (int i = 0; candidate[i] != '\0'; ++i)
        {
            // candidate[i] is never '\0' here, so a data string that ends
            // early is rejected by this same inequality.
            if (candidate[i] != myData[i])
            {
                isPrefix = false;
                break;
            }
        }
        result[keywordStringIndex * SEARCHITEMSIZE + dataStringIndex] = isPrefix;
    }
}
// Kernel 3: serial reference on the device.
//
// Only the thread with global index 0 does any work; it performs all
// SEARCHTERMSIZE * SEARCHITEMSIZE prefix comparisons one after another and
// writes result[keyword * SEARCHITEMSIZE + dataString].
//
// Fix: the data-string index is now restarted at 0 for every keyword. The
// previous version declared it once outside both loops, so after the first
// keyword the inner loop never executed again and only the first
// SEARCHITEMSIZE entries of result were written.
// NOTE(review): with all comparisons actually performed, expect this kernel
// to run far longer than before; confirm it fits any display watchdog limit.
__global__ void searchKeywordKernel3(bool* result, char* data, char* keyword)
{
    if (threadIdx.x + blockIdx.x * blockDim.x >= 1)
        return;

    for (int keywordStringIndex = 0; keywordStringIndex < SEARCHTERMSIZE; ++keywordStringIndex)
    {
        const char* const currentKeyword = keyword + keywordStringIndex * STRINGSIZE;

        // dataStringIndex must restart for each keyword.
        for (int dataStringIndex = 0; dataStringIndex < SEARCHITEMSIZE; ++dataStringIndex)
        {
            const char* const candidate = data + dataStringIndex * STRINGSIZE;

            bool isPrefix = true;
            for (int i = 0; currentKeyword[i] != '\0'; ++i)
            {
                // currentKeyword[i] is never '\0' here, so a data string
                // that ends early is rejected by this same inequality.
                if (currentKeyword[i] != candidate[i])
                {
                    isPrefix = false;
                    break;
                }
            }
            // Same row-major slot the running counter of the original visited.
            result[keywordStringIndex * SEARCHITEMSIZE + dataStringIndex] = isPrefix;
        }
    }
}
// Kernel 4: one thread per (keyword, data string) pair.
//
// The global thread index id is split into keyword = id / SEARCHITEMSIZE and
// dataString = id % SEARCHITEMSIZE; the thread writes result[id] = true when
// that data string starts with that keyword, false otherwise.
//
// Layout read by the code: keyword and data are arrays of NUL-terminated
// strings placed at a fixed stride of STRINGSIZE chars.
// Launch: 1D grid with at least SEARCHTERMSIZE * SEARCHITEMSIZE threads in
// total; surplus threads exit immediately.
__global__ void searchKeywordKernel4(bool* result, char* data, char* keyword)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id >= SEARCHTERMSIZE * SEARCHITEMSIZE)
        return;

    const int keywordStringIndex = id / SEARCHITEMSIZE;
    const int dataStringIndex = id % SEARCHITEMSIZE;

    const char* const myKeyword = keyword + keywordStringIndex * STRINGSIZE;
    const char* const myData = data + dataStringIndex * STRINGSIZE;

    // Keep the verdict in a register; a single global store per result.
    bool isPrefix = true;
    for (int i = 0; myKeyword[i] != '\0'; ++i)
    {
        // myKeyword[i] is never '\0' here, so a data string that ends early
        // is rejected by this same inequality.
        if (myKeyword[i] != myData[i])
        {
            isPrefix = false;
            break;
        }
    }
    // keywordStringIndex * SEARCHITEMSIZE + dataStringIndex == id
    result[id] = isPrefix;
}

// Signature shared by searchKeywordKernel .. searchKeywordKernel4.
typedef void (*searchKernel_t)(bool*, char*, char*);

// Terminates the program with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

// Launches one search kernel, prints its elapsed time measured with CUDA
// events, and copies the device result matrix into hostResult.
//
// The device result buffer is zeroed (all false) before the launch, outside
// the timed region, so entries a kernel fails to write cannot inherit values
// left behind by an earlier kernel that shared the same buffer.
static void runTimedKernel(const char* label, searchKernel_t kernel, int blocks, int threads,
                           bool* d_result, char* d_data, char* d_keyword, bool* hostResult)
{
    const size_t resultBytes = sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE;
    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;

    checkCuda(cudaMemset(d_result, 0, resultBytes), "cudaMemset(d_result)");

    cout << "Before " << label << endl;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    kernel<<<blocks, threads>>>(d_result, d_data, d_keyword);
    checkCuda(cudaGetLastError(), "kernel launch");  // invalid launch configuration etc.

    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "kernel execution");
    checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    // Each event is created once per measurement and released here; the
    // earlier code re-created both events for every section and never
    // destroyed any of them.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cout << "After " << label << ": " << elapsedTime << "ms" << endl;

    checkCuda(cudaMemcpy(hostResult, d_result, resultBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(result, DeviceToHost)");
}

// Benchmark driver: builds SEARCHITEMSIZE data strings and SEARCHTERMSIZE
// keywords (all made of 'a', random lengths), runs the four GPU kernels and
// an equivalent CPU loop, prints the time of every phase, and reports for
// each kernel whether its result matrix equals the CPU one.
int main()
{
    typedef chrono::duration<int, milli> millisecs_t;
    chrono::steady_clock::time_point startTime;
    chrono::steady_clock::time_point endTime;

    const size_t dataBytes = sizeof(char) * SEARCHITEMSIZE * STRINGSIZE;
    const size_t keywordBytes = sizeof(char) * SEARCHTERMSIZE * STRINGSIZE;
    const size_t resultBytes = sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE;

    //////////Search Data Init/////////////////
    cout << "Before Search Data Init" << endl;
    startTime = chrono::steady_clock::now();
    char* data = new char[SEARCHITEMSIZE*STRINGSIZE];
    for (int i = 0; i < SEARCHITEMSIZE; i++)
    {
        int dataIndex = i*STRINGSIZE;
        // length in [20, STRINGSIZE-2], so the terminator stays inside the slot
        const int length = rand() % (STRINGSIZE-21) + 20;
        for (int k = 0; k < length; k++)
        {
            data[dataIndex] = 'a';
            dataIndex++;
        }
        data[dataIndex] = '\0';
    }
    endTime = chrono::steady_clock::now();
    cout << "After Search Data Init: "
         << chrono::duration_cast<millisecs_t>(endTime - startTime).count() << "ms" <<endl;
    //////////Search Data Init/////////////////

    //////////Search Keyword Init/////////////////
    cout << "Before Search Keyword Init" << endl;
    startTime = chrono::steady_clock::now();
    char* keyword = new char[SEARCHTERMSIZE*STRINGSIZE];
    for (int i = 0; i < SEARCHTERMSIZE; i++)
    {
        int keywordIndex = i*STRINGSIZE;
        const int length = rand() % (STRINGSIZE - 21) + 20;
        for (int k = 0; k < length; k++)
        {
            keyword[keywordIndex] = 'a';
            keywordIndex++;
        }
        keyword[keywordIndex] = '\0';
    }
    endTime = chrono::steady_clock::now();
    cout << "After Search Keyword Init: "
         << chrono::duration_cast<millisecs_t>(endTime - startTime).count()  << "ms" << endl;
    //////////Search Keyword Init/////////////////

    bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result2 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result3 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
    bool* result4 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    char* d_data = 0;
    char* d_keyword = 0;
    bool* d_result = 0;

    /////////////////////////CudaMalloc/////////////////////////////////
    cout << "Before Malloc" << endl;
    startTime = chrono::steady_clock::now();

    checkCuda(cudaMalloc(&d_data, dataBytes), "cudaMalloc(d_data)");
    checkCuda(cudaMalloc(&d_keyword, keywordBytes), "cudaMalloc(d_keyword)");
    checkCuda(cudaMalloc(&d_result, resultBytes), "cudaMalloc(d_result)");

    endTime = chrono::steady_clock::now();
    cout << "After Malloc: "
         << chrono::duration_cast<millisecs_t>(endTime - startTime).count() << "ms" << endl;
    /////////////////////////CudaMalloc/////////////////////////////////

    /////////////////////////CudaMemCpy///////////////////////////////////
    {
        cudaEvent_t start, stop;
        float elapsedTime = 0.0f;

        cout << "Before Memcpy" << endl;
        checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
        checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
        checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

        checkCuda(cudaMemcpy(d_data, data, dataBytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(data, HostToDevice)");
        checkCuda(cudaMemcpy(d_keyword, keyword, keywordBytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(keyword, HostToDevice)");

        checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
        checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
        checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
        checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
        checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
        cout << "After Memcpy: " << elapsedTime << "ms" << endl;
    }
    /////////////////////////CudaMemCpy///////////////////////////////////

    ////////////////////////Kernels//////////////////////////////////////////
    // Launch configurations are unchanged: 4x32, 10x1024, 1x1 and 977x1024.
    runTimedKernel("Kernel", searchKeywordKernel, (SEARCHTERMSIZE/32)+1, 32,
                   d_result, d_data, d_keyword, result);
    runTimedKernel("Kernel2", searchKeywordKernel2, (SEARCHITEMSIZE/1024)+1, 1024,
                   d_result, d_data, d_keyword, result2);
    runTimedKernel("Kernel3", searchKeywordKernel3, 1, 1,
                   d_result, d_data, d_keyword, result3);
    runTimedKernel("Kernel4", searchKeywordKernel4, ((SEARCHITEMSIZE*SEARCHTERMSIZE)/1024)+1, 1024,
                   d_result, d_data, d_keyword, result4);
    ////////////////////////Kernels//////////////////////////////////////////

    /////////////////////////////////// CPU code //////////////////////////////////////////
    bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];

    int dataCharIndex = 0;
    int keywordCharIndex = 0;
    int nonParallelResultIndex = 0;

    cout << "CPU code starts" << endl;
    startTime = chrono::steady_clock::now();
    for (int i = 0; i < SEARCHTERMSIZE;i++)
    {
        for (int j = 0; j < SEARCHITEMSIZE; j++)
        {
            keywordCharIndex = i*STRINGSIZE;
            dataCharIndex = j*STRINGSIZE;
            cpuResult[nonParallelResultIndex] = true;
            while (keyword[keywordCharIndex] != '\0')
            {
                if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                {
                    cpuResult[nonParallelResultIndex] = false;
                    break;
                }
                keywordCharIndex++;
                dataCharIndex++;
            }
            nonParallelResultIndex++;
        }
    }
    endTime = chrono::steady_clock::now();
    cout << "CPU code ends: "
         << chrono::duration_cast<millisecs_t>(endTime - startTime).count() << "ms" << endl;
    /////////////////////////////////// CPU code //////////////////////////////////////////

    ////////////////////////////////////Result Comparison////////////////////////////////////////
    bool kernel1Res = true;
    bool kernel2Res = true;
    bool kernel3Res = true;
    bool kernel4Res = true;

    for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
    {
        if (cpuResult[i] != result[i])
            kernel1Res = false;
        if (cpuResult[i] != result2[i])
            kernel2Res = false;
        if (cpuResult[i] != result3[i])
            kernel3Res = false;
        if (cpuResult[i] != result4[i])
            kernel4Res = false;
        // nothing left to learn once every kernel has already failed
        if (!kernel1Res && !kernel2Res && !kernel3Res && !kernel4Res)
            break;
    }
    ////////////////////////////////////Result Comparison////////////////////////////////////////

    cout << boolalpha << "Kernel1 computation: " << kernel1Res << endl;
    cout << boolalpha << "Kernel2 computation: " << kernel2Res << endl;
    cout << boolalpha << "Kernel3 computation: " << kernel3Res << endl;
    cout << boolalpha << "Kernel4 computation: " << kernel4Res << endl;

    cout << "Before Deleting arrays" << endl;
    delete[] data;
    delete[] keyword;
    delete[] result;
    delete[] result2;
    delete[] result3;
    delete[] result4;
    delete[] cpuResult;
    cout << "After Deleting arrays" << endl;

    cout << "Before Freeing device memory" << endl;
    checkCuda(cudaFree(d_data), "cudaFree(d_data)");
    checkCuda(cudaFree(d_keyword), "cudaFree(d_keyword)");
    checkCuda(cudaFree(d_result), "cudaFree(d_result)");
    cout << "After Freeing device memory" << endl;

    cudaDeviceReset();
    system("pause");
    return 0;
}

提前谢谢。

1 个答案:

答案 0 :(得分:2)

您的代码似乎功能正确 - 这是程序员的第一份工作。那么如何让它运行得更快呢?

CUDA程序员应该注意性能的前两个概念是:

  1. 您需要大量的线程。通常我们需要10,000个或更多线程,而且拥有更多线程通常不会带来明显的损失。对大量线程的需求源于机器体系结构——GPU是一台隐藏延迟的机器,它通过拥有可以即时切换的大量工作来隐藏延迟。在这种情况下,“工作”可以粗略地理解为“线程”。

  2. 您希望高效地利用内存系统。这可能涉及许多不同的想法,但我们首先要关注的是对全局内存的合并(coalesced)访问。(您没有在任何内核中使用共享内存,但如果使用,我们也希望对共享内存的访问不产生存储体冲突(bank conflict)。)我们还希望高效地使用数据;最后,与任何计算机优化一样,我们希望利用内存层次结构寻找数据重用的机会,并将这些数据项移动到内存层次结构中“更高”的级别。

  3. 那么这对您的代码意味着什么?如果您想写一个“快速”的内核,就需要大量线程,并且要让全局内存加载达到100%合并。因此,内核1和3中的策略看起来不太好——它们根本没有启动足够的线程。内核2更好,但内核4中的策略可能更好——它允许我们启动100 * 10000个线程,这符合我们对“大量”的定义。因此,让我们继续采用这样的线程策略:每个线程负责生成result数组的一个元素(因为共有100 * 10000个结果)。

    现在,关于合并访问,这归结为数据组织。相邻线程如何访问数据?它是连续的吗?在你的kernel4的情况下,它不是。相邻的线程正在从data读取,其间隙非常大,因为您遍历正在执行工作的while循环。

    要解决此问题,我们可以转置我们的数据。我选择使用数据重用优化:

    1. 指定每个线程块处理data的一个元素(字符串)。
    2. 指定线程块中的每个线程处理与步骤1中data项相关联的一个result元素。
    3. 由于每个线程块只处理data的一个元素(字符串),我们可以将该元素(字符串)移动到共享内存中,这样每个线程块只需读取它一次,之后每个线程都从共享内存中取得所需的值。这意味着data中的每个字符串只从全局内存中读取一次,这是最优的。
    4. 由于步骤3中的优化选择,我们无需转置data即可实现最佳的合并加载。但我们仍然需要转置keyword中的字符串,因为每个线程都要通过全局加载来读取它。这里我们受益于整个keyword数组较小——大约25K字节,它可以放入GPU的L1缓存(如果可用),或者肯定可以放入L2。

    根据我的测试,通过上述策略和选择,我能够写出比CPU代码快约5倍的内核。由于这个内核很可能主要受内存带宽限制,其性能大概已接近可达到的水平。下面是一个功能完整的示例,它在您的代码基础上添加了第5个内核;该内核源自您的第4个内核,但使用keyword数组的转置形式:

      $ cat t703.cu
      #include <stdio.h>
      #include <iostream>
      #include <chrono>
      
      #define SEARCHTERMSIZE 100
      #define SEARCHITEMSIZE 10000
      #define STRINGSIZE 250
      
      using namespace std;
      
      // Kernel 1: one thread per keyword.
      //
      // Each thread with a global index below SEARCHTERMSIZE takes the keyword
      // at that index and tests it against every one of the SEARCHITEMSIZE
      // data strings, writing result[keyword * SEARCHITEMSIZE + dataString] =
      // true when the data string starts with the keyword, false otherwise.
      //
      // Layout read by the code: keyword and data are arrays of NUL-terminated
      // strings placed at a fixed stride of STRINGSIZE chars.
      // Launch: 1D grid with at least SEARCHTERMSIZE threads in total; surplus
      // threads exit immediately.
      __global__ void searchKeywordKernel(bool* result, char* data, char* keyword)
      {
          const int keywordStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
          if (keywordStringIndex >= SEARCHTERMSIZE)
              return;

          // Loop-invariant: this thread's keyword and its row of the result matrix.
          const char* const myKeyword = keyword + keywordStringIndex * STRINGSIZE;
          bool* const myResults = result + keywordStringIndex * SEARCHITEMSIZE;

          for (int dataStringIndex = 0; dataStringIndex < SEARCHITEMSIZE; ++dataStringIndex)
          {
              const char* const candidate = data + dataStringIndex * STRINGSIZE;

              // Accumulate the verdict in a register and store it once, instead
              // of storing true to global memory and then overwriting it.
              bool isPrefix = true;
              for (int i = 0; myKeyword[i] != '\0'; ++i)
              {
                  // myKeyword[i] is never '\0' here, so a data string that
                  // ends early is rejected by this same inequality.
                  if (myKeyword[i] != candidate[i])
                  {
                      isPrefix = false;
                      break;
                  }
              }
              myResults[dataStringIndex] = isPrefix;
          }
      }
      // Kernel 2: one thread per data string.
      //
      // Each thread with a global index below SEARCHITEMSIZE takes the data
      // string at that index and tests every one of the SEARCHTERMSIZE
      // keywords against it, writing
      // result[keyword * SEARCHITEMSIZE + dataString] = true when the data
      // string starts with the keyword, false otherwise.
      //
      // Layout read by the code: keyword and data are arrays of NUL-terminated
      // strings placed at a fixed stride of STRINGSIZE chars.
      // Launch: 1D grid with at least SEARCHITEMSIZE threads in total; surplus
      // threads exit immediately.
      __global__ void searchKeywordKernel2(bool* result, char* data, char* keyword)
      {
          const int dataStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
          if (dataStringIndex >= SEARCHITEMSIZE)
              return;

          // Loop-invariant: the one data string this thread is responsible for.
          const char* const myData = data + dataStringIndex * STRINGSIZE;

          for (int keywordStringIndex = 0; keywordStringIndex < SEARCHTERMSIZE; ++keywordStringIndex)
          {
              const char* const candidate = keyword + keywordStringIndex * STRINGSIZE;

              // Keep the verdict in a register; a single global store per result.
              bool isPrefix = true;
              for (int i = 0; candidate[i] != '\0'; ++i)
              {
                  // candidate[i] is never '\0' here, so a data string that
                  // ends early is rejected by this same inequality.
                  if (candidate[i] != myData[i])
                  {
                      isPrefix = false;
                      break;
                  }
              }
              result[keywordStringIndex * SEARCHITEMSIZE + dataStringIndex] = isPrefix;
          }
      }
      // Kernel 3: serial reference on the device.
      //
      // Only the thread with global index 0 does any work; it performs all
      // SEARCHTERMSIZE * SEARCHITEMSIZE prefix comparisons one after another
      // and writes result[keyword * SEARCHITEMSIZE + dataString].
      //
      // Fix: the data-string index is now restarted at 0 for every keyword.
      // The previous version declared it once outside both loops, so after
      // the first keyword the inner loop never executed again and only the
      // first SEARCHITEMSIZE entries of result were written.
      // NOTE(review): with all comparisons actually performed, expect this
      // kernel to run far longer than before; confirm it fits any display
      // watchdog limit.
      __global__ void searchKeywordKernel3(bool* result, char* data, char* keyword)
      {
          if (threadIdx.x + blockIdx.x * blockDim.x >= 1)
              return;

          for (int keywordStringIndex = 0; keywordStringIndex < SEARCHTERMSIZE; ++keywordStringIndex)
          {
              const char* const currentKeyword = keyword + keywordStringIndex * STRINGSIZE;

              // dataStringIndex must restart for each keyword.
              for (int dataStringIndex = 0; dataStringIndex < SEARCHITEMSIZE; ++dataStringIndex)
              {
                  const char* const candidate = data + dataStringIndex * STRINGSIZE;

                  bool isPrefix = true;
                  for (int i = 0; currentKeyword[i] != '\0'; ++i)
                  {
                      // currentKeyword[i] is never '\0' here, so a data string
                      // that ends early is rejected by this same inequality.
                      if (currentKeyword[i] != candidate[i])
                      {
                          isPrefix = false;
                          break;
                      }
                  }
                  // Same row-major slot the running counter of the original visited.
                  result[keywordStringIndex * SEARCHITEMSIZE + dataStringIndex] = isPrefix;
              }
          }
      }
      // Kernel 4: one thread per (keyword, data string) pair.
      //
      // The global thread index id is split into keyword = id / SEARCHITEMSIZE
      // and dataString = id % SEARCHITEMSIZE; the thread writes result[id] =
      // true when that data string starts with that keyword, false otherwise.
      //
      // Layout read by the code: keyword and data are arrays of NUL-terminated
      // strings placed at a fixed stride of STRINGSIZE chars.
      // Launch: 1D grid with at least SEARCHTERMSIZE * SEARCHITEMSIZE threads
      // in total; surplus threads exit immediately.
      __global__ void searchKeywordKernel4(bool* result, char* data, char* keyword)
      {
          const int id = threadIdx.x + blockIdx.x * blockDim.x;
          if (id >= SEARCHTERMSIZE * SEARCHITEMSIZE)
              return;

          const int keywordStringIndex = id / SEARCHITEMSIZE;
          const int dataStringIndex = id % SEARCHITEMSIZE;

          const char* const myKeyword = keyword + keywordStringIndex * STRINGSIZE;
          const char* const myData = data + dataStringIndex * STRINGSIZE;

          // Keep the verdict in a register; a single global store per result.
          bool isPrefix = true;
          for (int i = 0; myKeyword[i] != '\0'; ++i)
          {
              // myKeyword[i] is never '\0' here, so a data string that ends
              // early is rejected by this same inequality.
              if (myKeyword[i] != myData[i])
              {
                  isPrefix = false;
                  break;
              }
          }
          // keywordStringIndex * SEARCHITEMSIZE + dataStringIndex == id
          result[id] = isPrefix;
      }
      
      // Kernel 5: a modification of kernel 4 that expects the keyword array in
      // transposed form, i.e. character i of keyword k at keyword[i*SEARCHTERMSIZE + k].
      //
      // Launch: one block per data string (blockIdx.x selects the string) and
      // at least SEARCHTERMSIZE threads per block (threadIdx.x selects the
      // keyword). Uses STRINGSIZE bytes of static shared memory per block.
      // Output: result[keyword * SEARCHITEMSIZE + dataString] = true when the
      // data string starts with the keyword, false otherwise.
      __global__ void searchKeywordKernel5(bool* result, const char  * __restrict__ data,  const char * keyword)
      {
          const int bid = blockIdx.x;
          const int tid = threadIdx.x;
          __shared__ char sdata[STRINGSIZE];

          // blockIdx.x is the same for every thread of a block, so either the
          // whole block leaves here or the whole block reaches the barrier below.
          if (bid >= SEARCHITEMSIZE)
              return;

          // Stage this block's data string in shared memory; consecutive
          // threads read consecutive bytes (coalesced global load).
          for (int c = tid; c < STRINGSIZE; c += blockDim.x)
              sdata[c] = data[bid*STRINGSIZE + c];
          __syncthreads();

          if (tid >= SEARCHTERMSIZE)
              return;

          // Keep the verdict in a register so the uncoalesced store to result
          // is issued once per thread instead of up to twice.
          bool isPrefix = true;
          int i = 0;
          char test = keyword[tid];              // coalesced global load
          while (test != '\0')
          {
              // Every thread reads the same sdata[i] (shared memory broadcast).
              // test is never '\0' here, so a data string that ends early is
              // rejected by this same inequality.
              if (test != sdata[i])
              {
                  isPrefix = false;
                  break;
              }
              i++;
              test = keyword[i*SEARCHTERMSIZE+tid];  // coalesced global load
          }
          // uncoalesced store - could be improved by reorganizing result
          result[tid*SEARCHITEMSIZE + bid] = isPrefix;
      }
      
      
      int main()
      {
          chrono::steady_clock::time_point startTime;
          chrono::steady_clock::time_point endTime;
          typedef chrono::duration<int, milli> millisecs_t;
      
          //////////Search Data Init/////////////////
          cout << "Before Search Data Init" << endl;
          startTime = chrono::steady_clock::now();
          char* data = new char[SEARCHITEMSIZE*STRINGSIZE];
          int temp = 0;
          int dataIndex = 0;
          for (int i = 0; i < SEARCHITEMSIZE; i++)
          {
              dataIndex = i*STRINGSIZE;
              temp = rand() % (STRINGSIZE-21) + 20;
              for (int k = 0; k < temp; k++)
              {           
                  data[dataIndex] = 'a';
                  dataIndex++;
              }
              data[dataIndex] = '\0';
          }           
          endTime = chrono::steady_clock::now();
          millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
          cout << "After Search Data Init: " << duration.count() << "ms" <<endl;
          //////////Search Data Init/////////////////
      
          //////////Search Keyword Init/////////////////
          cout << "Before Search Keyword Init" << endl;
          startTime = chrono::steady_clock::now();
          char* keyword = new char[SEARCHTERMSIZE*STRINGSIZE];
          int keywordIndex = 0;
          for (int i = 0; i < SEARCHTERMSIZE; i++)
          {
              keywordIndex = i*STRINGSIZE;
              temp = rand() % (STRINGSIZE - 21) + 20;
              for (int k = 0; k < temp; k++)
              {
                  keyword[keywordIndex] = 'a';
                  keywordIndex++;
              }
              keyword[keywordIndex] = '\0';
              keywordIndex++;
          }   
          endTime = chrono::steady_clock::now();
          millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
          cout << "After Search Keyword Init: " << duration1.count()  << "ms" << endl;
          //////////Search Keyword Init/////////////////  
      
          bool* result  = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
          bool* result2 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
          bool* result3 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
          bool* result4 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
          bool* result5 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
      
          char* d_data;
          char* d_keyword;
          char* d_keyword_T;
          bool* d_result;
      
          /////////////////////////CudaMalloc/////////////////////////////////
          cout << "Before Malloc" << endl;
          startTime = chrono::steady_clock::now();
      
          cudaMalloc(&d_data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE);
          cudaMalloc(&d_keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE);
          cudaMalloc(&d_keyword_T, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE);
          cudaMalloc(&d_result, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
      
          endTime = chrono::steady_clock::now();
          millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
          cout << "After Malloc: " << duration2.count() << "ms" << endl;
          /////////////////////////CudaMalloc/////////////////////////////////
      
          cudaEvent_t start, stop;
          float elapsedTime;
      
          /////////////////////////CudaMemCpy///////////////////////////////////
          cout << "Before Memcpy" << endl;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          cudaEventRecord(start, 0);
      
          cudaMemcpy(d_data, data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
          cudaMemcpy(d_keyword, keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
      
          //transpose keywords
          char* keyword_T = new char[SEARCHTERMSIZE*STRINGSIZE];
          for (int i = 0; i < SEARCHTERMSIZE; i++)
            for (int j = 0; j < STRINGSIZE; j++)
              keyword_T[j*SEARCHTERMSIZE+i] = keyword[i*STRINGSIZE+j];
      
          cudaMemcpy(d_keyword_T, keyword_T, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
      
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cout << "After Memcpy: " << elapsedTime << "ms" << endl;
          /////////////////////////CudaMemCpy///////////////////////////////////
      
      
      
          ////////////////////////Kernel//////////////////////////////////////////
          cout << "Before Kernel" << endl;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          cudaEventRecord(start, 0);
      
          searchKeywordKernel <<<(SEARCHTERMSIZE/32)+1, 32 >>>(d_result, d_data, d_keyword);
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cout << "After Kernel: " << elapsedTime << "ms" << endl;
          ////////////////////////Kernel//////////////////////////////////////////
      
          cudaMemcpy(result, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
      
          ////////////////////////Kernel2//////////////////////////////////////////
          cout << "Before Kernel2" << endl;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          cudaEventRecord(start, 0);
      
          searchKeywordKernel2 << < (SEARCHITEMSIZE/1024) +1 , 1024 >> >(d_result, d_data, d_keyword);
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cout << "After Kernel2: " << elapsedTime << "ms" << endl;
          ////////////////////////Kernel2//////////////////////////////////////////
      
          cudaMemcpy(result2, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
      
          ////////////////////////Kernel3//////////////////////////////////////////
          cout << "Before Kernel3" << endl;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          cudaEventRecord(start, 0);
      
          searchKeywordKernel3 << <1, 1 >> >(d_result, d_data, d_keyword);
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cout << "After Kernel3: " << elapsedTime << "ms" << endl;
          ////////////////////////Kernel3//////////////////////////////////////////
      
          cudaMemcpy(result3, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
      
          ////////////////////////Kernel4//////////////////////////////////////////
          cout << "Before Kernel4" << endl;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          cudaEventRecord(start, 0);
      
          searchKeywordKernel4 << <((SEARCHITEMSIZE*SEARCHTERMSIZE)/1024)+1, 1024 >> >(d_result, d_data, d_keyword);
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cout << "After Kernel4: " << elapsedTime << "ms" << endl;
          ////////////////////////Kernel4//////////////////////////////////////////
      
          cudaMemcpy(result4, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
      
      
          cudaFuncSetCacheConfig(searchKeywordKernel5, cudaFuncCachePreferL1);
      
          ////////////////////////Kernel5//////////////////////////////////////////
          cout << "Before Kernel5" << endl;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          cudaEventRecord(start, 0);
      
          searchKeywordKernel5 << <SEARCHITEMSIZE, SEARCHTERMSIZE >> >(d_result, d_data, d_keyword_T);
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&elapsedTime, start, stop);
          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cout << "After Kernel5: " << elapsedTime << "ms" << endl;
          ////////////////////////Kernel5//////////////////////////////////////////
      
          cudaMemcpy(result5, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
      
          /*
          for (int i = 0; i < 10; i++)
          {
              for (int j = 0; j < 10; j++)
                  cout << boolalpha << i << " vs " << j << ": " << result4[i*SEARCHITEMSIZE + j] << endl;
              cout << "*****************************************" << endl;
          }
          */
          /////////////////////////////////// CPU code //////////////////////////////////////////
      
          bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
      
          int dataCharIndex = 0;
          int keywordCharIndex = 0;
          int nonParallelResultIndex = 0;
      
          cout << "CPU code starts" << endl;
          startTime = chrono::steady_clock::now();
          for (int i = 0; i < SEARCHTERMSIZE;i++)
          {   
              for (int j = 0; j < SEARCHITEMSIZE; j++)
              {
                  keywordCharIndex = i*STRINGSIZE;
                  dataCharIndex = j*STRINGSIZE;
                  cpuResult[nonParallelResultIndex] = true;
                  while (keyword[keywordCharIndex] != '\0')
                  {
                      if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
                      {
                          cpuResult[nonParallelResultIndex] = false;
                          break;
                      }
                      keywordCharIndex++;
                      dataCharIndex++;
                  }
                  nonParallelResultIndex++;
              }
          }
          endTime = chrono::steady_clock::now();
          millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
          cout << "CPU code ends: " << duration3.count() << "ms" << endl;
          /////////////////////////////////// CPU code //////////////////////////////////////////
          /*
          for (int i = 0; i < 10; i++)
          {
              for (int j = 0; j < 10; j++)
                  cout << boolalpha << i << " vs " << j << ": " << nonParallelResult[i*SEARCHITEMSIZE+j] << endl;
              cout << "*****************************************" << endl;
          }   
          */
          ////////////////////////////////////Result Comparison////////////////////////////////////////
          bool kernel1Res, kernel2Res, kernel3Res, kernel4Res, kernel5Res;
      
          kernel1Res = true;
          kernel2Res = true;
          kernel3Res = true;
          kernel4Res = true;
          kernel5Res = true;
      
          for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
          {
              if (cpuResult[i] != result[i] && kernel1Res)
                  kernel1Res = false;
              if (cpuResult[i] != result2[i] && kernel2Res)
                  kernel2Res = false;
              if (cpuResult[i] != result3[i] && kernel3Res)
                  kernel3Res = false;
              if (cpuResult[i] != result4[i] && kernel4Res)
                  kernel4Res = false;
              if (cpuResult[i] != result5[i] && kernel5Res)
                  kernel5Res = false;
              if (!kernel1Res && !kernel2Res && !kernel3Res && !kernel4Res && !kernel5Res)
                  break;      
          }
          ////////////////////////////////////Result Comparison////////////////////////////////////////
      
          cout << boolalpha << "Kernel1 computation: " << kernel1Res << endl;
          cout << boolalpha << "Kernel2 computation: " << kernel2Res << endl;
          cout << boolalpha << "Kernel3 computation: " << kernel3Res << endl;
          cout << boolalpha << "Kernel4 computation: " << kernel4Res << endl;
          cout << boolalpha << "Kernel5 computation: " << kernel5Res << endl;
      
          cout << "Before Deleting arrays" << endl;
          delete[] data;
          delete[] keyword;
          delete[] result;
          delete[] result2;
          delete[] result3;
          delete[] result4;
          delete[] cpuResult;
          cout << "After Deleting arrays" << endl;
      
          cout << "Before Freeing device memory" << endl;
          cudaFree(d_data);
          cudaFree(d_keyword);    
          cudaFree(d_result);
          cout << "After Freeing device memory" << endl;
      
          cudaDeviceReset();
          return 0;
      }
      
      $ nvcc -O3 -std=c++11 -o t703 t703.cu
      $ ./t703
      Before Search Data Init
      After Search Data Init: 0ms
      Before Search Keyword Init
      After Search Keyword Init: 0ms
      Before Malloc
      After Malloc: 38ms
      Before Memcpy
      After Memcpy: 1.09805ms
      Before Kernel
      After Kernel: 1455.98ms
      Before Kernel2
      After Kernel2: 110.16ms
      Before Kernel3
      After Kernel3: 363.236ms
      Before Kernel4
      After Kernel4: 96.9751ms
      Before Kernel5
      After Kernel5: 10.9064ms
      CPU code starts
      CPU code ends: 76ms
      Kernel1 computation: true
      Kernel2 computation: true
      Kernel3 computation: true
      Kernel4 computation: true
      Kernel5 computation: true
      Before Deleting arrays
      After Deleting arrays
      Before Freeing device memory
      After Freeing device memory
      $
      

      一些注意事项:

      1. 您对CUDA事件的用法有些不正确。您应该在计时区域之外创建CUDA事件。此外,如果您打算重新创建某个事件,则应先将其销毁。您将在我的代码中看到这些更改。
      2. 以上结果来自一台Fedora 20 Linux系统,配置为CUDA 7、四核Xeon处理器和Quadro5000 GPU。您系统上的耗时数值会有所不同(不过我预计我的内核仍然会比您的CPU代码更快!)
      3. 要了解有关GPU代码优化的更多信息,GTC和GTC-Express上有许多优秀的演讲资料,这是其中之一(原文此处为链接)。
      4. 正如您所发现的,使用-G(调试)开关编译CUDA代码(Visual Studio在Debug配置的CUDA项目中就是这样做的)可能会对代码性能产生重大影响。在对CUDA代码进行性能基准测试或性能分析时,都不应使用-G开关。