CUDA二分搜索的实现_Cuda_Nvidia_Binary Search

CUDA二分搜索的实现

cuda

CUDA二进制搜索的实现,cuda,nvidia,binary-search,Cuda,Nvidia,Binary Search,我试图加快CPU二进制搜索的速度。不幸的是，GPU版本总是比CPU版本慢得多。也许这个问题不适合GPU，或者我做错了什么 CPU版本（约0.6ms）：使用长度为2000的排序数组并对特定值进行二进制搜索 ... Lookup ( search[j], search_array, array_length, m ); ... int Lookup ( int search, int* arr, int length, int& m ) { int l(0), r(len

我试图用GPU来加速原本在CPU上运行的二分搜索。不幸的是，GPU版本总是比CPU版本慢得多。也许这个问题不适合GPU，或者是我哪里做错了？

CPU版本（约0.6ms）：在长度为2000的已排序数组中对特定值进行二分搜索

...
Lookup ( search[j], search_array, array_length, m );
...
/**
 * Host-side binary search for `search` in arr[0 .. length).
 *
 * @param search  Value to look for.
 * @param arr     Array to search; the comparisons below assume it is sorted
 *                in ascending order.
 * @param length  Number of elements in arr.
 * @param m       Out-parameter: receives the last midpoint that was probed.
 *                It is left untouched when length <= 0, because no element
 *                is probed in that case.
 * @return index[m] on an exact hit; otherwise the position at which `search`
 *         would have to be inserted to keep an ascending arr sorted (a value
 *         in [0, length]).
 */
int Lookup ( int search, int* arr, int length, int& m )
{
   // Empty range: nothing may be dereferenced. Without this guard the loop is
   // skipped and arr[m] is read with whatever value the caller left in m.
   if ( length <= 0 )
      return 0;

   int l(0), r(length-1);
   while ( l <= r )
   {
      // Same value as (l+r)/2 for 0 <= l <= r, but the intermediate result
      // cannot overflow int when the array is very large.
      m = l + (r-l)/2;
      if ( search < arr[m] )
         r = m-1;
      else if ( search > arr[m] )
         l = m+1;
      else
      {
         // NOTE(review): `index` is not declared anywhere in this snippet.
         // It is presumably a lookup table defined elsewhere; if it is not,
         // this was probably meant to be `return m;` -- confirm with the author.
         return index[m];
      }
   }
   // Not found: arr[m] is the last element probed and differs from `search`.
   // If it is larger, `search` belongs at m; otherwise just after it.
   if ( arr[m] >= search )
      return m;
   return (m+1);
}

....
p_ary_search<<<16, 64>>>(search[j], array_length, dev_arr, dev_ret_val);
....

// p-ary search kernel: looks for `search` in arr[0 .. array_length), with every
// thread of the launch probing one position per round.
//
//   search        value to find
//   array_length  number of elements in arr
//   arr           array to search; the bracketing test in the loop only makes
//                 sense for ascending data -- presumably sorted, verify against caller
//   ret_val       two-int buffer read and written by all threads:
//                   ret_val[0] = an index whose element equals `search`, or -1
//                   ret_val[1] = offset at which the next round starts
//
// get_index_to_check() is defined elsewhere; from its use here it maps
// (thread id, thread count, set size, offset) to an array index.
//
// NOTE(review): __syncthreads() is a barrier for the threads of ONE block only,
// while the launch shown with this kernel uses 16 blocks. Reads and writes of
// ret_val[1] are therefore not ordered between blocks, so a block can pick up a
// stale or already-advanced offset.
// NOTE(review): every thread performs the two initialising stores below. A block
// that starts late can overwrite a hit or an offset already stored by a block
// that is further along.
// NOTE(review): if the launch has exactly one thread, set_size / num_threads
// never changes set_size, so the loop does not terminate for a non-empty array.
__global__ void p_ary_search(int search, int array_length, int *arr, int *ret_val ) 
{
   // Thread count and flat thread id of a 1-D launch.
   const int num_threads = blockDim.x * gridDim.x;
   const int thread = blockIdx.x * blockDim.x + threadIdx.x;
   // Size of the range still under consideration; divided by num_threads each round.
   int set_size = array_length;

   ret_val[0] = -1; // return value
   ret_val[1] = 0;  // offset

   while(set_size != 0)
   {
      // Get the offset of the array, initially set to 0
      int offset = ret_val[1];

      // Barrier between reading the offset above and any thread of this block
      // overwriting it below. (Original author's note: not needed for the unit
      // tests to pass, but kept on purpose.)
      __syncthreads();

      // Get the next index to check
      int index_to_check = get_index_to_check(thread, num_threads, set_size, offset);

      // If the index is outside the bounds of the array then lets not check it
      // (only the upper bound is tested here)
      if (index_to_check < array_length)
      {
         // Index probed by the next thread; if it falls outside the array,
         // clamp it to the last valid element
         int next_index_to_check = get_index_to_check(thread + 1, num_threads, set_size, offset);
         if (next_index_to_check >= array_length)
         {
            next_index_to_check = array_length - 1;
         }

         // `search` lies strictly between this thread's element and the next
         // thread's element: the next round starts from this index
         if (search > arr[index_to_check] && (search < arr[next_index_to_check])) 
         {
            ret_val[1] = index_to_check;
         }
         else if (search == arr[index_to_check]) 
         {
            // Set the return var if we hit it
            ret_val[0] = index_to_check;
         }
      }

      // Since this is a p-ary search divide by our total threads to get the next set size
      set_size = set_size / num_threads;

      // Sync up so no threads (of this block) jump ahead and get a bad offset
      __syncthreads();
   }
}

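...
Lookup ( search[j], search_array, array_length, m );
...
int Lookup ( int search, int* arr, int length, int& m )
{
   int l(0), r(length-1);
   while ( l <= r )
   {
      m = (l+r)/2;
      if ( search < arr[m] )
         r = m-1;
      else if ( search > arr[m] )
         l = m+1;
      else
      {
         return index[m];
      }
   }
   if ( arr[m] >= search )
      return m;
   return (m+1);
}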

GPU版本（约20ms）：在长度为2000的已排序数组中对特定值进行二分搜索

...
Lookup ( search[j], search_array, array_length, m );
...
/**
 * Host-side binary search for `search` in arr[0 .. length).
 *
 * @param search  Value to look for.
 * @param arr     Array to search; the comparisons below assume it is sorted
 *                in ascending order.
 * @param length  Number of elements in arr.
 * @param m       Out-parameter: receives the last midpoint that was probed.
 *                It is left untouched when length <= 0, because no element
 *                is probed in that case.
 * @return index[m] on an exact hit; otherwise the position at which `search`
 *         would have to be inserted to keep an ascending arr sorted (a value
 *         in [0, length]).
 */
int Lookup ( int search, int* arr, int length, int& m )
{
   // Empty range: nothing may be dereferenced. Without this guard the loop is
   // skipped and arr[m] is read with whatever value the caller left in m.
   if ( length <= 0 )
      return 0;

   int l(0), r(length-1);
   while ( l <= r )
   {
      // Same value as (l+r)/2 for 0 <= l <= r, but the intermediate result
      // cannot overflow int when the array is very large.
      m = l + (r-l)/2;
      if ( search < arr[m] )
         r = m-1;
      else if ( search > arr[m] )
         l = m+1;
      else
      {
         // NOTE(review): `index` is not declared anywhere in this snippet.
         // It is presumably a lookup table defined elsewhere; if it is not,
         // this was probably meant to be `return m;` -- confirm with the author.
         return index[m];
      }
   }
   // Not found: arr[m] is the last element probed and differs from `search`.
   // If it is larger, `search` belongs at m; otherwise just after it.
   if ( arr[m] >= search )
      return m;
   return (m+1);
}

....
p_ary_search<<<16, 64>>>(search[j], array_length, dev_arr, dev_ret_val);
....

// p-ary search kernel: looks for `search` in arr[0 .. array_length), with every
// thread of the launch probing one position per round.
//
//   search        value to find
//   array_length  number of elements in arr
//   arr           array to search; the bracketing test in the loop only makes
//                 sense for ascending data -- presumably sorted, verify against caller
//   ret_val       two-int buffer read and written by all threads:
//                   ret_val[0] = an index whose element equals `search`, or -1
//                   ret_val[1] = offset at which the next round starts
//
// get_index_to_check() is defined elsewhere; from its use here it maps
// (thread id, thread count, set size, offset) to an array index.
//
// NOTE(review): __syncthreads() is a barrier for the threads of ONE block only,
// while the launch shown with this kernel uses 16 blocks. Reads and writes of
// ret_val[1] are therefore not ordered between blocks, so a block can pick up a
// stale or already-advanced offset.
// NOTE(review): every thread performs the two initialising stores below. A block
// that starts late can overwrite a hit or an offset already stored by a block
// that is further along.
// NOTE(review): if the launch has exactly one thread, set_size / num_threads
// never changes set_size, so the loop does not terminate for a non-empty array.
__global__ void p_ary_search(int search, int array_length, int *arr, int *ret_val ) 
{
   // Thread count and flat thread id of a 1-D launch.
   const int num_threads = blockDim.x * gridDim.x;
   const int thread = blockIdx.x * blockDim.x + threadIdx.x;
   // Size of the range still under consideration; divided by num_threads each round.
   int set_size = array_length;

   ret_val[0] = -1; // return value
   ret_val[1] = 0;  // offset

   while(set_size != 0)
   {
      // Get the offset of the array, initially set to 0
      int offset = ret_val[1];

      // Barrier between reading the offset above and any thread of this block
      // overwriting it below. (Original author's note: not needed for the unit
      // tests to pass, but kept on purpose.)
      __syncthreads();

      // Get the next index to check
      int index_to_check = get_index_to_check(thread, num_threads, set_size, offset);

      // If the index is outside the bounds of the array then lets not check it
      // (only the upper bound is tested here)
      if (index_to_check < array_length)
      {
         // Index probed by the next thread; if it falls outside the array,
         // clamp it to the last valid element
         int next_index_to_check = get_index_to_check(thread + 1, num_threads, set_size, offset);
         if (next_index_to_check >= array_length)
         {
            next_index_to_check = array_length - 1;
         }

         // `search` lies strictly between this thread's element and the next
         // thread's element: the next round starts from this index
         if (search > arr[index_to_check] && (search < arr[next_index_to_check])) 
         {
            ret_val[1] = index_to_check;
         }
         else if (search == arr[index_to_check]) 
         {
            // Set the return var if we hit it
            ret_val[0] = index_to_check;
         }
      }

      // Since this is a p-ary search divide by our total threads to get the next set size
      set_size = set_size / num_threads;

      // Sync up so no threads (of this block) jump ahead and get a bad offset
      __syncthreads();
   }
}

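....
p_ary_search<<<16, 64>>>(search[j], array_length, dev_arr, dev_ret_val);
....

__global__ void p_ary_search(int search, int array_length, int *arr, int *ret_val )
{
   const int num_threads = blockDim.x * gridDim.x;
   const int thread = blockIdx.x * blockDim.x + threadIdx.x;
   int set_size = array_length;

   ret_val[0] = -1; // return value
   ret_val[1] = 0;  // offset

   while(set_size != 0)
   {
      // Get the offset of the array, initially set to 0
      int offset = ret_val[1];

      // I think this is necessary in case a thread gets ahead, and resets offset before it's read
      // This isn't necessary for the unit tests to pass, but I still like it here
      __syncthreads();

      // Get the next index to check
      int index_to_check = get_index_to_check(thread, num_threads, set_size, offset);

      // If the index is outside the bounds of the array then lets not check it
      if (index_to_check < array_length)
      {
         // If the next index is outside the bounds of the array, then set it to maximum array size
         int next_index_to_check = get_index_to_check(thread + 1, num_threads, set_size, offset);
         if (next_index_to_check >= array_length)
         {
            next_index_to_check = array_length - 1;
         }

         // If we're at the mid section of the array reset the offset to this index
         if (search > arr[index_to_check] && (search < arr[next_index_to_check]))
         {
            ret_val[1] = index_to_check;
         }
         else if (search == arr[index_to_check])
         {
            // Set the return var if we hit it
            ret_val[0] = index_to_check;
         }
      }

      // Since this is a p-ary search divide by our total threads to get the next set size
      set_size = set_size / num_threads;

      // Sync up so no threads jump ahead and get a bad offset
      __syncthreads();
   }
}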


即使我换用更大的数组，两者的耗时比也没有改善。
 您的代码中分支发散太多，实际上等于让整个过程在GPU上串行执行。您应该重新划分工作，使同一个线程束（warp）中的所有线程在分支处走相同的路径。详见该手册第47页。
 我必须承认，我不完全确定您的内核在做什么，但我这样假设对吗：您只是在寻找一个满足搜索条件的索引？如果是这样，请看看CUDA自带的归约（reduction）示例，了解如何构造和优化这类查询。（您所做的本质上是把结果归约为最接近查询值的那个索引。）
不过有一些快速提示：
您对全局内存执行了大量读写操作，而全局内存非常慢。请尝试改用共享内存。
其次，请记住 __syncthreads() 只同步同一个线程块内的线程，因此对全局内存的读/写不一定会在所有线程之间同步（不过全局内存写入的延迟可能会让它看起来像是同步的）。
直接的二分搜索并不太适合在GPU上运行：它是一种无法并行化的串行操作。不过，您可以把数组分割成若干小块，再对每个小块分别进行二分搜索。创建X个块，用X个并行线程判断哪些块可能包含目标值；丢弃除唯一候选块以外的所有块，再对它继续细分。您也可以看看Thrust库的二分搜索。在我那个含2000个元素的数组中，我用CPU版本的二分搜索查找数字395，在我的电脑上只花了0.000933ms。为了测试，我写了一个什么都不做的内核：__global__ void Search() { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < 2000) {} }，调用它花了0.034704ms。从这个结果来看，我真的怀疑用CUDA来加速是否有意义。还是我做错了什么……事实就是这样，CUDA有一些固定开销，需要花费一定时间。但是，如果某项工作在CPU上要花10秒，而GPU能让它快10倍，那么即使有0.03秒的开销，您会选哪一个？CUDA当然可以用，但如果在CPU上已经非常快了，那可能就不值得了。谢谢。我会尝试把更多的工作从CPU转移到GPU上，希望这样能在一定程度上抵消现有的开销。我原以为只要没有CPU和GPU之间的内存复制，开销就已经很小了，但显然不是。我还会去看看Thrust的二分搜索。