Warning: file_get_contents(/data/phpspider/zhask/data//catemap/0/asp.net-core/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Cuda 内核中的bool变量是否需要同步_Cuda - Fatal编程技术网

Cuda 内核中的bool变量是否需要同步

Cuda 内核中的bool变量是否需要同步,cuda,Cuda,我有一个由for循环组成的内核,它在数组中搜索特定的int值。我使用256个线程的网格块来实现这一点。但是,当一个线程找到该值时,我想让其他线程知道退出。目前我使用的是布尔标志,但我不确定它是否正常工作。我关心的是同步 __device__ bool found; __global__ void search() { for(int i = threadIdx.x; i<1000000; i += stride) { if(found == true)

我有一个由for循环组成的内核,它在数组中搜索特定的int值。我使用256个线程的网格块来实现这一点。但是,当一个线程找到该值时,我想让其他线程知道退出。目前我使用的是布尔标志,但我不确定它是否正常工作。我关心的是同步

// Device-global flag shared by every thread of the search kernel: the kernel
// sets it to true when a thread finds the value, and the host clears it with
// cudaMemcpyToSymbol before launching.
// NOTE(review): the flag is not declared volatile here, so the compiler is
// presumably free to keep a stale copy in a register inside the search loop —
// confirm against the CUDA memory model before relying on early exit.
__device__ bool found;

// Linear search for the value x in arr over indices [0, 1000000).
// Each thread starts at its own threadIdx.x and advances by `stride`; a thread
// leaves the loop as soon as it observes the device flag `found`, or after it
// sets the flag itself on a match.
// NOTE(review): arr, x and stride are not declared in this snippet; they are
// assumed to be provided elsewhere (e.g. as __device__ globals) — confirm.
__global__
void search()
{
   for(int i = threadIdx.x; i<1000000; i += stride)
   {
        // Another thread has already reported a match: stop searching.
        if(found)
        {
            break;
        }
        // Compare with `==`. The previous `arr[i] = x` was an assignment: it
        // overwrote the array element and evaluated to true for any non-zero
        // x, so every thread "found" the value on its first iteration.
        else if(arr[i] == x)
        {
             found = true;
             break;
        }
   }
}  

// Host entry point of the question's snippet: clears the device-side `found`
// flag by copying a host `false` into the symbol. No kernel is launched here.
int main()
{
    const bool cleared = false;

    // Offset 0 into the symbol, explicit host-to-device direction.
    cudaMemcpyToSymbol(found, &cleared, sizeof(cleared), 0, cudaMemcpyHostToDevice);
}
\uuuuuuu设备\uuuuuuuuo找到;
__全球的__
无效搜索()
{

对于(int i=threadIdx.x;i

正如评论中所指出的,通过将全局设备标志声明为 volatile(这将禁止缓存),并使用内存栅栏函数,您也许可以实现您想要的效果。除了 CUDA 9 和新硬件中引入的新网格同步机制之外,CUDA 实际上并没有全局同步原语,但在本例中可能并不需要它。下面将您的伪代码转化为一个玩具示例:

#include <iostream>
#include <thrust/device_vector.h>

// Device-global early-exit flag: set by search<true> when a thread finds the
// value and polled in the kernel's loop condition. Declared volatile so each
// read in the loop is an actual memory access rather than a cached register.
__device__ volatile bool found;
// Device-global result slot: the array index at which a thread found the
// value. The host zeroes it before each launch and reads it back afterwards.
__device__ volatile size_t idx;

// Grid-stride linear search for the value x in arr[0, N).
//
// Template parameter:
//   docheck - when true, a thread that finds x raises the device flag `found`
//             so that all other threads leave their loops early; when false
//             the flag is never raised and the other threads scan to the end.
// Parameters:
//   arr - device pointer to N ints
//   x   - value to look for
//   N   - number of elements in arr
// Result:
//   The matching index is written to the device global `idx`. If several
//   elements match, whichever writer lands last wins.
// Launch layout: 1-D grid of 1-D blocks, any size (the loop strides by the
// total number of launched threads).
template<bool docheck>
__global__
void search(const int* arr, int x, size_t N)
{
   // Widen to size_t BEFORE multiplying: blockIdx.x * blockDim.x and
   // blockDim.x * gridDim.x are otherwise evaluated in 32-bit unsigned
   // arithmetic and can wrap for large grids.
   size_t i = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x;
   const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

   for(; (i<N) && (!found); i += stride)
   {
        if(arr[i] == x)
        {
             // Publish the result first, fence, and only then raise the flag,
             // so the flag never becomes visible before the index it announces.
             idx = i;
             __threadfence();
             if (docheck) found = true;
             break;
        }
   }
}  

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Runs one search<docheck> experiment: resets the device globals `found` and
// `idx`, launches the kernel with an occupancy-derived configuration, waits for
// it to finish and prints the index read back from `idx`.
// Returns false (after printing a diagnostic) if any CUDA call fails.
template<bool docheck>
static bool runSearch(const int* arr, int x, size_t N)
{
    const bool flag = false;
    const size_t zero = 0;

    if (!cudaOk(cudaMemcpyToSymbol(found, &flag, sizeof(bool)), "cudaMemcpyToSymbol(found)")) return false;
    if (!cudaOk(cudaMemcpyToSymbol(idx, &zero, sizeof(size_t)), "cudaMemcpyToSymbol(idx)")) return false;

    int blocks = 0, threads = 0;
    if (!cudaOk(cudaOccupancyMaxPotentialBlockSize(&blocks, &threads, search<docheck>),
                "cudaOccupancyMaxPotentialBlockSize")) return false;

    search<docheck><<<blocks, threads>>>(arr, x, N);
    // A kernel launch returns no status itself: bad launch configurations
    // surface through cudaGetLastError, in-kernel faults at the next sync.
    if (!cudaOk(cudaGetLastError(), "search launch")) return false;
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) return false;

    size_t result = 0;
    if (!cudaOk(cudaMemcpyFromSymbol(&result, idx, sizeof(size_t)), "cudaMemcpyFromSymbol(idx)")) return false;
    std::cout << "result = " << result << std::endl;
    return true;
}

// Builds a 2^24-element device array of ones with a single sentinel value and
// searches for it twice: first without early exit (search<false>), then with
// the `found` flag enabled (search<true>), printing the located index each time.
// Returns 0 on success, 1 if any CUDA runtime call failed.
int main()
{
    const size_t N = 1 << 24;
    const size_t findidx = 280270;
    const int findval = 0xdeadbeef;

    thrust::device_vector<int> data(N,1);
    data[findidx] = findval;

    const int* d_data = thrust::raw_pointer_cast(data.data());

    if (!runSearch<false>(d_data, findval, N)) return 1;
    if (!runSearch<true>(d_data, findval, N)) return 1;

    return 0;
}
#包括
#包括
__发现设备挥发性bool;
__设备uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu;
模板
__全球的__
无效搜索(常量int*arr,整数x,大小N)
{
size_t i=线程IDX.x+块IDX.x*块DIM.x;
步长=blockDim.x*gridDim.x;

对于(;(i

很有可能通过将
bool
变量标记为 volatile 来解决您的问题。作为额外措施,您可以在内核代码中写入该
bool
变量的唯一位置之后立即调用 __threadfence()。这是一个非常有用的答案。您是如何让
nvprof
显示出设置 found 标志的线程所花费的时间的?@Brosef:我没有。探查器测量的是内核的完整执行时间。嗯,我这边没有显示出来,只显示了
search(int*, int, int)
和 [CUDA memcpy HtoD] 的时间。显然,你没有运行我的代码。请注意,我答案中的代码包含两个内核。我看到了。我没怎么写过 C++ 或 CUDA。当你从 main() 运行这两个内核时,<false> 和 <true> 是什么意思?
$ nvcc -arch=sm_52 -o notify notify.cu
$ nvprof ./notify
==3916== NVPROF is profiling process 3916, command: ./notify
result = 280270
result = 280270
==3916== Profiling application: ./notify
==3916== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   78.00%  1.6773ms         1  1.6773ms  1.6773ms  1.6773ms  void search<bool=0>(int const *, int, unsigned long)
                   19.93%  428.63us         1  428.63us  428.63us  428.63us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
                    1.82%  39.199us         1  39.199us  39.199us  39.199us  void search<bool=1>(int const *, int, unsigned long)