C# 返回CUDA中数组的最小和最大元素_C#_Cuda_Cudafy.net

C# 返回CUDA中数组的最小和最大元素

c# cuda

C# 返回CUDA中数组的最小和最大元素,c#,cuda,cudafy.net,C#,Cuda,Cudafy.net,我正在CUDA中执行一些数组操作/计算（通过，尽管我对CUDA/C++方法同样感兴趣），需要计算数组中的最小值和最大值。其中一个内核如下所示： [Cudafy] public static void UpdateEz(GThread thread, float time, float ca, float cb, float[,] hx, float[,] hy, float[,] ez) { var i = thread.blockIdx.x;

我正在CUDA中执行一些数组操作/计算（通过 Cudafy.NET，不过我对CUDA/C++的方法同样感兴趣），需要计算数组中的最小值和最大值。其中一个内核如下所示：

    /// <summary>
    /// Cudafy kernel: updates a single cell of <paramref name="ez"/> in place.
    /// The cell is chosen from the block indices (blockIdx.x, blockIdx.y); cells whose
    /// index is 0 or the last index along either dimension are left untouched.
    /// </summary>
    /// <param name="thread">Cudafy thread context; only its blockIdx is read.</param>
    /// <param name="time">Not read by this kernel.</param>
    /// <param name="ca">Factor applied to the current ez value.</param>
    /// <param name="cb">Factor applied to the hx and hy differences.</param>
    /// <param name="hx">Read at [x, y] and [x - 1, y].</param>
    /// <param name="hy">Read at [x, y - 1] and [x, y].</param>
    /// <param name="ez">Array updated in place at [x, y].</param>
    [Cudafy]
    public static void UpdateEz(GThread thread, float time, float ca, float cb, float[,] hx, float[,] hy, float[,] ez)
    {
        var x = thread.blockIdx.x;
        var y = thread.blockIdx.y;

        // The update reads [x - 1, y] and [x, y - 1], so index 0 is excluded; the last
        // index along each dimension is excluded as well.
        var insideX = x > 0 && x < ez.GetLength(0) - 1;
        var isInterior = insideX && y > 0 && y < ez.GetLength(1) - 1;

        if (isInterior)
        {
            var hxDiff = hx[x, y] - hx[x - 1, y];
            var hyDiff = hy[x, y - 1] - hy[x, y];

            ez[x, y] = ca * ez[x, y] + cb * hxDiff + cb * hyDiff;
        }
    }

    /// <summary>
    /// Cudafy kernel: same cell update as the 7-argument overload, additionally reporting
    /// the updated value through <paramref name="min"/> and <paramref name="max"/>.
    /// </summary>
    /// <param name="thread">Cudafy thread context; only its blockIdx is read.</param>
    /// <param name="time">Not read by this kernel.</param>
    /// <param name="ca">Factor applied to the current ez value.</param>
    /// <param name="cb">Factor applied to the hx and hy differences.</param>
    /// <param name="hx">Read at [x, y] and [x - 1, y].</param>
    /// <param name="hy">Read at [x, y - 1] and [x, y].</param>
    /// <param name="ez">Array updated in place at [x, y].</param>
    /// <param name="min">Set to float.MaxValue, then to Math.Min(updated cell, min) when the cell is updated.</param>
    /// <param name="max">Set to float.MinValue, then to Math.Max(updated cell, max) when the cell is updated.</param>
    [Cudafy]
    public static void UpdateEz(GThread thread, float time, float ca, float cb, float[,] hx, float[,] hy, float[,] ez, out float min, out float max)
    {
        // NOTE(review): both outputs are reset on every invocation and nothing in this
        // method combines them across invocations, so they describe at most the single
        // cell handled here, not the whole array.
        min = float.MaxValue;
        max = float.MinValue;

        var x = thread.blockIdx.x;
        var y = thread.blockIdx.y;

        // Index 0 and the last index along each dimension are skipped.
        var insideX = x > 0 && x < ez.GetLength(0) - 1;
        var isInterior = insideX && y > 0 && y < ez.GetLength(1) - 1;

        if (isInterior)
        {
            var hxDiff = hx[x, y] - hx[x - 1, y];
            var hyDiff = hy[x, y - 1] - hy[x, y];

            ez[x, y] = ca * ez[x, y] + cb * hxDiff + cb * hyDiff;

            min = Math.Min(ez[x, y], min);
            max = Math.Max(ez[x, y], max);
        }
    }

[Cudafy]
公共静态void updatez（GThread线程、浮点时间、浮点ca、浮点cb、浮点[，]hx、浮点[，]hy、浮点[，]ez）
{
var i=thread.blockIdx.x；
var j=thread.blockIdx.y；
如果（i>0&&i0&&j


我想这样做：
    /// <summary>
    /// Cudafy kernel: updates a single cell of <paramref name="ez"/> in place.
    /// The cell is chosen from the block indices (blockIdx.x, blockIdx.y); cells whose
    /// index is 0 or the last index along either dimension are left untouched.
    /// </summary>
    /// <param name="thread">Cudafy thread context; only its blockIdx is read.</param>
    /// <param name="time">Not read by this kernel.</param>
    /// <param name="ca">Factor applied to the current ez value.</param>
    /// <param name="cb">Factor applied to the hx and hy differences.</param>
    /// <param name="hx">Read at [x, y] and [x - 1, y].</param>
    /// <param name="hy">Read at [x, y - 1] and [x, y].</param>
    /// <param name="ez">Array updated in place at [x, y].</param>
    [Cudafy]
    public static void UpdateEz(GThread thread, float time, float ca, float cb, float[,] hx, float[,] hy, float[,] ez)
    {
        var x = thread.blockIdx.x;
        var y = thread.blockIdx.y;

        // The update reads [x - 1, y] and [x, y - 1], so index 0 is excluded; the last
        // index along each dimension is excluded as well.
        var insideX = x > 0 && x < ez.GetLength(0) - 1;
        var isInterior = insideX && y > 0 && y < ez.GetLength(1) - 1;

        if (isInterior)
        {
            var hxDiff = hx[x, y] - hx[x - 1, y];
            var hyDiff = hy[x, y - 1] - hy[x, y];

            ez[x, y] = ca * ez[x, y] + cb * hxDiff + cb * hyDiff;
        }
    }

    /// <summary>
    /// Cudafy kernel: same cell update as the 7-argument overload, additionally reporting
    /// the updated value through <paramref name="min"/> and <paramref name="max"/>.
    /// </summary>
    /// <param name="thread">Cudafy thread context; only its blockIdx is read.</param>
    /// <param name="time">Not read by this kernel.</param>
    /// <param name="ca">Factor applied to the current ez value.</param>
    /// <param name="cb">Factor applied to the hx and hy differences.</param>
    /// <param name="hx">Read at [x, y] and [x - 1, y].</param>
    /// <param name="hy">Read at [x, y - 1] and [x, y].</param>
    /// <param name="ez">Array updated in place at [x, y].</param>
    /// <param name="min">Set to float.MaxValue, then to Math.Min(updated cell, min) when the cell is updated.</param>
    /// <param name="max">Set to float.MinValue, then to Math.Max(updated cell, max) when the cell is updated.</param>
    [Cudafy]
    public static void UpdateEz(GThread thread, float time, float ca, float cb, float[,] hx, float[,] hy, float[,] ez, out float min, out float max)
    {
        // NOTE(review): both outputs are reset on every invocation and nothing in this
        // method combines them across invocations, so they describe at most the single
        // cell handled here, not the whole array.
        min = float.MaxValue;
        max = float.MinValue;

        var x = thread.blockIdx.x;
        var y = thread.blockIdx.y;

        // Index 0 and the last index along each dimension are skipped.
        var insideX = x > 0 && x < ez.GetLength(0) - 1;
        var isInterior = insideX && y > 0 && y < ez.GetLength(1) - 1;

        if (isInterior)
        {
            var hxDiff = hx[x, y] - hx[x - 1, y];
            var hyDiff = hy[x, y - 1] - hy[x, y];

            ez[x, y] = ca * ez[x, y] + cb * hxDiff + cb * hyDiff;

            min = Math.Min(ez[x, y], min);
            max = Math.Max(ez[x, y], max);
        }
    }

[Cudafy]
公共静态void updatez（GThread线程、浮点时间、浮点ca、浮点cb、浮点[，]hx、浮点[，]hy、浮点[，]ez、浮点最小值、浮点最大值）
{
var i=thread.blockIdx.x；
var j=thread.blockIdx.y；
min=float.MaxValue；
max=float.MinValue；
如果（i>0&&i0&&j

任何人都知道返回最小值和最大值的便捷方法（对于整个数组，而不仅仅是每个线程或块）？
您可以使用一种方法开发自己的最小/最大算法
如果您可以使用npp，那么此函数可能很有用：。
您可以使用一种方法开发自己的最小/最大算法
如果您有可能使用npp，那么此函数可能很有用：。
根据您对问题的评论，您试图在计算最大值和最小值时找到它们；虽然这是可能的，但它并不是最有效的。如果您打算这样做，那么您可以对一些全局最小值和全局最大值进行原子比较，其缺点是每个线程都将被序列化，这可能是一个严重的瓶颈
对于通过归约（reduction）查找数组中最大值或最小值的更规范的方法，您可以执行以下操作：
/**
 * Per-block maximum reduction.
 *
 * Block b reduces the slice of d_data starting at index b*BLKSZ (at most BLKSZ
 * elements, clipped to d_len) in shared memory and stores the slice maximum in
 * max_val[b]. Run the kernel again on max_val to combine the per-block results.
 *
 * Launch layout : 1-D grid, 1-D block. Any blockDim.x >= 1 gives a correct result;
 *                 BLKSZ/2 threads keeps every thread busy in the first reduction step.
 * Shared memory : BLKSZ * sizeof(T), statically allocated.
 * Preconditions : BLKSZ is a power of two; T supports operator>; max_val has room
 *                 for gridDim.x elements.
 *
 * Slots past the end of the input are padded with an element taken from the input
 * itself, so no type-specific "smallest value" sentinel is needed: a duplicate of an
 * existing element can never change a maximum. A block that lies entirely past the
 * end of the input writes the last input element, which leaves the overall maximum
 * unchanged. When d_len <= 0 the kernel writes nothing.
 *
 * @param d_data   device pointer to the input elements
 * @param d_len    number of valid elements in d_data
 * @param max_val  device pointer receiving one maximum per block
 */
template <typename T, int BLKSZ> __global__
void cu_max_reduce(const T* d_data, const int d_len, T* max_val)
{
    // The halving loop below only visits every slot when BLKSZ is a power of two.
    static_assert(BLKSZ > 0 && (BLKSZ & (BLKSZ - 1)) == 0,
                  "cu_max_reduce: BLKSZ must be a power of two");

    __shared__ T smem[BLKSZ];

    // Same decision for every thread of the block, so returning before the barriers is safe.
    if (d_len <= 0)
        return;

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // 64-bit offsets: bid*BLKSZ can exceed INT_MAX when the grid overshoots the data.
    const long long base    = static_cast<long long>(bid) * BLKSZ;
    const long long last    = static_cast<long long>(d_len) - 1;
    const long long pad_idx = (base < last) ? base : last;

    // Fill the tile. Consecutive threads touch consecutive addresses (coalesced global
    // loads, conflict-free shared stores) and the loop covers the whole tile for any
    // block size.
    for (int k = tid; k < BLKSZ; k += blockDim.x)
    {
        const long long src = base + k;
        smem[k] = d_data[(src < d_len) ? src : pad_idx];
    }
    __syncthreads();

    // Tree reduction: each step folds the upper half of the live range onto the lower half.
    for (int stride = BLKSZ / 2; stride > 0; stride >>= 1)
    {
        for (int k = tid; k < stride; k += blockDim.x)
        {
            const T lo = smem[k];
            const T hi = smem[k + stride];
            smem[k] = (lo > hi) ? lo : hi;
        }
        // Reached by every thread: the barrier sits outside the per-thread loop.
        __syncthreads();
    }

    // One store per block is enough.
    if (tid == 0)
        max_val[bid] = smem[0];
}


// Standard headers used by the host-side error reporting in cu_find_max.
#include <stdexcept>
#include <string>

/**
 * Returns the maximum of the data_len elements stored at device pointer d_data.
 *
 * Launches cu_max_reduce repeatedly: the first pass reduces the input to one value
 * per block, and each following pass reduces the previous pass's per-block results,
 * until a single value is left. That value is copied back to the host and returned.
 * Two scratch buffers are used alternately so a pass never reads the buffer it
 * writes; both are freed before returning or throwing.
 *
 * Every pass is launched with BLKSZ/2 threads per block (each thread loads two
 * elements), which is the layout cu_max_reduce is written for.
 *
 * @param d_data    device pointer to the input; must not be null
 * @param data_len  number of elements in d_data; must be > 0
 * @return the largest element, as ordered by operator> on T
 * @throws std::invalid_argument if d_data is null or data_len <= 0
 * @throws std::runtime_error    if a CUDA allocation, launch or copy fails
 */
template <typename T> __host__
T cu_find_max(const T* d_data, const int data_len)
{
    if (d_data == nullptr || data_len <= 0)
        throw std::invalid_argument("cu_find_max: input must be a non-empty device array");

    const int thread_per_block = 128;                   // multiple of the warp size
    const int elem_per_thread = 2;
    const int BLKSZ = elem_per_thread*thread_per_block; // number of elements to process per block

    // Integer ceil-div written as (n - 1) / d + 1: exact for every positive int and
    // free of the overflow that n + d - 1 can hit near INT_MAX.
    int remaining = data_len;
    int blocks_per_grid = (remaining - 1) / BLKSZ + 1;

    T* d_partial[2] = { nullptr, nullptr };

    // cudaFree(nullptr) is a no-op, so this is safe at any point.
    auto release = [&]() {
        cudaFree(d_partial[0]);
        cudaFree(d_partial[1]);
        d_partial[0] = nullptr;
        d_partial[1] = nullptr;
    };
    auto check = [&](cudaError_t status, const char* what) {
        if (status != cudaSuccess)
        {
            release();
            throw std::runtime_error(std::string("cu_find_max: ") + what + ": "
                                     + cudaGetErrorString(status));
        }
    };

    // Buffer 0 receives the first pass (largest output); buffer 1 only has to hold
    // the second pass's output, and every later pass is smaller still.
    check(cudaMalloc((void **)&d_partial[0], sizeof(T)*blocks_per_grid), "cudaMalloc");
    if (blocks_per_grid > 1)
    {
        const int second_pass_blocks = (blocks_per_grid - 1) / BLKSZ + 1;
        check(cudaMalloc((void **)&d_partial[1], sizeof(T)*second_pass_blocks), "cudaMalloc");
    }

    const T* d_in = d_data;
    int out = 0;
    for (;;)
    {
        dim3 block_dim(thread_per_block, 1, 1);
        dim3 grid_dim(blocks_per_grid, 1, 1);

        cu_max_reduce <T, BLKSZ> <<<grid_dim, block_dim>>> (d_in, remaining, d_partial[out]);
        check(cudaGetLastError(), "cu_max_reduce launch");

        if (blocks_per_grid == 1)
            break;

        // The per-block results become the input of the next pass.
        d_in = d_partial[out];
        remaining = blocks_per_grid;
        blocks_per_grid = (remaining - 1) / BLKSZ + 1;
        out ^= 1;
    }

    // A blocking cudaMemcpy waits for the preceding kernels and also surfaces any
    // error they raised while executing.
    T result;
    check(cudaMemcpy(&result, d_partial[out], sizeof(T), cudaMemcpyDeviceToHost), "cudaMemcpy");

    release();
    return result;
}

#定义最大负//一些小数目
模板\uuu全局__
无效cu_max_reduce（常数T*d_数据、常数int d_len、T*max_val）
{
易失性共享smem[BLKSZ]；
const int tid=threadIdx.x；
const int bid=blockIdx.x；
//启动每个块的索引，以开始将输入数据加载到共享内存中
const int bid_sidx=bid*BLKSZ；
//将输入数据加载到smem，如果需要，可以使用填充。每个线程处理2个元素
#布拉格展开
对于（int i=0；i<2；i++）
{
//获取线程加载到共享内存中的索引
const int tid_idx=2*tid+i；
const int ld_idx=bid_sidx+tid_idx；
如果（ld_idx<（投标+1）*BLKSZ和ld_idx0；步长>>=1）
{
如果（tid<步幅）
{
smem[tid]=（（smem[tid]>smem[tid+步幅]）？smem[tid]：smem[tid+步幅]）；
}
__同步线程（）；
}
//将每个块的结果从共享内存写入全局内存
最大值[bid]=smem[0]；
}
//假设我们有d_数据作为设备指针，数据长度为
模板主机__
T cu_find_max（常数T*d_数据，常数int数据长度）
{
//在主机代码中，使用以下内容调用内核：
const int thread_per_block=16；
const int elem_per_thread=2；
const int BLKSZ=elem_per_thread*thread_per_block；//每个块要处理的元素数
const int blocks_per_grid=ceil（（浮点）数据长度/（BLKSZ））；
dim3块尺寸（每个块的螺纹，1,1）；
dim3网格尺寸（每个网格的块，1,1）；
T*d_max；
Cudamaloc（（空隙**）和d_最大值，尺寸（T）*每个网格块）；
cu_max_reduce（数据、数据长度、数据最大值）；
//等等。。。。
}

这将找到每个块的最大值。您可以在1个块上对其输出再次运行它（例如，使用d_max作为输入数据，并使用更新的启动参数），以查找全局最大值-如果数据集太大，则需要以这样的多遍方式运行它（在本例中，超过2*4096个元素，因为我们让每个线程处理2个元素，尽管您可以只在每个线程中处理更多元素来增加这一点）
我应该指出，这并不是特别高效（在加载共享内存时，您可能希望使用更合理的步幅来避免存储体冲突（bank conflict）），而且我也不能100%确定它是否正确（它在我尝试过的一些小测试用例中可以工作），但我已尽量把它写得清晰易懂。另外，别忘了加入错误检查代码，以确保您的CUDA调用成功完成；为了保持简短，我在这里省略了它们。
我还应该向您推荐一些更深入的文档；您可以查看CUDA示例中的归约（reduction）样例，尽管它没有进行最小/最大计算，但总体思路是相同的（而且更高效）。此外，如果您追求简单，可以直接使用Thrust的函数 thrust::max_element
和 thrust::min_element
，文档位于：Thrust.github.com/doc/group_extrema.html。根据您对问题的评论，您试图在计算这些值的同时找到最大值和最小值；虽然这是可能的，但它并不是最高效的。如果您打算这样做，那么您可以与某个全局最小值和全局最大值进行原子比较