CUDA：如何用 Thrust 实现一维模板（stencil）代码？_Cuda_Shared Memory_Thrust_Stencil Buffer

CUDA：如何用 Thrust 实现一维模板（stencil）代码？

cuda

CUDA：如何用 Thrust 实现一维模板（stencil）代码？,cuda,shared-memory,thrust,stencil-buffer,Cuda,Shared Memory,Thrust,Stencil Buffer,简而言之，能否仅用 Thrust 实现下面所示的一维模板（stencil）内核？我希望这个实现尽可能高效，也就是说 Thrust 应当意识到同一元素会被多次访问，因而需要通过共享内存来访问 #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <iostream> #include <thrust/device_vector.h> #include <thrust/device_

简而言之，能否仅用 Thrust 实现下面所示的一维模板（stencil）内核？我希望这个实现尽可能高效，也就是说 Thrust 应当意识到同一元素会被多次访问，因而需要通过共享内存来访问。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
// Threads per block; the kernel below also uses it to size its shared-memory tile.
#define BLOCK_SIZE 128
// Stencil half-width: each output sums the inputs at offsets -RADIUS..+RADIUS.
#define RADIUS 8
// Number of input/output elements. Parenthesized so the macro expands as a
// single value inside larger expressions (e.g. `x % SIZE` or `a / SIZE`).
#define SIZE (1024*1024*8)
const dim3 DimBlock(BLOCK_SIZE);
// Round the block count up so that a SIZE which is not a multiple of
// BLOCK_SIZE still gets a block for the tail; the kernel guards every
// global access with `gindex < SIZE`, so the surplus threads are harmless.
const dim3 DimGrid((SIZE + BLOCK_SIZE - 1)/BLOCK_SIZE);

/**
 * 1D stencil: out[i] = in[i-RADIUS] + ... + in[i+RADIUS] for every i < SIZE,
 * with inputs that fall outside [0, SIZE) counted as 0.
 *
 * Launch layout: 1D grid of 1D blocks. The shared tile and the right-halo
 * offset are sized with BLOCK_SIZE, so the kernel appears to expect
 * blockDim.x == BLOCK_SIZE (and BLOCK_SIZE >= RADIUS for the halo loads).
 * Shared memory: a static array of BLOCK_SIZE + 2*RADIUS ints per block.
 *
 * @param in   device pointer to the input elements (read-only)
 * @param out  device pointer receiving one sum per input element
 */
__global__ void stencil_1d(const int * in, int *out) {
    __shared__ int tile[BLOCK_SIZE + 2 * RADIUS];

    const int g = blockIdx.x * blockDim.x + threadIdx.x;   // position in the global arrays
    const int t = RADIUS + threadIdx.x;                    // position in the tile, past the left halo
    const bool inRange = g < SIZE;

    // Centre of the tile: one element per thread.
    tile[t] = inRange ? in[g] : 0;

    // The first RADIUS threads additionally fetch the left and right halos.
    if (threadIdx.x < RADIUS) {
        const int left  = g - RADIUS;
        const int right = g + BLOCK_SIZE;
        tile[t - RADIUS]     = (left >= 0)    ? in[left]  : 0;
        tile[t + BLOCK_SIZE] = (right < SIZE) ? in[right] : 0;
    }

    // Every thread reaches this barrier; the tile is complete afterwards.
    __syncthreads();

    // Sum the 2*RADIUS+1 neighbours and write the result.
    if (inRange) {
        int sum = 0;
        for (int k = t - RADIUS; k <= t + RADIUS; ++k)
            sum += tile[k];
        out[g] = sum;
    }
}

/**
 * Host driver: fills a device vector of SIZE ints with 1, runs stencil_1d
 * over it and reports whether the kernel launched AND finished without error.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main()
{
    cudaError_t cudaStat;
    thrust::device_vector<int> dev_vec_inp(SIZE,1);
    thrust::device_vector<int> dev_vec_out(SIZE);
    try
    {
        stencil_1d<<< DimGrid, DimBlock >>>(thrust::raw_pointer_cast(dev_vec_inp.data()) , thrust::raw_pointer_cast(dev_vec_out.data()));

        // Launch-configuration errors are reported here.
        cudaStat = cudaGetLastError();
        if (cudaStat != cudaSuccess)
            throw cudaGetErrorString(cudaStat);

        // The launch is asynchronous: faults raised while the kernel runs only
        // surface at a synchronizing call, so wait before claiming success.
        cudaStat = cudaDeviceSynchronize();
        if (cudaStat != cudaSuccess)
            throw cudaGetErrorString(cudaStat);

        std::cout<<"1D stencil has been executed successfully" << std:: endl;
    }
    catch(const char* e)
    {
        std::cout<< e << std::endl;
        return 1;
    }
    return 0;
}

#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#包括
#包括
#定义块大小128
#定义半径8
#定义大小1024*1024*8
常量dim3 DimBlock（块大小）；
常量dim3 DimGrid（大小/块大小）；
__全局无效模具（常量int*in，int*out）{
__共享内部温度[块大小+2*半径]；
int-gindex=threadIdx.x+blockIdx.x*blockDim.x；
int-lindex=螺纹内径x.x+半径；
//将输入元素读入共享内存
如果（gindex=0）温度[lindex-RADIUS]=in[gindex-RADIUS]；否则温度[lindex-RADIUS]=0；
如果（gindex+块大小<大小）温度[lindex+块大小]=in[gindex+块大小]；否则温度[lindex+块大小]=0；
}
//同步（确保所有数据可用）
__同步线程（）；
//应用模板
int结果=0；
对于（int offset=-RADIUS；offset（推力：：原始指针投射（dev_vec_inp.data（）），推力：：原始指针投射（dev_vec_out.data（））；
cudaStat=cudaGetLastError（）；
如果（cudaStat！=cudaSuccess）
抛出cudaGetErrorString（cudaStat）；
else std:：cout
对于这种特定类型的模板运算（+），可以使用前缀和方法：先计算前缀和，然后根据模板半径，用模板窗口右端的前缀和减去窗口左端的前缀和。
下面是一个粗略的示例：
$ cat t1777.cu
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/host_vector.h>
#include <iostream>

// Element type shared by the input, the prefix sums and the results.
typedef int mt;
// Makes the _1 / _2 placeholder expressions usable unqualified in main().
using namespace thrust::placeholders;
// Number of input elements.
const int ds = 20;
// Stencil radius; main() offsets its two scan views by 2*stencil_radius.
const int stencil_radius = 7;

// Sliding-window sum built from Thrust primitives: an in-place inclusive
// prefix sum, then an element-wise difference between two views of the
// scanned array that lie 2*stencil_radius apart. The sums are printed on one
// line, each followed by a comma.
int main(){

  const int span = 2*stencil_radius;   // distance between the two scan samples

  thrust::device_vector<mt> data(ds, 1);
  thrust::device_vector<mt> result(ds-span);

  // After the scan, data[i] holds the sum of the original elements 0..i.
  thrust::inclusive_scan(data.begin(), data.end(), data.begin());

  // result[i] = scan[i+span] - scan[i], i.e. the sum of the original elements
  // i+1 .. i+span.
  // NOTE(review): that is `span` = 2*stencil_radius elements per window, one
  // fewer than a symmetric radius-r stencil (2r+1) - confirm this is intended.
  thrust::transform(data.begin(), data.end()-span, data.begin()+span, result.begin(), _2-_1);

  // Bring the results back to the host and print them.
  thrust::host_vector<mt> h_result(result);
  for (thrust::host_vector<mt>::const_iterator it = h_result.begin(); it != h_result.end(); ++it)
    std::cout << *it << ",";
  std::cout << std::endl;
}
$ nvcc -o t1777 t1777.cu
$ ./t1777
14,14,14,14,14,14,
$

$cat t1777.cu
#包括
#包括
#包括
#包括
#包括
#包括
常数int ds=20；
const int模具半径=7；
使用命名空间推力：：占位符；
类型定义int mt；
int main（）{
推力：设备矢量数据（ds，1）；
推力：：设备矢量结果（ds-（2*模具半径））；
inclusive_扫描（data.begin（），data.end（），data.begin（））；
推力：：变换（data.begin（），data.end（）-（2*模具半径），data.begin（）+（2*模具半径），result.begin（），_2-_1）；
推力：：主机向量h_结果=结果；
推力：：复制（h_result.begin（）、h_result.end（）、std：：ostream_迭代器（std：：cout，“，”）；
std:：cout
对于这种特定类型的模板运算（+），可以使用前缀和方法：先计算前缀和，然后根据模板半径，用模板窗口右端的前缀和减去窗口左端的前缀和。
下面是一个粗略的示例：
$ cat t1777.cu
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/host_vector.h>
#include <iostream>

// Element type shared by the input, the prefix sums and the results.
typedef int mt;
// Makes the _1 / _2 placeholder expressions usable unqualified in main().
using namespace thrust::placeholders;
// Number of input elements.
const int ds = 20;
// Stencil radius; main() offsets its two scan views by 2*stencil_radius.
const int stencil_radius = 7;

// Sliding-window sum built from Thrust primitives: an in-place inclusive
// prefix sum, then an element-wise difference between two views of the
// scanned array that lie 2*stencil_radius apart. The sums are printed on one
// line, each followed by a comma.
int main(){

  const int span = 2*stencil_radius;   // distance between the two scan samples

  thrust::device_vector<mt> data(ds, 1);
  thrust::device_vector<mt> result(ds-span);

  // After the scan, data[i] holds the sum of the original elements 0..i.
  thrust::inclusive_scan(data.begin(), data.end(), data.begin());

  // result[i] = scan[i+span] - scan[i], i.e. the sum of the original elements
  // i+1 .. i+span.
  // NOTE(review): that is `span` = 2*stencil_radius elements per window, one
  // fewer than a symmetric radius-r stencil (2r+1) - confirm this is intended.
  thrust::transform(data.begin(), data.end()-span, data.begin()+span, result.begin(), _2-_1);

  // Bring the results back to the host and print them.
  thrust::host_vector<mt> h_result(result);
  for (thrust::host_vector<mt>::const_iterator it = h_result.begin(); it != h_result.end(); ++it)
    std::cout << *it << ",";
  std::cout << std::endl;
}
$ nvcc -o t1777 t1777.cu
$ ./t1777
14,14,14,14,14,14,
$

$cat t1777.cu
#包括
#包括
#包括
#包括
#包括
#包括
常数int ds=20；
const int模具半径=7；
使用命名空间推力：：占位符；
类型定义int mt；
int main（）{
推力：设备矢量数据（ds，1）；
推力：：设备矢量结果（ds-（2*模具半径））；
inclusive_扫描（data.begin（），data.end（），data.begin（））；
推力：：变换（data.begin（），data.end（）-（2*模具半径），data.begin（）+（2*模具半径），result.begin（），_2-_1）；
推力：：主机向量h_结果=结果；
推力：：复制（h_result.begin（）、h_result.end（）、std：：ostream_迭代器（std：：cout，“，”）；
std：：通常不可能让 Thrust 在这类工作中使用共享内存，除非你选择自己编写一个基本上绕过 Thrust 的函子（functor）。即便使用自己的函子，构造起来也可能很困难。@RobertCrovella 谢谢。那我就只用 CUDA 内核做模板（stencil）计算，并用 Thrust 向量做内存管理。Thrust 提供了相邻差分（adjacent_difference），可以用它计算差分表，并由此构造出你需要的任何模板。通常不可能让 Thrust 使用共享内存来完成这样的工作，除非你选择创建自己的、基本上绕过 Thrust 的函子。即使有了自己的函子，它也可能很难（difficult）构造。@RobertCrovella 谢谢。那我将使用 CUDA 内核进行模板计算，使用 Thrust 向量进行内存管理。Thrust 有一个相邻差分，可以计算一个差分表，从中可以构造你需要的任何模板