将openMp程序移植到cuda c：正确的网格大小/块大小和缩减_C_Cuda_Openmp_Gpgpu_Thrust

将 OpenMP 程序移植到 CUDA C：正确的网格大小/块大小与归约（reduction）

c cuda

将openMp程序移植到cuda c：正确的网格大小/块大小和缩减,c,cuda,openmp,gpgpu,thrust,C,Cuda,Openmp,Gpgpu,Thrust,我想将openMP程序转换为cuda c。我试图在网络和sdk上找到自己的路。但是材料超出了我的水平。我的c程序在n=2^30索引上循环，并添加每个索引的权重 1正确的网格大小和块大小是多少？我的猜测是复制openMP并执行 grid_size=n/max_number_of_cuda_threads; block_size=1; 2如何在cuda中实现openMP缩减？我尝试了一个cudaMemcpy，然后在标准c中减少了数组，但速度似乎很慢。我看一下推力库和它的reduce操作符

我想把一个 OpenMP 程序转换为 CUDA C。我尝试在网上和 SDK 中寻找方法，但这些资料超出了我目前的水平。我的 C 程序在 n=2^30 个索引上循环，并把每个索引的权重累加起来。

1. 正确的网格大小（grid size）和块大小（block size）是多少？我的猜测是照搬 OpenMP 的做法，设置：

grid_size=n/max_number_of_cuda_threads;
block_size=1;

2. 如何在 CUDA 中实现 OpenMP 的归约（reduction）？我试过先做一次 cudaMemcpy，然后在标准 C 中对数组做归约，但速度似乎很慢。我看了 Thrust 库和它的 reduce 操作，但不知道如何把它与我现有的代码结合起来。

C 程序：

#include <math.h>
#include <omp.h>

/* Weight of one index: index*data[0] + index*data[1] + index*data[2] +
 * index*data[3], accumulated left to right in a float.
 * `data` must point to at least 4 floats. */
float get_weigth_of_index(long index,float* data){
    float acc=0;
    const float* const end=data+4;
    const float* p;
    for(p=data;p!=end;++p)
        acc+=index*(*p);
    return acc;
}

/* Adds up get_weigth_of_index(k, data) for k = 0 .. 2^30 - 1 using an OpenMP
 * "+" reduction on a float. The total is neither printed nor returned. */
int main(){
    float data[4]={0,1,2,3};
    const long count=1L<<30;   /* 2^30 loop iterations */
    float total=0;
    long k;
    #pragma omp parallel for reduction (+:total)
    for(k=0;k<count;k++)
        total+=get_weigth_of_index(k,data);
    return 0;
}

我认为 thrust::transform_reduce 可以解决你的问题。下面的代码展示了如何使用它：

#include <cmath>
#include <iostream>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>

// Unary functor for thrust::transform_reduce: maps an index to
// index * _data[0] + ... + index * _data[_n - 1].
//
// The constructor copies `n` floats from host memory into a buffer obtained
// with cudaMalloc, so `_data` holds a device address. There is deliberately
// no destructor, because a by-value copy of the functor shares the same
// buffer: whoever creates the functor releases it with cudaFree(_data) once
// every copy is done being used.
struct get_weigth_of_index
{

    // data: host pointer to `n` weights; n: number of weights.
    // If the allocation or the copy fails, the functor is left empty
    // (_data == 0 and _n == 0, so operator() returns 0) and the CUDA error
    // code is kept in _status, instead of leaving _data uninitialized.
    get_weigth_of_index(const float* data, size_t n)
        : _data(0), _n(0), _status(cudaSuccess)
    {
        _status = cudaMalloc((void**)&_data, n * sizeof(float));
        if (_status != cudaSuccess)
        {
            _data = 0;
            return;
        }

        _status = cudaMemcpy(_data, data, n * sizeof(float), cudaMemcpyHostToDevice);
        if (_status != cudaSuccess)
        {
            // Do not keep a buffer whose contents were never written.
            cudaFree(_data);
            _data = 0;
            return;
        }

        _n = n;
    }

    float* _data;        // device buffer holding the weights (0 if construction failed)
    size_t _n;           // number of weights in _data
    cudaError_t _status; // cudaSuccess, or the error reported during construction

    // Returns the sum over i of index * _data[i], accumulated in float.
    // Still declared __host__ __device__, but _data is a device pointer, so
    // this must only be invoked from device code (e.g. by Thrust on device
    // iterators).
    __host__ __device__
    float operator()(const int& index) const
    {
        float v=0;
        for(size_t i=0; i<_n; i++)
            v += index * _data[i];
        return v;
    }
};

// Sums get_weigth_of_index(i) over i = 0 .. len-1 on the GPU with
// thrust::transform_reduce and prints the total.
int main(void)
{

    // Weights that the functor multiplies with every index.
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};

    size_t len = 1024; // number of indices to reduce over

    // Build the index sequence 0, 1, ..., len-1 on the host. The element type
    // is int because the functor's operator() takes `const int&`.
    // For ranges too large to materialize, thrust::counting_iterator<int>
    // (from <thrust/iterator/counting_iterator.h>) can replace this array.
    thrust::host_vector<int> h_index(len);
    for (size_t i = 0; i < len; i++)
        h_index[i] = static_cast<int>(i);

    // transfer to device
    thrust::device_vector<int> d_index = h_index;

    get_weigth_of_index unary_op(x, 4);
    thrust::plus<float> binary_op;
    float init = 0;

    float sum = thrust::transform_reduce(d_index.begin(), d_index.end(), unary_op, init, binary_op);

    std::cout << sum<< std::endl;

    // The functor allocates its device buffer with cudaMalloc and has no
    // destructor, so release the buffer here once the reduction is done.
    cudaFree(unary_op._data);

    return 0;
}

我认为 thrust::transform_reduce 可以解决你的问题。下面的代码展示了如何使用它：

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h> 
#include <thrust/host_vector.h>
#include <cmath>

// Unary functor for thrust::transform_reduce: maps an index to
// index * _data[0] + ... + index * _data[_n - 1].
//
// The constructor copies `n` floats from host memory into a buffer obtained
// with cudaMalloc, so `_data` holds a device address. There is deliberately
// no destructor, because a by-value copy of the functor shares the same
// buffer: whoever creates the functor releases it with cudaFree(_data) once
// every copy is done being used.
struct get_weigth_of_index
{

    // data: host pointer to `n` weights; n: number of weights.
    // If the allocation or the copy fails, the functor is left empty
    // (_data == 0 and _n == 0, so operator() returns 0) and the CUDA error
    // code is kept in _status, instead of leaving _data uninitialized.
    get_weigth_of_index(const float* data, size_t n)
        : _data(0), _n(0), _status(cudaSuccess)
    {
        _status = cudaMalloc((void**)&_data, n * sizeof(float));
        if (_status != cudaSuccess)
        {
            _data = 0;
            return;
        }

        _status = cudaMemcpy(_data, data, n * sizeof(float), cudaMemcpyHostToDevice);
        if (_status != cudaSuccess)
        {
            // Do not keep a buffer whose contents were never written.
            cudaFree(_data);
            _data = 0;
            return;
        }

        _n = n;
    }

    float* _data;        // device buffer holding the weights (0 if construction failed)
    size_t _n;           // number of weights in _data
    cudaError_t _status; // cudaSuccess, or the error reported during construction

    // Returns the sum over i of index * _data[i], accumulated in float.
    // Still declared __host__ __device__, but _data is a device pointer, so
    // this must only be invoked from device code (e.g. by Thrust on device
    // iterators).
    __host__ __device__
    float operator()(const int& index) const
    {
        float v=0;
        for(size_t i=0; i<_n; i++)
            v += index * _data[i];
        return v;
    }
};

// Sums get_weigth_of_index(i) over i = 0 .. len-1 on the GPU with
// thrust::transform_reduce and prints the total.
int main(void)
{

    // Weights that the functor multiplies with every index.
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};

    size_t len = 1024; // number of indices to reduce over

    // Build the index sequence 0, 1, ..., len-1 on the host. The element type
    // is int because the functor's operator() takes `const int&`.
    // For ranges too large to materialize, thrust::counting_iterator<int>
    // (from <thrust/iterator/counting_iterator.h>) can replace this array.
    thrust::host_vector<int> h_index(len);
    for (size_t i = 0; i < len; i++)
        h_index[i] = static_cast<int>(i);

    // transfer to device
    thrust::device_vector<int> d_index = h_index;

    get_weigth_of_index unary_op(x, 4);
    thrust::plus<float> binary_op;
    float init = 0;

    float sum = thrust::transform_reduce(d_index.begin(), d_index.end(), unary_op, init, binary_op);

    std::cout << sum<< std::endl;

    // The functor allocates its device buffer with cudaMalloc and has no
    // destructor, so release the buffer here once the reduction is done.
    cudaFree(unary_op._data);

    return 0;
}

您可能希望阅读相关资料，以便更好地了解哪些因素会影响您对块大小和网格大小的选择。您可能希望阅读相关资料，以便更好地了解哪些因素会影响您对块大小和网格大小的选择。有没有不需要初始化索引数组的 Thrust 操作？如果我使用一个非常大的索引范围，比如 long index={1，…，2**40}，索引数组将无法放入设备内存；如果 Thrust 能直接遍历这些值会更好。@Nicolassessis Breton：是的，请参阅 thrust::counting_iterator。@talonmies 这正是我需要的。谢谢。有没有不需要初始化索引数组的 Thrust 操作？如果我使用一个非常大的索引范围，比如 long index={1，…，2**40}，索引数组将无法放入设备内存；如果 Thrust 能直接遍历这些值会更好。@Nicolassessis Breton：是的，请参阅 thrust::counting_iterator。@talonmies 这正是我需要的。非常感谢。

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h> 
#include <thrust/host_vector.h>
#include <cmath>

// Unary functor for thrust::transform_reduce: maps an index to
// index * _data[0] + ... + index * _data[_n - 1].
//
// The constructor copies `n` floats from host memory into a buffer obtained
// with cudaMalloc, so `_data` holds a device address. There is deliberately
// no destructor, because a by-value copy of the functor shares the same
// buffer: whoever creates the functor releases it with cudaFree(_data) once
// every copy is done being used.
struct get_weigth_of_index
{

    // data: host pointer to `n` weights; n: number of weights.
    // If the allocation or the copy fails, the functor is left empty
    // (_data == 0 and _n == 0, so operator() returns 0) and the CUDA error
    // code is kept in _status, instead of leaving _data uninitialized.
    get_weigth_of_index(const float* data, size_t n)
        : _data(0), _n(0), _status(cudaSuccess)
    {
        _status = cudaMalloc((void**)&_data, n * sizeof(float));
        if (_status != cudaSuccess)
        {
            _data = 0;
            return;
        }

        _status = cudaMemcpy(_data, data, n * sizeof(float), cudaMemcpyHostToDevice);
        if (_status != cudaSuccess)
        {
            // Do not keep a buffer whose contents were never written.
            cudaFree(_data);
            _data = 0;
            return;
        }

        _n = n;
    }

    float* _data;        // device buffer holding the weights (0 if construction failed)
    size_t _n;           // number of weights in _data
    cudaError_t _status; // cudaSuccess, or the error reported during construction

    // Returns the sum over i of index * _data[i], accumulated in float.
    // Still declared __host__ __device__, but _data is a device pointer, so
    // this must only be invoked from device code (e.g. by Thrust on device
    // iterators).
    __host__ __device__
    float operator()(const int& index) const
    {
        float v=0;
        for(size_t i=0; i<_n; i++)
            v += index * _data[i];
        return v;
    }
};

// Sums get_weigth_of_index(i) over i = 0 .. len-1 on the GPU with
// thrust::transform_reduce and prints the total.
int main(void)
{

    // Weights that the functor multiplies with every index.
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};

    size_t len = 1024; // number of indices to reduce over

    // Build the index sequence 0, 1, ..., len-1 on the host. The element type
    // is int because the functor's operator() takes `const int&`.
    // For ranges too large to materialize, thrust::counting_iterator<int>
    // (from <thrust/iterator/counting_iterator.h>) can replace this array.
    thrust::host_vector<int> h_index(len);
    for (size_t i = 0; i < len; i++)
        h_index[i] = static_cast<int>(i);

    // transfer to device
    thrust::device_vector<int> d_index = h_index;

    get_weigth_of_index unary_op(x, 4);
    thrust::plus<float> binary_op;
    float init = 0;

    float sum = thrust::transform_reduce(d_index.begin(), d_index.end(), unary_op, init, binary_op);

    std::cout << sum<< std::endl;

    // The functor allocates its device buffer with cudaMalloc and has no
    // destructor, so release the buffer here once the reduction is done.
    cudaFree(unary_op._data);

    return 0;
}