cuda功能在cuda中的应用_Cuda - Fatal编程技术网

cuda功能在cuda中的应用

cuda

cuda功能在cuda中的应用,cuda,Cuda,在将矩阵a和向量x相乘得到结果y之后，我想对y应用函数h元素我想得到z=h（Ax），其中h被元素地应用于向量Ax 我知道如何在GPU上进行矩阵/向量乘法（使用cublas）。现在，我希望h（这是我自己的函数，用C++编写）也应用于GPU中的结果向量，我如何才能做到这一点？两种可能的方法是：编写自己的CUDA内核来执行该操作使用（例如）以下是两种方法的一个有效示例： $ cat t934.cu #include <iostream> #include <thrust/ho

在将矩阵A和向量x相乘得到结果y之后，我想对y逐元素地应用函数h。

我想得到z=h（Ax），其中h逐元素地应用于向量Ax。

我知道如何在GPU上进行矩阵/向量乘法（使用cublas）。现在，我希望h（这是我自己的函数，用C++编写）也应用于GPU中的结果向量，我如何才能做到这一点？

两种可能的方法是：

编写自己的CUDA内核来执行该操作

使用thrust（例如thrust::for_each()）

以下是两种方法的一个有效示例：

$ cat t934.cu
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/for_each.h>

#define DSIZE 4

#define nTPB 256

// Element-wise function applied to every entry of the vector.
// Callable from host and device code; returns the input plus 5 and
// leaves the referenced argument unmodified.
// Replace the body with your own function.
template <typename T>
__host__ __device__
T myfunc(T &value)
{
  const int offset = 5;
  return value + offset;
}

// Functor form of myfunc: overwrites the referenced element with
// myfunc's result. Passed to thrust::for_each in main.
struct mytfunc
{
  template <typename Elem>
  __host__ __device__
  void operator()(Elem &elem)
  {
    elem = myfunc(elem);
  }
};

// Applies myfunc in place to each of the dsize elements of dvec.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat
// global index and advances by the total number of launched threads, so
// every element is processed exactly once for any grid size (including a
// grid that covers dsize one-thread-per-element, as main launches it).
// Indices are computed in size_t so they match dsize's type and cannot
// overflow a 32-bit int on large inputs.
//
// dvec  : pointer to at least dsize elements, dereferenced on the device
// dsize : number of elements to process; 0 is a no-op
template <typename T>
__global__ void mykernel(T *dvec, size_t dsize){

  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
       idx < dsize;
       idx += stride)
    dvec[idx] = myfunc(dvec[idx]);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){

  if (err == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
  return false;
}

// Applies myfunc to DSIZE floats (0, 1, 2, ...) twice: first with the
// hand-written kernel, then with thrust::for_each on the kernel's output.
// Prints the vector after each step as a comma-separated line.
// Returns 0 on success, 1 if any CUDA runtime call or the launch failed.
int main(){

  const size_t bytes = DSIZE*sizeof(float);

  // first using kernel
  float *h_data = new float[DSIZE];
  float *d_data = 0;
  for (int i = 0; i < DSIZE; i++) h_data[i] = i;

  bool ok = cudaOk(cudaMalloc(&d_data, bytes), "cudaMalloc");
  ok = ok && cudaOk(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)");
  if (ok){
    mykernel<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_data, DSIZE);
    // a launch returns no status itself; launch errors are read back here
    ok = cudaOk(cudaGetLastError(), "mykernel launch");
  }
  // blocking copy: errors raised while the kernel ran surface here
  ok = ok && cudaOk(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");

  if (ok){
    for (int i = 0; i < DSIZE; i++) std::cout << h_data[i] << ",";
    std::cout << std::endl;

    // then using thrust (vectors are scoped so they are released before cleanup)
    thrust::host_vector<float>   hvec(h_data, h_data+DSIZE);
    thrust::device_vector<float> dvec = hvec;
    thrust::for_each(dvec.begin(), dvec.end(), mytfunc());
    thrust::copy_n(dvec.begin(), DSIZE, std::ostream_iterator<float>(std::cout, ","));
    std::cout << std::endl;
  }

  // single cleanup path; cudaFree on a null pointer performs no operation
  cudaFree(d_data);
  delete[] h_data;
  return ok ? 0 : 1;
}

$ nvcc -o t934 t934.cu
$ ./t934
5,6,7,8,
10,11,12,13,
$

$cat t934.cu
#包括
#包括
#包括
#包括
#包括
#定义DSIZE 4
#定义nTPB 256
模板
__主机设备myfunc（T&d）{
返回d+5；//在这里定义您自己的函数
}
结构mytfunc
{
模板
__主机设备__
无效运算符（）（T&d）{
d=myfunc（d）；
}
};
模板
__全局无效mykernel（T*dvec，size\T dsize）{
int idx=threadIdx.x+blockDim.x*blockIdx.x；
如果（idx对于（inti=0；i
编写自己的CUDA内核来执行该操作
使用thrust（例如thrust::for_each()）
以下是两种方法的一个有效示例：
$ cat t934.cu
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/for_each.h>

#define DSIZE 4

#define nTPB 256

// Element-wise function applied to every entry of the vector.
// Callable from host and device code; returns the input plus 5 and
// leaves the referenced argument unmodified.
// Replace the body with your own function.
template <typename T>
__host__ __device__
T myfunc(T &value)
{
  const int offset = 5;
  return value + offset;
}

// Functor form of myfunc: overwrites the referenced element with
// myfunc's result. Passed to thrust::for_each in main.
struct mytfunc
{
  template <typename Elem>
  __host__ __device__
  void operator()(Elem &elem)
  {
    elem = myfunc(elem);
  }
};

// Applies myfunc in place to each of the dsize elements of dvec.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat
// global index and advances by the total number of launched threads, so
// every element is processed exactly once for any grid size (including a
// grid that covers dsize one-thread-per-element, as main launches it).
// Indices are computed in size_t so they match dsize's type and cannot
// overflow a 32-bit int on large inputs.
//
// dvec  : pointer to at least dsize elements, dereferenced on the device
// dsize : number of elements to process; 0 is a no-op
template <typename T>
__global__ void mykernel(T *dvec, size_t dsize){

  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
       idx < dsize;
       idx += stride)
    dvec[idx] = myfunc(dvec[idx]);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){

  if (err == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
  return false;
}

// Applies myfunc to DSIZE floats (0, 1, 2, ...) twice: first with the
// hand-written kernel, then with thrust::for_each on the kernel's output.
// Prints the vector after each step as a comma-separated line.
// Returns 0 on success, 1 if any CUDA runtime call or the launch failed.
int main(){

  const size_t bytes = DSIZE*sizeof(float);

  // first using kernel
  float *h_data = new float[DSIZE];
  float *d_data = 0;
  for (int i = 0; i < DSIZE; i++) h_data[i] = i;

  bool ok = cudaOk(cudaMalloc(&d_data, bytes), "cudaMalloc");
  ok = ok && cudaOk(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)");
  if (ok){
    mykernel<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_data, DSIZE);
    // a launch returns no status itself; launch errors are read back here
    ok = cudaOk(cudaGetLastError(), "mykernel launch");
  }
  // blocking copy: errors raised while the kernel ran surface here
  ok = ok && cudaOk(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");

  if (ok){
    for (int i = 0; i < DSIZE; i++) std::cout << h_data[i] << ",";
    std::cout << std::endl;

    // then using thrust (vectors are scoped so they are released before cleanup)
    thrust::host_vector<float>   hvec(h_data, h_data+DSIZE);
    thrust::device_vector<float> dvec = hvec;
    thrust::for_each(dvec.begin(), dvec.end(), mytfunc());
    thrust::copy_n(dvec.begin(), DSIZE, std::ostream_iterator<float>(std::cout, ","));
    std::cout << std::endl;
  }

  // single cleanup path; cudaFree on a null pointer performs no operation
  cudaFree(d_data);
  delete[] h_data;
  return ok ? 0 : 1;
}

$ nvcc -o t934 t934.cu
$ ./t934
5,6,7,8,
10,11,12,13,
$

$cat t934.cu
#包括
#包括
#包括
#包括
#包括
#定义DSIZE 4
#定义nTPB 256
模板
__主机设备myfunc（T&d）{
返回d+5；//在这里定义您自己的函数
}
结构mytfunc
{
模板
__主机设备__
无效运算符（）（T&d）{
d=myfunc（d）；
}
};
模板
__全局无效mykernel（T*dvec，size\T dsize）{
int idx=threadIdx.x+blockDim.x*blockIdx.x；
如果（idx对于（inti=0；i