让CUDA推力使用您选择的CUDA流
看看CUDA推力代码中的内核启动,它们似乎总是使用默认流。我可以使用我选择的流吗?我在API中遗漏了什么吗?不,您没有遗漏任何东西(至少在CUDA 6.0附带的发布快照之前)。原始的基于推力标签的调度系统有意地抽象了所有底层CUDA API调用,牺牲了一些性能以便于使用和一致性(请记住,推力具有CUDA以外的后端)。如果您想要这种级别的灵活性,您需要尝试另一个库(例如CUB)。
在CUDA 7.0快照之后的版本中,可以为推力操作设置所选的CUDA流。我想在推力1.8发布后更新Talonmies提供的答案,该版本引入了将CUDA执行流指示为
thrust::cuda::par.on(stream)
另见
在下面,我将按照新的CUDA推力API重写上面的示例。
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Use forward slashes in include paths: a backslash inside a header-name is
// implementation-defined per the C++ standard and breaks the build on
// non-Windows hosts, while '/' works everywhere (including MSVC).
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
struct BinaryOp{ __host__ __device__ int operator()(const int& o1,const int& o2) { return o1 * o2; } };
// Demonstrates enqueueing Thrust operations on explicitly chosen CUDA streams
// (thrust::cuda::par.on(stream)), overlapping H2D copies, transforms and D2H
// copies across NUM_STREAMS streams in a breadth-first fashion.
int main()
{
    const int N = 6000000;

    // --- Host side input data allocation and initialization. Registering host
    //     memory as page-locked (required for asynchronous cudaMemcpyAsync).
    int *h_in = new int[N]; for (int i = 0; i < N; i++) h_in[i] = 5;
    gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));

    // --- Host side output data allocation and initialization, also page-locked.
    int *h_out = new int[N]; for (int i = 0; i < N; i++) h_out[i] = 0;
    gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));

    // --- Host side reference results (h_in[i] squared) used to validate the
    //     GPU output at the end.
    int *h_checkResults = new int[N]; for (int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];

    // --- Device side input/output data allocation.
    int *d_in = 0;  gpuErrchk(cudaMalloc((void **)&d_in,  N * sizeof(int)));
    int *d_out = 0; gpuErrchk(cudaMalloc((void **)&d_out, N * sizeof(int)));

    // NOTE(review): N must be divisible by NUM_STREAMS, and streamSize by 2
    // (each stream processes its chunk in two half-transforms below), or the
    // tail elements are never processed. Holds for N = 6000000, NUM_STREAMS = 3.
    int streamSize = N / NUM_STREAMS;
    size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;

    // --- Set kernel launch configuration (not used by the Thrust calls below;
    //     kept from the raw-kernel version of this example).
    dim3 nThreads = dim3(NUM_THREADS, 1, 1);
    dim3 nBlocks = dim3(NUM_BLOCKS, 1, 1);
    dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

    // --- Create CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamCreate(&streams[i]));

    /**************************/
    /* BREADTH-FIRST APPROACH */
    /**************************/
    // --- H2D copies, one chunk per stream. Error-checked like every other CUDA
    //     call in this file (the original left these two loops unchecked).
    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        gpuErrchk(cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]));
    }

    // --- Two Thrust transforms per stream, each squaring one half of the
    //     stream's chunk; .on(streams[i]) enqueues them on the chosen stream.
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_in[offset]) + streamSize/2,
                          thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_out[offset]), BinaryOp());
        thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]) + streamSize/2,
                          thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_out[offset + streamSize/2]), BinaryOp());
    }

    // --- D2H copies, one chunk per stream.
    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        gpuErrchk(cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]));
    }

    // --- Wait for all streams to finish before touching h_out on the host.
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamSynchronize(streams[i]));
    gpuErrchk(cudaDeviceSynchronize());

    // --- Release resources (h_out/h_checkResults remain valid plain host
    //     memory after unregister/reset, so the check below is safe).
    gpuErrchk(cudaHostUnregister(h_in));
    gpuErrchk(cudaHostUnregister(h_out));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));
    for (int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamDestroy(streams[i]));
    cudaDeviceReset();

    // --- GPU output check: accumulate |expected - actual| so positive and
    //     negative per-element errors cannot cancel out (the original summed
    //     signed differences, which could report 0 for wrong output).
    int sum = 0;
    for (int i = 0; i < N; i++) {
        //printf("%i %i\n", h_out[i], h_checkResults[i]);
        int diff = h_checkResults[i] - h_out[i];
        sum += (diff < 0) ? -diff : diff;
    }
    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_checkResults;
    return 0;
}
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include "Utilities.cuh"
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
struct BinaryOp { __host__ __device__ int operator()(const int& o1, const int& o2) { return o1 * o2; } };
int main()
{
const int N = 6000000;
// --- 主机端输入数据分配和初始化。将主机内存注册为页面锁定(异步cudaMemcpyAsync需要)。
int *h_in = new int[N]; for (int i = 0; i < N; i++) h_in[i] = 5;
是否应该可以通过推力的master/development分支开始使用推力流进行实验?示例语法:thrust::sort(thrust::cuda::par(stream), keys.begin(), keys.end())
par
表示什么?@einpoklum 这是一个推力执行策略,就像thrust::seq
或thrust::device
一样。另一个工作示例是