Arrays 将2D malloc倾斜设备内存复制到设备中的3D阵列
在执行一个内核(即,Arrays 将2D malloc倾斜设备内存复制到设备中的3D阵列,arrays,cuda,Arrays,Cuda,在执行一个内核(即,reorder_raw)并输出一个更新的设备内存(即,d_ordered)之后,我想在另一个内核中执行一些分层插值。我知道我必须为此创建一个3D阵列,然后将我的设备内存转移到我的阵列内存d_ordered_array。但是,当我使用函数cudaMemcpy2DToArray时,我在mexPrintf(cudaGetErrorString(cudaGetLastError())中得到错误无效内存代码末尾的函数 否则,如果我对cudaMemcpy2DToArray进行注释,则不会
reorder_raw
)并输出一个更新的设备内存(即,d_ordered
)之后,我想在另一个内核中执行一些分层插值。我知道我必须为此创建一个3D阵列,然后将我的设备内存转移到我的阵列内存d_ordered_array
。但是,当我使用函数cudaMemcpy2DToArray
时,我在mexPrintf(cudaGetErrorString(cudaGetLastError())中得到错误无效内存代码>代码末尾的函数
否则,如果我对cudaMemcpy2DToArray
进行注释,则不会出现错误
void delay_US_linear(
short *h_raw, short *d_ordered, float *d_delay,
int samples, int channels, int scanlines, int elements,
float pitch, float speed_sound, float sample_freq, float delay_offset,
size_t in_pitch, size_t out_pitch
){
// Allocate the GPU raw data and ordered data buffer
short *d_raw;
cudaMalloc((void**)& d_raw, sizeof(short)*samples*channels*scanlines);
cudaMemcpy(d_raw, h_raw, sizeof(short)*samples*channels*scanlines, cudaMemcpyHostToDevice);
// Allocate block and grid dimensions
int griddim_x = (samples + order_X - 1) / order_X;
int griddim_y = (scanlines);
int griddim_z = 1;
dim3 dimGrid(griddim_x, griddim_y, griddim_z);
dim3 dimBlock(order_X, order_Y, order_Z);
// Use all threads in block for shared memory
int shared_size = order_X * order_Y * order_Z * sizeof(short);
// Only need to change the channel order, independency in axial and scanline dimension
reorder_raw << <dimGrid, dimBlock, shared_size >> > (
d_raw, d_ordered, samples, channels, scanlines, elements, in_pitch/sizeof(short));
cudaDeviceSynchronize();
// Create a 3D array
cudaArray *d_ordered_array;
cudaChannelFormatDesc desc = cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSigned);
cudaMalloc3DArray(&d_ordered_array, &desc, make_cudaExtent(samples, channels, scanlines),
cudaArrayLayered);
// Copy device memory to the 3D array
cudaMemcpy2DToArray(d_ordered_array, 0, 0, d_ordered, in_pitch, sizeof(short)*samples,
channels*scanlines,cudaMemcpyDeviceToDevice);
cudaFreeArray(d_ordered_array);
cudaFree(d_raw);
mexPrintf(cudaGetErrorString(cudaGetLastError()));
}
使用cudamaloc3d
而不是cudamalocitch
分配d\u顺序
,并使用cudaMemcpy3D
操作而不是cudaMemcpy2DToArray
,您将能够实现此功能。这些是与3DcudaArray
相匹配的。以下是一个例子:
$ cat t1733.cu
#include <iostream>
void delay_US_linear(
short *h_raw, cudaPitchedPtr d_ordered, float *d_delay,
int samples, int channels, int scanlines, int elements,
float pitch, float speed_sound, float sample_freq, float delay_offset,
size_t in_pitch, size_t out_pitch
){
// Create a 3D array
cudaArray *d_ordered_array;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<short>();
cudaExtent my_ext = make_cudaExtent(samples, channels, scanlines);
cudaMalloc3DArray(&d_ordered_array, &desc, my_ext, cudaArrayLayered);
// Copy device memory to the 3D array
cudaMemcpy3DParms p = {0};
p.srcPtr = d_ordered;
p.dstArray = d_ordered_array;
p.extent = my_ext;
p.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&p);
cudaFreeArray(d_ordered_array);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
}
int main(){
const int samples = 4864; // 4864
const int channels = 64; //64
const int scanlines = 128;// 128
cudaPitchedPtr d_ordered;
size_t in_pitch=0, out_pitch = 0;
short *h_raw = NULL;
float *d_delay = NULL;
const int elements = 0;
float pitch = 0;
float speed_sound = 0;
float sample_freq = 0;
float delay_offset = 0;
cudaExtent my_ext = make_cudaExtent(samples*sizeof(short), channels, scanlines);
cudaMalloc3D(&d_ordered, my_ext);
// cudaMallocPitch((void**) &d_ordered,&in_pitch,sizeof(short)*samples,channels*scanlines);
delay_US_linear(h_raw, d_ordered, d_delay, samples, channels, scanlines, elements,
pitch, speed_sound, sample_freq, delay_offset, in_pitch, out_pitch);
}
$ nvcc -o t1733 t1733.cu
$ cuda-memcheck ./t1733
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$
$cat t1733.cu
#包括
无效延迟_US _线性(
短*h_原始,CUDAPITCHEDTR d_订购,浮动*d_延迟,
int采样、int通道、int扫描线、int元素,
浮动节距、浮动速度、浮动声音、浮动采样频率、浮动延迟偏移、,
大小输入音高,大小输出音高
){
//创建三维阵列
cudaArray*d_有序_数组;
cudaChannelFormatDesc desc=cudaCreateChannelDesc();
cudaExtent my_ext=制作cudaExtent(样本、通道、扫描线);
cudaMalloc3DArray(&d_ordered_array,&desc,my_ext,cudaArrayLayered);
//将设备内存复制到3D阵列
cudaMemcpy3DParms p={0};
p、 srcPtr=d_有序;
p、 dstArray=d_有序_数组;
p、 范围=我的外部;
p、 种类=CUDAMEMCPydeviceto装置;
cudaMemcpy3D&p;
cudaFreeArray(d_有序_数组);
std::cout使用cudamaloc3d
而不是cudamalocitch
分配d\u顺序
,并使用cudaMemcpy3D
操作而不是cudaMemcpy2DToArray
,您将能够使其工作。这些是与您的3DcudaArray
相匹配的。以下是一个示例:
$ cat t1733.cu
#include <iostream>
void delay_US_linear(
short *h_raw, cudaPitchedPtr d_ordered, float *d_delay,
int samples, int channels, int scanlines, int elements,
float pitch, float speed_sound, float sample_freq, float delay_offset,
size_t in_pitch, size_t out_pitch
){
// Create a 3D array
cudaArray *d_ordered_array;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<short>();
cudaExtent my_ext = make_cudaExtent(samples, channels, scanlines);
cudaMalloc3DArray(&d_ordered_array, &desc, my_ext, cudaArrayLayered);
// Copy device memory to the 3D array
cudaMemcpy3DParms p = {0};
p.srcPtr = d_ordered;
p.dstArray = d_ordered_array;
p.extent = my_ext;
p.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&p);
cudaFreeArray(d_ordered_array);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
}
int main(){
const int samples = 4864; // 4864
const int channels = 64; //64
const int scanlines = 128;// 128
cudaPitchedPtr d_ordered;
size_t in_pitch=0, out_pitch = 0;
short *h_raw = NULL;
float *d_delay = NULL;
const int elements = 0;
float pitch = 0;
float speed_sound = 0;
float sample_freq = 0;
float delay_offset = 0;
cudaExtent my_ext = make_cudaExtent(samples*sizeof(short), channels, scanlines);
cudaMalloc3D(&d_ordered, my_ext);
// cudaMallocPitch((void**) &d_ordered,&in_pitch,sizeof(short)*samples,channels*scanlines);
delay_US_linear(h_raw, d_ordered, d_delay, samples, channels, scanlines, elements,
pitch, speed_sound, sample_freq, delay_offset, in_pitch, out_pitch);
}
$ nvcc -o t1733 t1733.cu
$ cuda-memcheck ./t1733
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$
$cat t1733.cu
#包括
无效延迟_US _线性(
短*h_原始,CUDAPITCHEDTR d_订购,浮动*d_延迟,
int采样、int通道、int扫描线、int元素,
浮动节距、浮动速度、浮动声音、浮动采样频率、浮动延迟偏移、,
大小输入音高,大小输出音高
){
//创建三维阵列
cudaArray*d_有序_数组;
cudaChannelFormatDesc desc=cudaCreateChannelDesc();
cudaExtent my_ext=制作cudaExtent(样本、通道、扫描线);
cudaMalloc3DArray(&d_ordered_array,&desc,my_ext,cudaArrayLayered);
//将设备内存复制到3D阵列
cudaMemcpy3DParms p={0};
p、 srcPtr=d_有序;
p、 dstArray=d_有序_数组;
p、 范围=我的外部;
p、 种类=CUDAMEMCPydeviceto装置;
cudaMemcpy3D&p;
cudaFreeArray(d_有序_数组);
标准::cout