CUDA:由二维数组创建二维纹理
我试图将一个 Nx3 数组传递给内核,像读取纹理内存那样从中读取数据,然后写入第二个数组。下面是我的简化代码(N=8):
#include <cstdio>
#include "handle.h"
using namespace std;
texture<float,2> tex_w;
// Question version of the kernel: each thread (i = threadIdx.x, j = threadIdx.y)
// fetches one value from the 2D texture tex_w and stores it in f[i][j].
//   imax - upper bound applied to the first index of f
//   w    - accepted but never read in this kernel
//   f    - output array of rows of 3 floats
// NOTE(review): tex2D is called with (i, j), i.e. i is used as the x texture
// coordinate and j as the y coordinate. That matches f[i][j] only if the
// texture really is imax texels wide and 3 texels high - confirm against the
// layout of the bound memory.
__global__ void kernel(int imax, float(*w)[3], float (*f)[3])
{
int i = threadIdx.x;
int j = threadIdx.y;
// Only the first index is range-checked; j is used unchecked.
if(i<imax)
f[i][j] = tex2D(tex_w, i, j);
}
// Writes the first imax rows of an Nx3 float array to stdout, one row per
// line, each line prefixed with its row index.
void print_to_stdio(int imax, float (*w)[3])
{
    int row = 0;
    while (row < imax)
    {
        const float *cols = w[row];
        printf("%2d %3.6f\t %3.6f\t %3.6f\n", row, cols[0], cols[1], cols[2]);
        ++row;
    }
}
// Question version of the host driver: fills an imax x 3 host array, binds a
// device buffer to the 2D texture tex_w, runs the kernel and prints what the
// kernel wrote.
int main(void)
{
int imax = 8;
float (*w)[3];
float (*d_w)[3], (*d_f)[3];
// Despite its name this is used as the thread-block shape in the launch
// below: imax threads in x, 3 threads in y.
dim3 grid(imax,3);
w = (float (*)[3])malloc(imax*3*sizeof(float));
// Host layout: imax rows of 3 contiguous floats, w[i][j] = i + 0.01*j.
for(int i=0; i<imax; i++)
{
for(int j=0; j<3; j++)
{
w[i][j] = i + 0.01f*j;
}
}
cudaMalloc( (void**) &d_w, 3*imax*sizeof(float) );
cudaMalloc( (void**) &d_f, 3*imax*sizeof(float) );
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
// Binds d_w as a texture of width imax and height 3 with a row pitch of
// imax floats.
// NOTE(review): the data above are stored as imax rows of 3 floats, so this
// width/height/pitch describes a different layout than the one in memory.
// d_w also comes from plain cudaMalloc, so its pitch is not one returned by
// cudaMallocPitch - confirm the alignment rules in the cudaBindTexture2D docs.
HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, desc, imax, 3, sizeof(float)*imax ) );
// The upload is issued after the bind but before the kernel launch.
cudaMemcpy(d_w, w, 3*imax*sizeof(float), cudaMemcpyHostToDevice);
// just use threads for simplicity: a single block of imax x 3 threads
kernel<<<1,grid>>>(imax, d_w, d_f);
// The kernel output overwrites the host input array w.
cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost);
cudaUnbindTexture(tex_w);
cudaFree(d_w);
cudaFree(d_f);
print_to_stdio(imax, w);
free(w);
return 0;
}
但我得到的却是:
0 0.000000 2.020000 5.010000
1 0.010000 3.000000 5.020000
2 0.020000 3.010000 6.000000
3 1.000000 3.020000 6.010000
4 1.010000 4.000000 6.020000
5 1.020000 4.010000 7.000000
6 2.000000 4.020000 7.010000
7 2.010000 5.000000 7.020000
我认为这与我给cudaBindTexture2D的pitch参数有关,但是使用较小的值会导致无效的参数错误
提前谢谢!

回答:我可以直接给你一个完整的解决方案,但那样你可能学不到东西 :D,所以这里先给出一些提示,也许你可以自己解决剩下的问题。

提示1. 使用 cudaBindTexture2D 时,需要提供偏移量(offset)和行间距(pitch)。这两个参数都有与硬件相关的对齐限制。如果使用 cudaMalloc(..) 分配内存,偏移量保证为 0;行间距(pitch)应通过 cudaMallocPitch(..) 获得。你还需要确保主机内存按相同的 pitch 排列,否则 memcpy 将无法按预期工作。
提示2. 理解二维索引。访问 W[i][j] 中的元素时要知道,内存中的下一个元素是 W[i][j+1],而不是 W[i+1][j]。

提示3. 使用一维数组,并自己计算二维索引,这样你能有更好的控制。

回答(提问者自答):看了 brano 的回答并进一步了解 pitch(行间距)的工作原理之后,我来回答自己的问题。以下是修改后的代码:
#include <cstdio>
#include <iostream>
#include "handle.cu"
using namespace std;
texture<float,2,cudaReadModeElementType> tex_w;
// Copies the 2D texture bound to tex_w into f, one texel per thread.
//
// Launch layout: a single block whose threadIdx.x selects the row i of f and
// whose threadIdx.y selects the column j (the host launches imax x 3 threads).
// The texture is bound with width = 3 and height = imax, so the x texture
// coordinate is the column (j) and the y texture coordinate is the row (i).
//
//   imax - number of rows in f / texture height
//   f    - output array of imax rows of 3 floats
__global__ void kernel(int imax, float (*f)[3])
{
    const int i = threadIdx.x;  // row of f, texture y
    const int j = threadIdx.y;  // column of f, texture x

    // Guard both indices so that a block larger than imax x 3 cannot write
    // outside f.
    if (i < imax && j < 3)
    {
        // With unnormalized coordinates the texel centres sit at index + 0.5;
        // sampling exactly there makes linear filtering return the stored
        // value instead of a blend of neighbours.
        f[i][j] = tex2D(tex_w, j + 0.5f, i + 0.5f);
    }
}
// Dumps the first imax rows of an Nx3 float array to stdout (row index
// followed by the three values), then emits one blank line as a separator.
void print_to_stdio(int imax, float (*w)[3])
{
    int row = 0;
    while (row < imax)
    {
        const float *cols = w[row];
        printf("%2d %3.3f %3.3f %3.3f\n", row, cols[0], cols[1], cols[2]);
        ++row;
    }
    printf("\n");
}
// Host driver: uploads an imax x 3 float array into pitched device memory,
// binds it to the 2D texture tex_w, lets the kernel read every texel back into
// a second device array, and prints the array before and after the round trip.
// Every CUDA runtime call is checked with HANDLE_ERROR (defined outside this
// block). Returns 0 on success, 1 if the host allocation fails.
int main(void)
{
    const int imax = 8;                         // number of rows
    const size_t rowBytes = 3 * sizeof(float);  // unpadded row width in bytes
    float (*w)[3];
    float (*d_f)[3], *d_w;
    dim3 block(imax, 3);                        // one thread per array element

    w = (float (*)[3])malloc(imax * rowBytes);
    if (w == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (int i = 0; i < imax; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            w[i][j] = i + 0.01f * j;
        }
    }
    print_to_stdio(imax, w);

    // Configure the texture reference BEFORE binding, following the order used
    // in the CUDA Programming Guide's texture-reference example; attributes
    // changed after the bind are not guaranteed to apply to that binding.
    tex_w.normalized = false;                     // address texels by index
    tex_w.filterMode = cudaFilterModeLinear;
    tex_w.addressMode[0] = cudaAddressModeClamp;  // don't wrap around indices
    tex_w.addressMode[1] = cudaAddressModeClamp;

    // Let the runtime choose a row pitch that satisfies the 2D texture
    // alignment requirements.
    size_t pitch;
    HANDLE_ERROR( cudaMallocPitch((void**)&d_w, &pitch, rowBytes, imax) );
    HANDLE_ERROR( cudaMemcpy2D(d_w,       // device destination
                               pitch,     // device pitch (calculated above)
                               w,         // src on host
                               rowBytes,  // pitch on src (no padding so just width of row)
                               rowBytes,  // width of data in bytes
                               imax,      // height of data
                               cudaMemcpyHostToDevice) );

    // width = 3 texels, height = imax texels, rows `pitch` bytes apart
    HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, tex_w.channelDesc, 3, imax, pitch) );

    // d_f will have result array
    HANDLE_ERROR( cudaMalloc((void**)&d_f, imax * rowBytes) );

    // just use threads for simplicity: a single block of imax x 3 threads
    kernel<<<1, block>>>(imax, d_f);
    HANDLE_ERROR( cudaGetLastError() );  // report launch-configuration errors

    // Blocking copy: also waits for the kernel to finish before reading d_f.
    HANDLE_ERROR( cudaMemcpy(w, d_f, imax * rowBytes, cudaMemcpyDeviceToHost) );

    HANDLE_ERROR( cudaUnbindTexture(tex_w) );
    HANDLE_ERROR( cudaFree(d_w) );
    HANDLE_ERROR( cudaFree(d_f) );

    print_to_stdio(imax, w);
    free(w);
    return 0;
}
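#include <cstdio>
#include <iostream>
#include "handle.cu"
using namespace std;
texture<float,2,cudaReadModeElementType> tex_w;
__global__ void kernel(int imax, float (*f)[3])
{
int i = threadIdx.x;
int j = threadIdx.y;
// width = 3, height = imax
// but we have imax threads in x, 3 in y
// therefore height corresponds to x threads (i)
// and width corresponds to y threads (j)
if(i<imax) …(此处代码被截断,完整代码见下文)
评论:谢谢。你能告诉我如何为它创建一个合适的通道描述符吗?你的代码假设 tex_w 已经有了一个,而 CUDA 文档对此说得不是很清楚。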
#include <cstdio>
#include <iostream>
#include "handle.cu"
using namespace std;
texture<float,2,cudaReadModeElementType> tex_w;
// Reads every texel of the 2D texture bound to tex_w and writes it to f.
//
// Expected launch: one block, threadIdx.x in [0, imax) picking the row i and
// threadIdx.y in [0, 3) picking the column j.
// The texture is bound with width = 3 and height = imax, hence the x texture
// coordinate is the column index j and the y texture coordinate is the row
// index i.
//
//   imax - number of rows in f / texture height
//   f    - output array of imax rows of 3 floats
__global__ void kernel(int imax, float (*f)[3])
{
    const int i = threadIdx.x;  // row of f, texture y
    const int j = threadIdx.y;  // column of f, texture x

    // Check the column as well as the row: a launch with more than 3 threads
    // in y must not write beyond the 3 columns of f.
    if (i < imax && j < 3)
    {
        // Texel centres are at index + 0.5 in unnormalized coordinates, so
        // linear filtering at that point yields the stored value unblended.
        f[i][j] = tex2D(tex_w, j + 0.5f, i + 0.5f);
    }
}
// Prints rows 0 .. imax-1 of an Nx3 float array to stdout as
// "<index> <c0> <c1> <c2>", followed by a single empty line.
void print_to_stdio(int imax, float (*w)[3])
{
    float (*cursor)[3] = w;
    for (int row = 0; row < imax; ++row, ++cursor)
    {
        printf("%2d %3.3f %3.3f %3.3f\n", row, (*cursor)[0], (*cursor)[1], (*cursor)[2]);
    }
    printf("\n");
}
// Host driver: copies an imax x 3 float array into pitched device memory,
// binds that memory to the 2D texture tex_w, runs the kernel that reads the
// texture into a second device array, and prints the data before and after.
// All CUDA runtime calls go through HANDLE_ERROR (defined outside this block).
// Returns 0 on success, 1 if the host allocation fails.
int main(void)
{
    const int imax = 8;                         // number of rows
    const size_t rowBytes = 3 * sizeof(float);  // unpadded row width in bytes
    float (*w)[3];
    float (*d_f)[3], *d_w;
    dim3 block(imax, 3);                        // one thread per array element

    w = (float (*)[3])malloc(imax * rowBytes);
    if (w == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (int i = 0; i < imax; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            w[i][j] = i + 0.01f * j;
        }
    }
    print_to_stdio(imax, w);

    // Texture attributes are set before the bind, matching the order of the
    // CUDA Programming Guide's texture-reference example; values changed
    // after binding are not guaranteed to take effect for that binding.
    tex_w.normalized = false;                     // address texels by index
    tex_w.filterMode = cudaFilterModeLinear;
    tex_w.addressMode[0] = cudaAddressModeClamp;  // don't wrap around indices
    tex_w.addressMode[1] = cudaAddressModeClamp;

    // cudaMallocPitch returns a row pitch that meets the alignment rules for
    // 2D texture binding.
    size_t pitch;
    HANDLE_ERROR( cudaMallocPitch((void**)&d_w, &pitch, rowBytes, imax) );
    HANDLE_ERROR( cudaMemcpy2D(d_w,       // device destination
                               pitch,     // device pitch (calculated above)
                               w,         // src on host
                               rowBytes,  // pitch on src (no padding so just width of row)
                               rowBytes,  // width of data in bytes
                               imax,      // height of data
                               cudaMemcpyHostToDevice) );

    // width = 3 texels, height = imax texels, rows `pitch` bytes apart
    HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, tex_w.channelDesc, 3, imax, pitch) );

    // d_f will have result array
    HANDLE_ERROR( cudaMalloc((void**)&d_f, imax * rowBytes) );

    // just use threads for simplicity: a single block of imax x 3 threads
    kernel<<<1, block>>>(imax, d_f);
    HANDLE_ERROR( cudaGetLastError() );  // report launch-configuration errors

    // Blocking copy: also waits for the kernel to finish before reading d_f.
    HANDLE_ERROR( cudaMemcpy(w, d_f, imax * rowBytes, cudaMemcpyDeviceToHost) );

    HANDLE_ERROR( cudaUnbindTexture(tex_w) );
    HANDLE_ERROR( cudaFree(d_w) );
    HANDLE_ERROR( cudaFree(d_f) );

    print_to_stdio(imax, w);
    free(w);
    return 0;
}