纹理提取比直接全局访问慢(《CUDA by Example》一书第7章)
我正在阅读和测试《CUDA by Example:通用GPU编程简介》一书中的示例。在测试第7章中关于纹理内存的示例时,我发现通过纹理缓存访问全局内存比直接访问慢得多(我的NVIDIA GPU是GeForce GTX 260,计算能力1.3,使用的是NVIDIA CUDA 4.2)。标签:cuda、textures。我已经反复检查了几次代码,并且一直在阅读SDK附带的“CUDA C编程指南”等文档(原文在此处被截断)。测得的每帧时间如下:
- 256*256图像的纹理提取(1D或2D)每帧时间:93毫秒
- 不使用纹理的每帧时间(仅直接全局访问)256*256:8.5毫秒
// Build-time switches (both disabled by default; uncomment to enable):
//   TEXTURE  - blend_kernel reads its input through the 1D texture references
//              texIn/texOut instead of through a global-memory pointer.
//   TEXTURE2 - copy_const_kernel additionally reads the constant grid through
//              texConstSrc instead of through a pointer argument.
//#define TEXTURE
//#define TEXTURE2
// TEXTURE2 on its own cannot build: texConstSrc is declared only under TEXTURE,
// and the host loop only issues the one-argument copy_const_kernel launch inside
// its TEXTURE branch. Fail early with a readable message instead of a cascade of
// unrelated compiler errors.
#if defined(TEXTURE2) && !defined(TEXTURE)
#error "TEXTURE2 requires TEXTURE to be defined as well"
#endif
#ifdef TEXTURE
// According to C programming guide, it should be static (3.2.10.1.1)
static texture<float> texConstSrc;
static texture<float> texIn;
static texture<float> texOut;
#endif
// Copies every non-zero cell of the constant grid into iptr, leaving all other
// cells of iptr untouched.
//
// Launch layout: 2D grid of 2D blocks, one thread per cell. The linear cell
// index is row-major with a row stride of blockDim.x * gridDim.x. The kernel
// performs no bounds check, so the launch must not cover more cells than the
// buffers hold.
//
// With TEXTURE2 the constant grid is fetched through texConstSrc; otherwise it
// is read from the cptr argument.
#ifdef TEXTURE2
__global__ void copy_const_kernel( float *iptr ) {
#else
__global__ void copy_const_kernel( float *iptr, const float *cptr ) {
#endif
    // Pixel position of this thread inside the launch.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowStride = blockDim.x * gridDim.x;
    const int cell = col + row * rowStride;

#ifdef TEXTURE2
    const float value = tex1Dfetch( texConstSrc, cell );
#else
    const float value = cptr[cell];
#endif

    // A zero entry means "no constant here": keep whatever iptr already holds.
    if ( value == 0 ) {
        return;
    }
    iptr[cell] = value;
}
// Performs one relaxation step on the grid: every cell becomes
//     c + SPEED * (top + bottom + left + right - 4 * c)
// where c is the cell's current value and the neighbours are clamped to the
// cell itself on the image border.
//
// Launch layout: 2D grid of 2D blocks, one thread per cell, no bounds check.
// The linear index is row-major with a row stride of blockDim.x * gridDim.x,
// which is expected to equal SXRES (the border test below uses SXRES).
//
// Parameters:
//   outSrc - destination buffer for the updated cells.
//   dstOut - (TEXTURE build) selects the texture to read from: texIn when
//            true, texOut when false.
//   inSrc  - (non-TEXTURE build) source buffer holding the current cells.
#ifdef TEXTURE
__global__ void blend_kernel( float *outSrc, bool dstOut ) {
#else
__global__ void blend_kernel( float *outSrc, const float *inSrc ) {
#endif
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // Horizontal neighbours, clamped at the left/right image border.
    int left = offset - 1;
    int right = offset + 1;
    if (x == 0) left++;
    if (x == SXRES-1) right--;

    // Vertical neighbours are one full row away. A row is SXRES cells wide, so
    // the step must be the image width; stepping by the height (SYRES) only
    // happens to work for square images.
    int top = offset - SXRES;
    int bottom = offset + SXRES;
    if (y == 0) top += SXRES;
    if (y == SYRES-1) bottom -= SXRES;

#ifdef TEXTURE
    float t, l, c, r, b;
    if (dstOut) {
        t = tex1Dfetch(texIn,top);
        l = tex1Dfetch(texIn,left);
        c = tex1Dfetch(texIn,offset);
        r = tex1Dfetch(texIn,right);
        b = tex1Dfetch(texIn,bottom);
    } else {
        t = tex1Dfetch(texOut,top);
        l = tex1Dfetch(texOut,left);
        c = tex1Dfetch(texOut,offset);
        r = tex1Dfetch(texOut,right);
        b = tex1Dfetch(texOut,bottom);
    }
    outSrc[offset] = c + SPEED * (t + b + r + l - 4 * c);
#else
    outSrc[offset] = inSrc[offset] + SPEED * ( inSrc[top] +
                     inSrc[bottom] + inSrc[left] + inSrc[right] -
                     inSrc[offset]*4);
#endif
}
// State shared with the per-frame update routine (anim_gpu).
struct DataBlock {
    unsigned char *output_bitmap;
    // Grid buffers handed to the kernels by anim_gpu.
    float         *dev_inSrc;
    float         *dev_outSrc;
    float         *dev_constSrc;
    // Timing events; anim_gpu records `start` before launching any kernel.
    cudaEvent_t    start;
    cudaEvent_t    stop;
    // Running totals; presumably accumulated by the timing code — verify
    // against the part of the update routine that reads the events.
    float          totalTime;
    float          frames;
    unsigned int   size;
    unsigned char *output_host;
};
// Advances the simulation by 90 steps. Each step first re-applies the constant
// grid (copy_const_kernel) and then relaxes the grid once (blend_kernel).
//
// Parameters:
//   d     - shared state: grid buffers and timing events.
//   ticks - not used by the code shown here.
//
// Launch layout: 16x16 threads per block and SXRES/16 x SYRES/16 blocks. The
// division truncates and the kernels do no bounds checking, so both
// resolutions are expected to be multiples of 16.
//
// Every launch is followed by a cudaGetLastError() check so that a bad launch
// configuration is reported at the launch site rather than surfacing later as
// an unrelated failure. The check does not synchronise with the device.
void anim_gpu( DataBlock *d, int ticks ) {
    checkCudaErrors( cudaEventRecord( d->start, 0 ) );
    dim3 blocks(SXRES/16,SYRES/16);
    dim3 threads(16,16);
#ifdef TEXTURE
    // Selects which buffer is read and which is written in the current step;
    // flipped after every step. It is an ordinary host-side local, so it does
    // not need to be volatile.
    bool dstOut = true;
#endif
    for (int i=0; i<90; i++) {
#ifdef TEXTURE
        float *in, *out;
        if (dstOut) {
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        } else {
            out = d->dev_inSrc;
            in = d->dev_outSrc;
        }
#ifdef TEXTURE2
        copy_const_kernel<<<blocks,threads>>>( in );
#else
        copy_const_kernel<<<blocks,threads>>>( in,
                                               d->dev_constSrc );
#endif
        checkCudaErrors( cudaGetLastError() );
        blend_kernel<<<blocks,threads>>>( out, dstOut );
        checkCudaErrors( cudaGetLastError() );
        dstOut = !dstOut;
#else
        copy_const_kernel<<<blocks,threads>>>( d->dev_inSrc,
                                               d->dev_constSrc );
        checkCudaErrors( cudaGetLastError() );
        blend_kernel<<<blocks,threads>>>( d->dev_outSrc,
                                          d->dev_inSrc );
        checkCudaErrors( cudaGetLastError() );
        // The freshly written buffer becomes the input of the next step.
        swap( d->dev_inSrc, d->dev_outSrc );
#endif
    }
    // Some stuff for the events
    // ...
}
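//#define TEXTURE
//#define TEXTURE2
#ifdef TEXTURE
// According to C programming guide, it should be static (3.2.10.1.1)
static texture<float> texConstSrc;
static texture<float> texIn;
static texture<float> texOut;
#endif
__global__ void copy_const_kernel( float *iptr
#ifdef TEXTURE2
){
#else
,const float *cptr ) {
#endif
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
#ifdef TEXTURE2
float c = tex1Dfetch(texConstSrc,offset);
#else
float c = cptr[offset];
#endif
if ( c != 0) iptr[offset] = c;
}
__global__ void blend_kernel( float *outSrc,
#ifdef TEXTURE
bool dstOut ) {
#else
const float *inSrc ) {
#endif
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
int left = offset - 1;
int right = offset + 1;
if (x == 0) left++;
if (x == SXRES-1) right--;
int top = offset - SYRES;
int bottom = offset + SYRES;
if (y == 0) top += SYRES;
if (y == SYRES-1) bottom -= SYRES;
#ifdef TEXTURE
float t, l, c, r, b;
if (dstOut) {
t = tex1Dfetch(texIn,top);
l = tex1Dfetch(texIn,left);
c = tex1Dfetch(texIn,offset);
r = tex1Dfetch(texIn,right);
b = tex1Dfetch(texIn,bottom);
} else {
t = tex1Dfetch(texOut,top);
l = tex1Dfetch(texOut,left);
c = tex1Dfetch(texOut,offset);
r = tex1Dfetch(texOut,right);
b = tex1Dfetch(texOut,bottom);
}
outSrc[offset] = c + SPEED * (t + b + r + l - 4 * c);
#else
outSrc[offset] = inSrc[offset] + SPEED * ( inSrc[top] +
inSrc[bottom] + inSrc[left] + inSrc[right] -
inSrc[offset]*4);
#endif
}
// globals needed by the update routine
struct DataBlock {
unsigned char *output_bitmap;
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
cudaEvent_t start, stop;
float totalTime;
float frames;
unsigned size;
unsigned char *output_host;
};
void anim_gpu( DataBlock *d, int ticks ) {
checkCudaErrors( cudaEventRecord( d->start, 0 ) );
dim3 blocks(SXRES/16,SYRES/16);
dim3 threads(16,16);
#ifdef TEXTURE
volatile bool dstOut = true;
#endif
for (int i=0; i<90; i++) {
#ifdef TEXTURE
float *in, *out;
if (dstOut) {
in = d->dev_inSrc;
out = d->dev_outSrc;
} else {
out = d->dev_inSrc;
in = d->dev_outSrc;
}
#ifdef TEXTURE2
copy_const_kernel<<<blocks,threads>>>( in );
#else
copy_const_kernel<<<blocks,threads>>>( in,
d->dev_constSrc );
#endif
blend_kernel<<<blocks,threads>>>( out, dstOut );
dstOut = !dstOut;
#else
copy_const_kernel<<<blocks,threads>>>( d->dev_inSrc,
d->dev_constSrc );
blend_kernel<<<blocks,threads>>>( d->dev_outSrc,
d->dev_inSrc );
swap( d->dev_inSrc, d->dev_outSrc );
#endif
}
// Some stuff for the events
// ...
}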
我一直在用nvvp(NVIDIA探查器)测试结果
结果非常奇怪,因为它们显示存在大量纹理缓存未命中(这可能是性能不佳的原因)。
探查器的结果中还有一些信息,即使对照《CUPTI Users Guide》也难以理解:
- tex_cache_hit:纹理缓存命中次数(在计算能力1.3的设备上,只统计一个SM上的数据)
- tex_cache_miss:纹理缓存未命中次数(在计算能力1.3的设备上,只统计一个SM上的数据)