如何在CUDA中为一维数组使用纹理内存_Cuda

如何在CUDA中为一维数组使用纹理内存

cuda

如何在CUDA中为一维数组使用纹理内存,cuda,Cuda,我编写了以下代码以了解如何为1D数组使用纹理内存。但是tex1D函数没有从数组中获取对应线程id的值。请更正此代码，并告诉我如何高效地为1D数组使用纹理内存 __global__ void sum(float *b,cudaTextureObject_t texObj) { b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x); //printf("\n%f\n",tex1Dfetch<float>(te

我编写了以下代码以了解如何为1D数组使用纹理内存。但是tex1D函数没有从数组中获取对应线程id的值。请更正此代码，并告诉我如何高效地为1D数组使用纹理内存

// Kernel from the question: each thread samples the texture object with
// tex1D<float>, using its own thread index as the coordinate, and stores the
// fetched value into b at that same index.
//   b      - output buffer; indexed by threadIdx.x only, so a single-block
//            launch is assumed (the host code below launches <<<1,5>>>).
//   texObj - texture object to sample.
// NOTE(review): the host code below creates texObj with normalizedCoords = 1
// and cudaFilterModeLinear, while this kernel passes the raw integer thread
// index as the coordinate. This mismatch is what the question is about, so the
// code is intentionally left unchanged here.
__global__ void sum(float *b,cudaTextureObject_t texObj)

    {
    b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
    }
    // Host driver from the question: fills a 5-element float array with 0..4,
    // uploads it into a 1D cudaArray, wraps the array in a texture object,
    // launches sum<<<1,5>>> and prints five floats from the host buffer b.
    // No CUDA return code is checked anywhere in this function.
    int main()
    {
    // Host buffers; allocated with malloc below and never freed.
    float *a,*b;
    // Device pointers; d_a is declared but never used.
    float *d_a,*d_b;
    int i;
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);

    // Input data: a[i] = i.
    for(i=0;i<5;i++)
        a[i]=i;

    // Channel format: a single 32-bit floating-point component.
    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);

    // Allocate a cudaArray of width 5 and height 0 (1D).
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);

    // Upload the 5 host floats into the array, starting at offset (0, 0).
    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);


    // Resource descriptor: the texture is backed by cuArray.
    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;


      // Texture descriptor: wrap addressing, linear filtering, element-type
      // reads, and normalized coordinates enabled.
      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);


    // Device output buffer for 5 floats.
    cudaMalloc(&d_b, 5* sizeof(float));

    // One block of 5 threads; each thread writes one element of d_b.
    sum<<<1,5>>>(d_b,texObj);



        // Copy the result back to the host. Only sizeof(float) bytes (a single
        // element) are copied here, although five elements are printed below.
    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);

     // Print five elements of b, tab-separated.
     for(i=0;i<5;i++)
        printf("%f\t",b[i]);
      // Release the texture object, the array and the device buffer.
      cudaDestroyTextureObject(texObj); 
    cudaFreeArray(cuArray);
    cudaFree(d_b);

        return 0;

    }

__global__ void sum(float *b,cudaTextureObject_t texObj)
{
b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
//printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
}
int main()
{
float *a,*b;
float *d_a,*d_b;
int i;
a=(float*)malloc(sizeof(float)*5);
b=(float*)malloc(sizeof(float)*5);
for(i=0;i<5;i++)
……（后续代码与上文相同）

至少存在两个问题：
最后，您只把一个 float 值从设备复制回了主机：
cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                 ^^^^^^^^^^^^^

如果要打印5个值，应将5个值复制回：
cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);


您选择了归一化坐标：
texDesc.normalizedCoords = 1;
这意味着您应该传递 0 到 1 之间的浮点坐标作为索引，而不是 0 到 4 之间的整数坐标：
 b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                    ^^^^^^^^^^^

b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                   ^^^^^^^^^^^

改用类似的方式：
 b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));

b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));

通过这些更改，我得到了合理的结果。下面是一个完整的代码：
$ cat t3.cu
#include <stdio.h>

// Samples the texture once per thread and stores the result in b.
//
// Launch layout: a single 1D block; only threadIdx.x is used for indexing, so
// b must hold at least blockDim.x floats.
//   b      - output buffer, one float written per thread.
//   texObj - texture object sampled with tex1D<float>.
//
// Thread i samples at coordinate (i + 1) / 5. With the texture configured in
// main (normalized coordinates, linear filtering, texels holding 0..4) the
// program prints 0.5, 1.5, 2.5, 3.5 for a 4-thread launch.
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
    const unsigned int tid = threadIdx.x;

    // The 5.0f divisor matches the 5-texel array created by the host code.
    const float coord = (float)(tid + 1) / 5.0f;

    b[tid] = tex1D<float>(texObj, coord);
}


// Host driver for the 1D texture example.
//
// Fills a 5-element float array with 0..4, uploads it into a 1D cudaArray,
// binds the array to a texture object (wrap addressing, linear filtering,
// normalized coordinates), launches sum with one thread per sampled point and
// prints the sampled values on one tab-separated line.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call fails; on
// failure a diagnostic is written to stderr. All host and device resources are
// released on every path.
int main()
{
    const int    N        = 5;                 // texels in the 1D array
    const int    nThreads = N - 1;             // points sampled by the kernel
    const size_t nBytes   = N * sizeof(float);

    // Everything is declared up front so that the `goto cleanup` error paths
    // never jump across an initialization.
    int rc = 1;
    int i;
    float *a = NULL, *b = NULL;
    float *d_b = NULL;
    cudaArray *cuArray = NULL;
    cudaTextureObject_t texObj = 0;
    cudaError_t err = cudaSuccess;
    const char *what = "";
    // Channel format: a single 32-bit floating-point component.
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    struct cudaResourceDesc resDesc;
    struct cudaTextureDesc texDesc;

    a = (float*)malloc(nBytes);
    b = (float*)malloc(nBytes);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        goto cleanup;
    }

    for (i = 0; i < N; i++)
        a[i] = (float)i;

    // 1D array: width N, height 0.
    what = "cudaMallocArray";
    err = cudaMallocArray(&cuArray, &channelDesc, N, 0);
    if (err != cudaSuccess) goto cleanup;

    // cudaMemcpyToArray is deprecated; a single-row 2D copy is the equivalent.
    what = "cudaMemcpy2DToArray";
    err = cudaMemcpy2DToArray(cuArray, 0, 0, a, nBytes, nBytes, 1,
                              cudaMemcpyHostToDevice);
    if (err != cudaSuccess) goto cleanup;

    // The texture is backed by cuArray.
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    // normalizedCoords = 1: the kernel must pass coordinates in [0, 1).
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeWrap;
    texDesc.addressMode[1]   = cudaAddressModeWrap;
    texDesc.filterMode       = cudaFilterModeLinear;
    texDesc.readMode         = cudaReadModeElementType;
    texDesc.normalizedCoords = 1;

    what = "cudaCreateTextureObject";
    err = cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
    if (err != cudaSuccess) goto cleanup;

    what = "cudaMalloc";
    err = cudaMalloc(&d_b, nBytes);
    if (err != cudaSuccess) goto cleanup;

    // The kernel writes only nThreads elements; zero the buffer so the full
    // N-element copy below never reads uninitialized device memory.
    what = "cudaMemset";
    err = cudaMemset(d_b, 0, nBytes);
    if (err != cudaSuccess) goto cleanup;

    sum<<<1, nThreads>>>(d_b, texObj);
    what = "kernel launch";
    err = cudaGetLastError();
    if (err != cudaSuccess) goto cleanup;

    // Copy the results back to the host. This blocking copy on the default
    // stream also waits for the kernel and reports any execution error.
    what = "cudaMemcpy";
    err = cudaMemcpy(b, d_b, nBytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) goto cleanup;

    for (i = 0; i < nThreads; i++)
        printf("%f\t", b[i]);
    printf("\n");
    rc = 0;

cleanup:
    if (err != cudaSuccess)
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    if (texObj != 0)
        cudaDestroyTextureObject(texObj);
    if (cuArray != NULL)
        cudaFreeArray(cuArray);
    if (d_b != NULL)
        cudaFree(d_b);
    free(b);
    free(a);

    return rc;
}
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000        1.500000        2.500000        3.500000
========= ERROR SUMMARY: 0 errors
$

$cat t3.cu
#include <stdio.h>
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
b[threadIdx.x]=tex1D<float>(texObj,((float)(threadIdx.x+1)/5.0f));
//printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
}
int main()
{
float *a,*b;
float *d_b;
int i;
a=(float*)malloc(sizeof(float)*5);
b=(float*)malloc(sizeof(float)*5);
for(i=0;i<5;i++)
……（后续代码与上文相同）

至少存在两个问题：
最后，您只把一个 float 值从设备复制回了主机：
cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                 ^^^^^^^^^^^^^

如果要打印5个值，应将5个值复制回：
cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);


您选择了归一化坐标：
texDesc.normalizedCoords = 1;
这意味着您应该传递 0 到 1 之间的浮点坐标作为索引，而不是 0 到 4 之间的整数坐标：
 b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                    ^^^^^^^^^^^

b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                   ^^^^^^^^^^^

改用类似的方式：
 b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));

b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));

通过这些更改，我得到了合理的结果。下面是一个完整的代码：
$ cat t3.cu
#include <stdio.h>

// Samples the texture once per thread and stores the result in b.
//
// Launch layout: a single 1D block; only threadIdx.x is used for indexing, so
// b must hold at least blockDim.x floats.
//   b      - output buffer, one float written per thread.
//   texObj - texture object sampled with tex1D<float>.
//
// Thread i samples at coordinate (i + 1) / 5. With the texture configured in
// main (normalized coordinates, linear filtering, texels holding 0..4) the
// program prints 0.5, 1.5, 2.5, 3.5 for a 4-thread launch.
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
    const unsigned int tid = threadIdx.x;

    // The 5.0f divisor matches the 5-texel array created by the host code.
    const float coord = (float)(tid + 1) / 5.0f;

    b[tid] = tex1D<float>(texObj, coord);
}


// Host driver for the 1D texture example.
//
// Fills a 5-element float array with 0..4, uploads it into a 1D cudaArray,
// binds the array to a texture object (wrap addressing, linear filtering,
// normalized coordinates), launches sum with one thread per sampled point and
// prints the sampled values on one tab-separated line.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call fails; on
// failure a diagnostic is written to stderr. All host and device resources are
// released on every path.
int main()
{
    const int    N        = 5;                 // texels in the 1D array
    const int    nThreads = N - 1;             // points sampled by the kernel
    const size_t nBytes   = N * sizeof(float);

    // Everything is declared up front so that the `goto cleanup` error paths
    // never jump across an initialization.
    int rc = 1;
    int i;
    float *a = NULL, *b = NULL;
    float *d_b = NULL;
    cudaArray *cuArray = NULL;
    cudaTextureObject_t texObj = 0;
    cudaError_t err = cudaSuccess;
    const char *what = "";
    // Channel format: a single 32-bit floating-point component.
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    struct cudaResourceDesc resDesc;
    struct cudaTextureDesc texDesc;

    a = (float*)malloc(nBytes);
    b = (float*)malloc(nBytes);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        goto cleanup;
    }

    for (i = 0; i < N; i++)
        a[i] = (float)i;

    // 1D array: width N, height 0.
    what = "cudaMallocArray";
    err = cudaMallocArray(&cuArray, &channelDesc, N, 0);
    if (err != cudaSuccess) goto cleanup;

    // cudaMemcpyToArray is deprecated; a single-row 2D copy is the equivalent.
    what = "cudaMemcpy2DToArray";
    err = cudaMemcpy2DToArray(cuArray, 0, 0, a, nBytes, nBytes, 1,
                              cudaMemcpyHostToDevice);
    if (err != cudaSuccess) goto cleanup;

    // The texture is backed by cuArray.
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    // normalizedCoords = 1: the kernel must pass coordinates in [0, 1).
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeWrap;
    texDesc.addressMode[1]   = cudaAddressModeWrap;
    texDesc.filterMode       = cudaFilterModeLinear;
    texDesc.readMode         = cudaReadModeElementType;
    texDesc.normalizedCoords = 1;

    what = "cudaCreateTextureObject";
    err = cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
    if (err != cudaSuccess) goto cleanup;

    what = "cudaMalloc";
    err = cudaMalloc(&d_b, nBytes);
    if (err != cudaSuccess) goto cleanup;

    // The kernel writes only nThreads elements; zero the buffer so the full
    // N-element copy below never reads uninitialized device memory.
    what = "cudaMemset";
    err = cudaMemset(d_b, 0, nBytes);
    if (err != cudaSuccess) goto cleanup;

    sum<<<1, nThreads>>>(d_b, texObj);
    what = "kernel launch";
    err = cudaGetLastError();
    if (err != cudaSuccess) goto cleanup;

    // Copy the results back to the host. This blocking copy on the default
    // stream also waits for the kernel and reports any execution error.
    what = "cudaMemcpy";
    err = cudaMemcpy(b, d_b, nBytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) goto cleanup;

    for (i = 0; i < nThreads; i++)
        printf("%f\t", b[i]);
    printf("\n");
    rc = 0;

cleanup:
    if (err != cudaSuccess)
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    if (texObj != 0)
        cudaDestroyTextureObject(texObj);
    if (cuArray != NULL)
        cudaFreeArray(cuArray);
    if (d_b != NULL)
        cudaFree(d_b);
    free(b);
    free(a);

    return rc;
}
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000        1.500000        2.500000        3.500000
========= ERROR SUMMARY: 0 errors
$

$cat t3.cu
#include <stdio.h>
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
b[threadIdx.x]=tex1D<float>(texObj,((float)(threadIdx.x+1)/5.0f));
//printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
}
int main()
{
float *a,*b;
float *d_b;
int i;
a=(float*)malloc(sizeof(float)*5);
b=(float*)malloc(sizeof(float)*5);
for(i=0;i<5;i++)
……（后续代码与上文相同）

欢迎来到 Stack Overflow。虽然我们可以指出代码中的明显错误，但我们不是调试服务。请考虑阅读一些有助于您自行解决问题的资料，或者把问题缩小到足够具体、适合本站的范围。