设置用于线性插值的CUDA 2D“unsigned char”纹理_Cuda_Textures_Interpolation

设置用于线性插值的CUDA 2D“unsigned char”纹理

cuda

“设置CUDA 2D”；“无符号字符”；用于线性插值的纹理,cuda,textures,interpolation,Cuda,Textures,Interpolation,我有一个表示2D数组的无符号字符的线性数组。我想将其放入CUDA 2D纹理中，并对其执行（浮点）线性插值，即，让纹理调用获取4个最近的无符号字符邻域，在内部将它们转换为浮点，在它们之间进行插值，并返回结果的浮点值我在设置纹理并将其绑定到纹理引用时遇到一些困难。我已经阅读了CUDA参考手册和附录，但我没有任何运气下面是设置和绑定1）浮点纹理和2）无符号字符纹理的可运行代码。浮点代码运行良好。但是，如果取消注释底部的两个已注释的无符号字符行，则会引发“invalid argument”错误 #i

我有一个表示2D数组的无符号字符的线性数组。我想将其放入CUDA 2D纹理中，并对其执行（浮点）线性插值，即，让纹理调用获取4个最近的无符号字符邻域，在内部将它们转换为浮点，在它们之间进行插值，并返回结果的浮点值

我在设置纹理并将其绑定到纹理引用时遇到一些困难。我已经阅读了CUDA参考手册和附录，但我没有任何运气

下面是设置和绑定1）浮点纹理和2）无符号字符纹理的可运行代码。浮点代码运行良好。但是，如果取消注释底部的两个已注释的无符号字符行，则会引发“invalid argument”错误

#include <cstdio>
#include <cuda_runtime.h>

// Shorthand for the 8-bit texel type used by the second texture.
typedef unsigned char uchar;

// File-scope texture references, one per element type. Both are declared
// with cudaReadModeNormalizedFloat; for the unsigned char texture this is
// what lets a fetch return an interpolated float rather than a raw byte.
// NOTE(review): the original author states this read mode is mandatory for
// integer ("ordinal") textures that are to be filtered -- confirm against
// the CUDA texture documentation.
texture<float, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefFloat;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefUChar;

// Dimensions of the (row-major) 2D images backing the textures.
// With these sizes an unpadded float row is 4000 bytes and an unpadded
// uchar row is 1000 bytes.
size_t const WIDTH  = 1000;
size_t const HEIGHT = 1000;
// Total number of texels in one image.
size_t const TOT_PIX = WIDTH*HEIGHT;

// Allocates one float and one unsigned char device buffer, binds the float
// buffer to its 2D texture reference and reports any binding error.
//
// Returns 0 on success and -1 (after printing a message) on any CUDA error.
// Unlike the first draft, allocation results are checked and every device
// resource is released before returning.
int main(void)
{
   // Set texel formats
   cudaChannelFormatDesc descFloat = cudaCreateChannelDesc<float>();
   cudaChannelFormatDesc descUChar = cudaCreateChannelDesc<uchar>();

   // Choose to perform texture 2D linear interpolation
   texRefFloat.filterMode = cudaFilterModeLinear;
   texRefUChar.filterMode = cudaFilterModeLinear;

   // Allocate texture device memory; start from null so cleanup is safe
   // even when an allocation fails (cudaFree accepts a null pointer).
   float * d_buffFloat = 0;
   uchar * d_buffUChar = 0;

   cudaError_t errFloat = cudaMalloc(&d_buffFloat, sizeof(float)*TOT_PIX);
   cudaError_t errUChar = cudaMalloc(&d_buffUChar, sizeof(uchar)*TOT_PIX);

   if (errFloat != cudaSuccess || errUChar != cudaSuccess)
   {
      printf("Error allocating texture device memory: %s\n",
          cudaGetErrorString(errFloat != cudaSuccess ? errFloat : errUChar));
      cudaFree(d_buffFloat);
      cudaFree(d_buffUChar);
      return -1;
   }

   // Bind texture references to textures
   errFloat = cudaBindTexture2D(0, texRefFloat, d_buffFloat, descFloat,
                  WIDTH, HEIGHT, sizeof(float)*WIDTH);
   bool const boundFloat = (errFloat == cudaSuccess);

   // Uncomment the following two lines for an error.
   // The row pitch passed here is sizeof(uchar)*WIDTH = 1000 bytes on a
   // plain cudaMalloc buffer; the bind is rejected with "invalid argument".
   // Allocating with cudaMallocPitch() and passing the returned pitch is
   // the documented way to obtain a row pitch suitable for 2D binding.
   //errUChar = cudaBindTexture2D(0, texRefUChar, d_buffUChar, descUChar,
   //               WIDTH, HEIGHT, sizeof(uchar)*WIDTH);

   // Check for errors during binding. Only the first failure is reported.
   int status = 0;

   if (errFloat != cudaSuccess)
   {
      printf("Error binding float texture reference: %s\n",
          cudaGetErrorString(errFloat));
      status = -1;
   }
   else if (errUChar != cudaSuccess)
   {
      printf("Error binding unsigned char texture reference: %s\n",
          cudaGetErrorString(errUChar));
      status = -1;
   }

   // Release resources on every path: unbind before freeing the memory
   // the texture reference points at. If the unsigned char bind above is
   // enabled and succeeds, texRefUChar must be unbound here as well.
   if (boundFloat)
   {
      cudaUnbindTexture(texRefFloat);
   }
   cudaFree(d_buffFloat);
   cudaFree(d_buffUChar);

   return status;
}

#包括
#包括
typedef无符号字符；
//定义（全局）纹理参考；必须使用“cudaReadModeNormalizedFloat”
//对于有序纹理
纹理纹理；
纹理特征；
//定义（行主）纹理的大小
尺寸常数宽度=1000；
尺寸常数高度=1000；
尺寸=宽度*高度；
内部主（空）
{
//设置texel格式
cudaChannelFormatDesc descFloat=cudaCreateChannelDesc（）；
cudaChannelFormatDesc descUChar=cudaCreateChannelDesc（）；
//选择以执行纹理2D线性插值
texRefFloat.filterMode=cudaFilterModeLinear；
texRefUChar.filterMode=cudaFilterModeLinear；
//分配纹理设备内存
浮点数*d_buffFloat；cudaMalloc（&d_buffFloat，sizeof（float）*TOT_PIX）；
uchar*d_buffUChar；cudaMalloc（&d_buffUChar，sizeof（uchar）*TOT_PIX）；
//将纹理引用绑定到纹理
cudaError\u t errFloat=cudaSuccess；
cudaError\u t errUChar=cudaSuccess；
errFloat=cudaBindTexture2D（0，texRefFloat，d_buffFloat，descFloat，
宽度、高度、尺寸（浮动）*宽度）；
//取消对以下两行的注释以显示错误
//errUChar=cudaBindTexture2D（0，texRefUChar，d_buffUChar，descUChar，
//宽度、高度、尺寸（uchar）*宽度）；
//在绑定期间检查错误
if（errFloat！=cudaSuccess）
{
printf（“绑定浮动纹理引用时出错：%s\n”，
cudaGetErrorString（errFloat））；
出口（-1）；
}
if（errUChar！=cudaSuccess）
{
printf（“绑定未签名字符纹理引用时出错：%s\n”，
cudaGetErrorString（errUChar））；
出口（-1）；
}
返回0；
}

任何帮助/见解都将不胜感激

Aaron

纹理的每一行都必须正确对齐。如果将纹理绑定到普通数组（与CUDA数组相反），通常无法保证这一点。要将普通内存绑定到2D纹理，您需要使用

cudaMallocPitch()

分配内存。这将设置行间距，使其适合绑定到纹理。请注意，将0作为第一个参数传递给纹理绑定API调用不是一个好的做法。此参数用于CUDA向应用程序返回偏移量。如果偏移量不为零，则需要在纹理访问期间将其添加到纹理坐标

下面是一个快速示例，演示如何从元素为

无符号字符的纹理中读取插值
#include <stdlib.h>
#include <stdio.h>

// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates `call` exactly once; if the result is not cudaSuccess, prints
// the file, line and CUDA error string to stderr and terminates the process
// with EXIT_FAILURE. The argument is parenthesised and the temporary has a
// reserved-looking name so that an expression mentioning a caller variable
// called `err` is not captured by the macro's own local.
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t cuda_safe_call_err_ = (call);                         \
    if (cudaSuccess != cuda_safe_call_err_) {                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__,                                  \
                 cudaGetErrorString(cuda_safe_call_err_) );           \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches.
// Place directly after a <<<...>>> launch. It first reads the last error
// recorded by the runtime (pre-launch failures), then blocks until the
// device is idle so that a failure during kernel execution is reported too.
// On any error it prints file, line and the CUDA error string to stderr and
// terminates the process with EXIT_FAILURE.
// Uses cudaDeviceSynchronize(), matching the rest of this program, instead
// of the deprecated cudaThreadSynchronize().
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t check_launch_err_ = cudaGetLastError();               \
    if (cudaSuccess != check_launch_err_) {                           \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__,                                  \
                 cudaGetErrorString(check_launch_err_) );             \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    check_launch_err_ = cudaDeviceSynchronize();                      \
    if (cudaSuccess != check_launch_err_) {                           \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__,                                  \
                 cudaGetErrorString(check_launch_err_) );             \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// File-scope 2D texture reference over unsigned char texels, read through
// cudaReadModeNormalizedFloat so that tex2D() yields a float. It is
// configured (filter mode, normalized flag) and bound in main() and sampled
// in kernel().
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;

// Debug kernel: samples the file-scope texture `tex` at one point per texel
// and prints the values as an m-by-n table.
//
//   m        number of rows to sample (texture height)
//   n        number of columns to sample (texture width)
//   shift_x  offset added to the x coordinate of every sample
//   shift_y  offset added to the y coordinate of every sample
//
// Each sample is taken at (col + 0.5 + shift_x, row + 0.5 + shift_y).
// Every thread that runs this kernel walks the whole table and prints it,
// so the host launches it with <<<1,1>>>.
__global__ void kernel (int m, int n, float shift_x, float shift_y)
{
    for (int y = 0; y < m; ++y) {
        // The y coordinate is the same for every sample in this row.
        const float coord_y = y + 0.5f + shift_y;
        for (int x = 0; x < n; ++x) {
            const float sample = tex2D (tex, x + 0.5f + shift_x, coord_y);
            printf ("%.2f  ", sample);
        }
        printf ("\n");
    }
}

// Demonstrates interpolated reads from a 2D texture of unsigned char.
//
// Copies a 4x3 host array into pitched device memory, binds it to `tex`
// with linear filtering and unnormalized coordinates, and launches the
// printing kernel three times: unshifted, shifted by 0.5 in x, and shifted
// by 0.5 in y. Returns EXIT_SUCCESS, or EXIT_FAILURE if the bind reports a
// non-zero texture offset; any CUDA error terminates the process through
// CUDA_SAFE_CALL / CHECK_LAUNCH_ERROR.
int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width  = #columns
    size_t pitch, tex_ofs;
    unsigned char arr[4][3]= {{11,12,13},{21,22,23},{31,32,33},{251,252,253}};
    unsigned char *arr_d = 0;

    // Pitched allocation: the runtime chooses the row pitch, which is then
    // used for both the copy and the texture bind.
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        // The kernel does not add an offset to its coordinates, so a
        // non-zero offset cannot be handled; release resources and give up.
        printf ("tex_ofs = %zu\n", tex_ofs);
        CUDA_SAFE_CALL (cudaUnbindTexture (&tex));
        CUDA_SAFE_CALL (cudaFree (arr_d));
        return EXIT_FAILURE;
    }
    printf ("reading array straight\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    printf ("reading array shifted in x-direction\n");
    kernel<<<1,1>>>(m, n, 0.5f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    printf ("reading array shifted in y-direction\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.5f);
    // Same launch check as the first two launches.
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    // Unbind before freeing the memory the texture reference points at.
    CUDA_SAFE_CALL (cudaUnbindTexture (&tex));
    CUDA_SAFE_CALL (cudaFree (arr_d));
    return EXIT_SUCCESS;
}

纹理的每一行都必须正确对齐。如果将纹理绑定到普通数组（与CUDA数组相反），通常无法保证这一点。要将普通内存绑定到2D纹理，需要使用cudaMallocPitch()分配内存。这将设置行间距，使其适合绑定到纹理。请注意，将0
作为第一个参数传递给纹理绑定API调用不是一种好的做法。此参数用于CUDA向应用程序返回偏移量。如果偏移量不为零，则需要在纹理访问期间将其添加到纹理坐标。哇！完整的可运行代码来说明你的答案！非常感谢！因此，在没有cudaMallocPitch()的情况下可以设置float纹理只是一种侥幸吗？我不知道纹理行的对齐要求是什么，但仅使用cudaMalloc()
就可能意外满足它。为便于讨论，假设所需的行对齐为32字节。那么每行1000个float
等于4000字节，能被32整除，因为32*125=4000。但1000个unsigned char=1000字节，不能被32整除。在Pascal架构上，纹理行对齐要求是512字节边界。在一些早期的体系结构上，这个数字更小，但从来没有小到32字节。（32字节的数字是可以进行合并内存访问的最小对齐方式。）
reading array straight
0.04  0.05  0.05
0.08  0.09  0.09
0.12  0.13  0.13
0.98  0.99  0.99
reading array shifted in x-direction
0.05  0.05  0.05
0.08  0.09  0.09
0.12  0.13  0.13
0.99  0.99  0.99
reading array shifted in y-direction
0.06  0.07  0.07
0.10  0.11  0.11
0.55  0.56  0.56
0.98  0.99  0.99