Memory 如何访问内核中的常量内存？_Memory_Cuda

Memory 如何访问内核中的常量内存？

memory cuda

Memory 如何访问内核中的常量内存？,memory,cuda,Memory,Cuda,我无法访问常量内存中的数据，也不知道原因。以下是我的代码片段： #define N 10 __constant__ int constBuf_d[N]; __global__ void foo( int *results, int *constBuf ) { int tdx = threadIdx.x; int idx = blockIdx.x * blockDim.x + tdx; if( idx < N ) { results

我无法访问常量内存中的数据，也不知道原因。以下是我的代码片段：

// Number of int elements in the constant buffer and in the results array.
#define N 10
// N-element int array declared with the __constant__ qualifier; the host
// code below tries to fill it and the kernels read from it.
__constant__ int constBuf_d[N];

// Copies the first N ints from constBuf into results, one element per thread.
//   results  - destination array with room for at least N ints
//   constBuf - source array; it is dereferenced through an ordinary pointer
//              parameter, which is why the PTX shown later in this page uses
//              ld.global for the load
// Expects a 1D launch; threads whose flat index is N or larger do nothing.
__global__ void foo( int *results, int *constBuf )
{
    // Flat global index of this thread within the 1D grid.
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    // Surplus threads (the launch may be larger than N) exit immediately.
    if( gid >= N )
        return;

    results[gid] = constBuf[gid];
}

// Host entry point: allocates a device results buffer, attempts to upload
// `arr` into the __constant__ array constBuf_d, launches foo, then copies the
// results back and prints them.
// NOTE(review): this is the failing version from the question; the defects
// are marked inline and the code is intentionally left unchanged.
int main(int argc, char* argv[])
{
    // Host and device buffers for the N output values.
    // NOTE(review): neither buffer is released (no delete[] / cudaFree).
    int *results_h = new int[N];
    int *results_d = NULL;

    // Return code is not checked.
    cudaMalloc((void **)&results_d, N*sizeof(int));

    // Values intended for constBuf_d.
    int arr[10] = { 16, 2, 77, 40, 12, 3, 5, 3, 6, 6 };

    // Look up the device address of constBuf_d using its name as a string.
    // NOTE(review): the CUDA 5 release note quoted later in this page says
    // string symbol names are no longer supported.
    int *cpnt;
    cudaError_t err = cudaGetSymbolAddress((void **)&cpnt, "constBuf_d");

    // NOTE(review): `cout` is used unqualified; confirm <iostream> and a
    // using-declaration exist in the part of the file not shown here.
    if( err )
        cout << "error!";

    // BUG: the first argument is the address of the host pointer variable
    // `cpnt`, not the symbol. Per the answer below, this call fails with an
    // "invalid device symbol" error; the return code is ignored, so the
    // failure goes unnoticed and constBuf_d is never filled from `arr`.
    cudaMemcpyToSymbol((void**)&cpnt, arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);

    // One block of 256 threads; only threads with idx < N write a result.
    // No error check follows the launch.
    foo <<< 1, 256 >>> ( results_d, cpnt );

    // Copy the N results back to the host (return code not checked).
    cudaMemcpy(results_h, results_d, N*sizeof(int), cudaMemcpyDeviceToHost);

    // Print the N results separated by spaces.
    for( int i=0; i < N; ++i )
        printf("%i ", results_h[i] );
}

#define N 10
__constant__ int constBuf_d[N];
__global__ void foo( int *results, int *constBuf )
{
    int tdx = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tdx;
    if( idx < N )
    {
         results[idx] = constBuf[idx];
    }
}


不知为何，结果中得到的全是“0”。我使用的是CUDA 4.0，显卡的计算能力为1.1。
有什么想法吗？谢谢
 如果在代码中添加正确的错误检查，您会发现cudaMemcpyToSymbol由于“无效的设备符号”（invalid device symbol）错误而失败。您需要按名称传递符号，或者改用cudaMemcpy。因此，下面的写法：
cudaGetSymbolAddress((void **)&cpnt, "constBuf_d");
cudaMemcpy(cpnt, arr, N*sizeof(int), cudaMemcpyHostToDevice); 

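或
cudaMemcpyToSymbol("constBuf_d", arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);

或
cudaMemcpyToSymbol(constBuf_d, arr, N*sizeof(int), 0, cudaMemcpyHostToDevice);

都可以正常工作。话虽如此，将常量内存地址作为参数传递给内核是使用常量内存的错误方式——它会阻止编译器生成通过常量内存缓存访问内存的指令。比较一下针对计算能力1.2为该内核生成的PTX：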
    // PTX for foo(int *results, int *constBuf): both arrays arrive as
    // 32-bit pointer parameters.
    .entry _Z3fooPiS_ (
        .param .u32 __cudaparm__Z3fooPiS__results,
        .param .u32 __cudaparm__Z3fooPiS__constBuf)
    {
    .reg .u16 %rh<4>;
    .reg .u32 %r<12>;
    .reg .pred %p<3>;
    .loc    16  7   0
$LDWbegin__Z3fooPiS_:
    // %r3 = %ctaid.x * %ntid.x + %tid.x  (idx = blockIdx.x * blockDim.x + threadIdx.x)
    mov.u16     %rh1, %ctaid.x;
    mov.u16     %rh2, %ntid.x;
    mul.wide.u16    %r1, %rh1, %rh2;
    cvt.s32.u16     %r2, %tid.x;
    add.u32     %r3, %r2, %r1;
    // Skip the copy when idx > 9, i.e. when idx >= N with N = 10.
    mov.u32     %r4, 9;
    setp.gt.s32     %p1, %r3, %r4;
    @%p1 bra    $Lt_0_1026;
    .loc    16  14  0
    // %r5 = idx * 4, the byte offset of element idx.
    mul.lo.u32  %r5, %r3, 4;
    // The source address comes from the constBuf parameter, so the element
    // is loaded with ld.global rather than ld.const.
    ld.param.u32    %r6, [__cudaparm__Z3fooPiS__constBuf];
    add.u32     %r7, %r6, %r5;
    ld.global.s32   %r8, [%r7+0];
    // Store the loaded value to results[idx].
    ld.param.u32    %r9, [__cudaparm__Z3fooPiS__results];
    add.u32     %r10, %r9, %r5;
    st.global.s32   [%r10+0], %r8;
$Lt_0_1026:
    .loc    16  16  0
    exit;
$LDWend__Z3fooPiS_:
    } // _Z3fooPiS_

.entry _Z3fooPiS_ (
    .param .u32 __cudaparm__Z3fooPiS__results,
    .param .u32 __cudaparm__Z3fooPiS__constBuf)
{
.reg .u16 %rh<4>;
.reg .u32 %r<12>;
.reg .pred %p<3>;
.loc 16 7 0
$LDWbegin__Z3fooPiS_:
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.s32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
mov.u32 %r4, 9;
setp.gt.s32 %p1, %r3, %r4;
@%p1 bra $Lt_0_1026;
.loc 16 14 0
mul.lo.u32 %r5, %r3, 4;
ld.param.u32 %r6, [__cudaparm__Z3fooPiS__constBuf];
add.u32 %r7, %r6, %r5;
ld.global.s32 %r8, [%r7+0];
ld.param.u32 %r9, [__cudaparm__Z3fooPiS__results];
add.u32 %r10, %r9, %r5;
st.global.s32 [%r10+0], %r8;
$Lt_0_1026:
.loc 16 16 0
exit;
$LDWend__Z3fooPiS_:
} // _Z3fooPiS_

使用此内核：
// Copies the N ints of the file-scope __constant__ array constBuf_d into
// results, one element per thread.
//   results - destination array with room for at least N ints
// constBuf_d is referenced directly by name rather than through a pointer
// parameter; the PTX listed after this kernel shows the load as ld.const.
// Expects a 1D launch; threads whose flat index is N or larger do nothing.
__global__ void foo2( int *results )
{
    // Flat global index of this thread within the 1D grid.
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    // Surplus threads (the launch may be larger than N) exit immediately.
    if( gid >= N )
        return;

    results[gid] = constBuf_d[gid];
}

__global__ void foo2( int *results )
{
    int tdx = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tdx;
    if( idx < N )
    {
         results[idx] = constBuf_d[idx];
    }
}

产生
    // PTX for foo2(int *results): only the destination pointer is a
    // parameter; the source is the constBuf_d symbol itself.
    .entry _Z4foo2Pi (
        .param .u32 __cudaparm__Z4foo2Pi_results)
    {
    .reg .u16 %rh<4>;
    .reg .u32 %r<12>;
    .reg .pred %p<3>;
    .loc    16  18  0
$LDWbegin__Z4foo2Pi:
    // %r3 = %ctaid.x * %ntid.x + %tid.x  (idx = blockIdx.x * blockDim.x + threadIdx.x)
    mov.u16     %rh1, %ctaid.x;
    mov.u16     %rh2, %ntid.x;
    mul.wide.u16    %r1, %rh1, %rh2;
    cvt.s32.u16     %r2, %tid.x;
    add.u32     %r3, %r2, %r1;
    // Skip the copy when idx > 9, i.e. when idx >= N with N = 10.
    mov.u32     %r4, 9;
    setp.gt.s32     %p1, %r3, %r4;
    @%p1 bra    $Lt_1_1026;
    .loc    16  25  0
    // %r5 = idx * 4, the byte offset of element idx.
    mul.lo.u32  %r5, %r3, 4;
    // The base address is the constBuf_d symbol, so the element is loaded
    // with ld.const (constant state space) instead of ld.global.
    mov.u32     %r6, constBuf_d;
    add.u32     %r7, %r5, %r6;
    ld.const.s32    %r8, [%r7+0];
    // Store the loaded value to results[idx].
    ld.param.u32    %r9, [__cudaparm__Z4foo2Pi_results];
    add.u32     %r10, %r9, %r5;
    st.global.s32   [%r10+0], %r8;
$Lt_1_1026:
    .loc    16  27  0
    exit;
$LDWend__Z4foo2Pi:
    } // _Z4foo2Pi

.entry _Z4foo2Pi (
    .param .u32 __cudaparm__Z4foo2Pi_results)
{
.reg .u16 %rh<4>;
.reg .u32 %r<12>;
.reg .pred %p<3>;
.loc 16 18 0
$LDWbegin__Z4foo2Pi:
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.s32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
mov.u32 %r4, 9;
setp.gt.s32 %p1, %r3, %r4;
@%p1 bra $Lt_1_1026;
.loc 16 25 0
mul.lo.u32 %r5, %r3, 4;
mov.u32 %r6, constBuf_d;
add.u32 %r7, %r5, %r6;
ld.const.s32 %r8, [%r7+0];
ld.param.u32 %r9, [__cudaparm__Z4foo2Pi_results];
add.u32 %r10, %r9, %r5;
st.global.s32 [%r10+0], %r8;
$Lt_1_1026:
.loc 16 27 0
exit;
$LDWend__Z4foo2Pi:
} // _Z4foo2Pi

请注意，在第二种情况下，constBuf_d是通过ld.const.s32而不是ld.global.s32访问的，因此会使用常量内存缓存。
很棒的回答，@talonmies。不过我想补充的是，CUDA 5之后情况有所变化：cudaMemcpyToSymbol()函数不再支持char*（字符串形式的符号名）参数。
CUDA 5发行说明中写道：
** The use of a character string to indicate a device symbol, which was possible with certain API functions, is no longer supported. Instead, the symbol should be used directly.

取而代之，必须按如下方式将数据复制到常量内存中：
cudaMemcpyToSymbol( dev_x, x, N * sizeof(float) );

在本例中，“dev_x”是用__constant__声明的常量内存符号（直接传入符号本身，而不是它的字符串名称），“x”是指向需要复制到dev_x中的主机内存的指针
** The use of a character string to indicate a device symbol, which was possible with certain API functions, is no longer supported. Instead, the symbol should be used directly.

cudaMemcpyToSymbol( dev_x, x, N * sizeof(float) );