可以将一个短数组传递到CUDA内核中吗_Cuda

可以将 short 类型的数组传递到 CUDA 内核中吗

cuda

可以将一个短数组传递到CUDA内核中吗,cuda,Cuda,我已经编写了一个CUDA内核，当我将一个短数组复制到设备内存，然后将其传递到内核时，它就不工作了。下面的简化代码表达了我的问题 KernelCaller() { const int size = 1; short hostArray[size]{41}; short* devPointer; cudaMalloc((void**)&devicePointer, size * sizeof(short)); cudaMemcpy(devPointer

我编写了一个 CUDA 内核，但当我把一个 short 类型的数组复制到设备内存、再将其传递给内核时，它无法正常工作。下面的简化代码说明了我的问题：

// Forward declaration: the kernel is defined after this function, so it has to
// be declared before the launch below.
__global__ void cudaKernel(short* arr);

// Copies a one-element short array to the device and launches cudaKernel on it
// with one block of one thread, then releases the device buffer.
void KernelCaller()
{
    const int size = 1;
    short hostArray[size]{41};
    short* devPointer = nullptr;

    // Allocate into the same pointer that is copied to and handed to the kernel
    // (the original took the address of an undeclared `devicePointer`).
    if (cudaMalloc((void**)&devPointer, size * sizeof(short)) != cudaSuccess)
        return;

    // Only launch when the input actually reached the device.
    if (cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice) == cudaSuccess)
    {
        cudaKernel<<<1,1>>>(devPointer);
        // Wait for the kernel to finish before the buffer is released.
        cudaDeviceSynchronize();
    }

    cudaFree(devPointer);
}

// Loads the array element selected by the calling thread's flat 1D index into
// the local `val`. Nothing is written back; there is no bounds check, so the
// launch must not create more threads than `arr` has elements.
__global__
void cudaKernel(short* arr)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    short val = arr[tid];
}

KernelCaller()
{
    const int size = 1;
    short hostArray[size]{41};
    short* devPointer;
    cudaMalloc((void**)&devPointer, size * sizeof(short));
    cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice);
    cudaKernel<<<1,1>>>(devPointer);
}

__global__
void cudaKernel(short* arr)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    short val = arr[idx];
}

此时，

val

的值是1063714857，我希望它是41。

我猜测问题在于：41 的十六进制是 0x29，而我得到的值是 0x3F670029，其低位字节正是 0x29，所以看起来它读取了过多的字节。当我改用 float 数组时，一切工作正常，但我希望节省内存。难道 CUDA 不允许使用 short 类型的数组吗？

我已经实现了您的代码，并获得了预期的输出

代码如下：

 #include<stdio.h>
// Reads the element selected by the calling thread's flat 1D index, prints it
// from device code when __CUDA_ARCH__ is at least 200, and stores the same
// value back. There is no bounds check, so the launch must not create more
// threads than `arr` has elements.
__global__ void cudaKernel(short* arr)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    const short element = arr[tid];
#if __CUDA_ARCH__ >= 200
    printf("Inside kernel %d\n", element);
#endif
    arr[tid] = element;
}

// Prints a diagnostic on stderr when `status` is not cudaSuccess.
// Returns true on success so call sites can write `if (!cudaOk(...))`.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Round-trips a one-element short array through the GPU: prints the host
// value, copies it to the device, runs cudaKernel with one block of one
// thread, copies the result back and prints it again.
// Returns 0 when every CUDA call succeeded and 1 otherwise.
int main()
{
    const int size = 1;
    short hostArray[size]{41};
    printf("Before kernel call %d\n",hostArray[0]);

    short *devPointer = nullptr;
    if (!cudaOk(cudaMalloc((void**)&devPointer, size * sizeof(short)), "cudaMalloc"))
        return 1;

    bool ok = cudaOk(cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");
    if (ok)
    {
        cudaKernel<<<1,1>>>(devPointer);
        // A launch has no return value; launch errors are read back here.
        ok = cudaOk(cudaGetLastError(), "cudaKernel launch");
    }
    if (ok)
    {
        // Blocking copy: it also reports errors raised while the kernel ran.
        ok = cudaOk(cudaMemcpy(hostArray, devPointer, size * sizeof(short), cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }
    if (ok)
        printf("After kernel call %d\n",hostArray[0]);

    // Release the buffer on every path once the allocation has succeeded.
    ok = cudaOk(cudaFree(devPointer), "cudaFree") && ok;
    return ok ? 0 : 1;
}

因此，是的，我们可以将 short 类型的数组传递给 CUDA 内核。

我已经实现了您的代码，并获得了预期的输出

代码如下：

 #include<stdio.h>
// Reads the element selected by the calling thread's flat 1D index, prints it
// from device code when __CUDA_ARCH__ is at least 200, and stores the same
// value back. There is no bounds check, so the launch must not create more
// threads than `arr` has elements.
__global__ void cudaKernel(short* arr)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    const short element = arr[tid];
#if __CUDA_ARCH__ >= 200
    printf("Inside kernel %d\n", element);
#endif
    arr[tid] = element;
}

// Prints a diagnostic on stderr when `status` is not cudaSuccess.
// Returns true on success so call sites can write `if (!cudaOk(...))`.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Round-trips a one-element short array through the GPU: prints the host
// value, copies it to the device, runs cudaKernel with one block of one
// thread, copies the result back and prints it again.
// Returns 0 when every CUDA call succeeded and 1 otherwise.
int main()
{
    const int size = 1;
    short hostArray[size]{41};
    printf("Before kernel call %d\n",hostArray[0]);

    short *devPointer = nullptr;
    if (!cudaOk(cudaMalloc((void**)&devPointer, size * sizeof(short)), "cudaMalloc"))
        return 1;

    bool ok = cudaOk(cudaMemcpy(devPointer, hostArray, size * sizeof(short), cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");
    if (ok)
    {
        cudaKernel<<<1,1>>>(devPointer);
        // A launch has no return value; launch errors are read back here.
        ok = cudaOk(cudaGetLastError(), "cudaKernel launch");
    }
    if (ok)
    {
        // Blocking copy: it also reports errors raised while the kernel ran.
        ok = cudaOk(cudaMemcpy(hostArray, devPointer, size * sizeof(short), cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }
    if (ok)
        printf("After kernel call %d\n",hostArray[0]);

    // Release the buffer on every path once the allocation has succeeded.
    ok = cudaOk(cudaFree(devPointer), "cudaFree") && ok;
    return ok ? 0 : 1;
}

因此，是的，我们可以将 short 类型的数组传递给 CUDA 内核。

——当然可以。如果 pastebin 链接中的代码运行失败，那么您的 CUDA 安装可能已损坏。