Cuda-不倒车使用名称空间std； #包括 #包括 #包括常量int threadsPerBlock=256；常数int blocksPerGrid=1024；常数int N=64； __全局无效反向（整数*数据，整数计数）{ __共享_u u; int缓存[threadsPerBlock]； int tid=threadIdx.x+blockIdx.x*blockDim.x； int cacheIndex=threadIdx.x； int tr=count-cacheIndex-1；如果（tid_Cuda

Cuda-不倒车使用名称空间std； #包括 #包括 #包括常量int threadsPerBlock=256；常数int blocksPerGrid=1024；常数int N=64； __全局无效反向（整数*数据，整数计数）{ __共享_u u; int缓存[threadsPerBlock]； int tid=threadIdx.x+blockIdx.x*blockDim.x； int cacheIndex=threadIdx.x； int tr=count-cacheIndex-1；如果（tid

cuda

Cuda-不倒车使用名称空间std； #包括 #包括 #包括常量int threadsPerBlock=256；常数int blocksPerGrid=1024；常数int N=64； __全局无效反向（整数*数据，整数计数）{ __共享_u u; int缓存[threadsPerBlock]； int tid=threadIdx.x+blockIdx.x*blockDim.x； int cacheIndex=threadIdx.x； int tr=count-cacheIndex-1；如果（tid,cuda,Cuda,您启动的线程太多了。对于您拥有的算法，所需的线程数为N。但您正在启动1024*256个线程或者，也可能是一种很好的编码实践，就是使用线程检查将代码包装到内核中，如： using namespace std; #include <stdio.h> #include <stdlib.h> #include <iostream> const int threadsPerBlock = 256; const int blocksPerGrid = 1024; co

您启动的线程太多了。对于您拥有的算法，所需的线程数为N。但您正在启动1024*256个线程

或者，也可能是一种很好的编码实践，就是使用线程检查将代码包装到内核中，如：

using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <iostream>

const int threadsPerBlock = 256;
const int blocksPerGrid = 1024;
const int N = 64;

 __global__ void reverse(int *data, int count){

        __shared__ int cache[threadsPerBlock];
        int tid = threadIdx.x + blockIdx.x * blockDim.x;

        int cacheIndex = threadIdx.x;
        int tr = count-cacheIndex-1;
        if(tid< count/2)
        cache[cacheIndex] = data[cacheIndex];

        __syncthreads();
        data[cacheIndex] = cache[tr];
    }

int main(void){

    int a[N];
    int *devA;

    generate(a,N);

    cudaMalloc((void**)&devA, N * sizeof(int));


    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);

    reverse<<<blocksPerGrid,threadsPerBlock>>>(devA,N);

    cudaMemcpy(a,devA, N * sizeof(int), cudaMemcpyDeviceToHost);


    cout << a[63];

    cudaFree(devA);

}

intidx=threadIdx.x+blockDim.x*blockIdx.x；
如果（idxi编辑了我的内核函数。请看它。它不起作用。为什么不使用我在回答中链接到的alrikai提出的内核？对内核所做的编辑的问题是，你不理解if语句应该做什么。你没有使用我暗示的花括号。你所有的内核代码都是uld以我建议的if语句为条件。我想用共享内存来做。）好的，我修改了我的答案，它现在应该可以工作了。但是，正如我指出的，共享内存方法只适用于适合单个线程块的N。因此，如果N超过1024（或者256，如果不增加每个块的线程数），共享内存方法将中断，因为threadblocks之间没有同步，也无法保证threadblocks的执行顺序。共享内存对此解决方案没有任何好处。我编辑了我的原始答案。请查看上面我的原始答案，您将看到它已从我最初的答案更改特德。
    int idx = threadIdx.x + blockDim.x*blockIdx.x;

    if (idx<count){
      // put your kernel code here
    }