Concurrency: CUDA concurrent kernel execution not succeeding

Below is code I adapted from the book "CUDA by Example" to test concurrent kernel execution in CUDA:

#include <stdio.h>
#include <stdlib.h>

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }   
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define N   (1024*1024*10)
#define FULL_DATA_SIZE   (N*2)


__global__ void kernel( int *a, int *b, int *c ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}


int main( void ) {
    cudaDeviceProp  prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
        return 0;
    }

    cudaEvent_t     start, stop;
    float           elapsedTime;

    cudaStream_t    stream0, stream1;
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;
    int *dev_a1, *dev_b1, *dev_c1;

    // start the timers
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // initialize the streams
    HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream1 ) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c0,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a1,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b1,
                              N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c1,
                              N * sizeof(int) ) );

    // allocate host locked memory, used to stream
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_a,
                              FULL_DATA_SIZE * sizeof(int),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_b,
                              FULL_DATA_SIZE * sizeof(int),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_c,
                              FULL_DATA_SIZE * sizeof(int),
                              cudaHostAllocDefault ) );

    for (int i=0; i<FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    // loop over the full data in chunks; with the book's async copies
    // stripped out, this runs exactly once and only enqueues kernel launches
    for (int i=0; i<FULL_DATA_SIZE; i+= N*2) {
        // enqueue kernels in stream0 and stream1
        kernel<<<N/256,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 );
        kernel<<<N/256,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 );
    }
    HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );

    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );

    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time taken:  %3.1f ms\n", elapsedTime );

    // cleanup the streams and memory
    HANDLE_ERROR( cudaFreeHost( host_a ) );
    HANDLE_ERROR( cudaFreeHost( host_b ) );
    HANDLE_ERROR( cudaFreeHost( host_c ) );
    HANDLE_ERROR( cudaFree( dev_a0 ) );
    HANDLE_ERROR( cudaFree( dev_b0 ) );
    HANDLE_ERROR( cudaFree( dev_c0 ) );
    HANDLE_ERROR( cudaFree( dev_a1 ) );
    HANDLE_ERROR( cudaFree( dev_b1 ) );
    HANDLE_ERROR( cudaFree( dev_c1 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream1 ) );

    return 0;
}
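
One way to check whether the two kernels actually overlap, beyond comparing elapsed times, is to look at a profiler timeline. For example, with the nvprof command-line profiler (assuming a CUDA 5.0 or newer toolkit and an executable named concurrent; both are assumptions, not details from the question):

nvprof --print-gpu-trace ./concurrent

The per-kernel start and end timestamps in the trace show directly whether the launches in stream0 and stream1 run concurrently.
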
if(idx对于(int i=0;i是否指定代码应编译的计算体系结构?默认值为1.0,如果我没有弄错的话,它不支持并发内核。请尝试将以下内容添加到nvcc调用中:

--generate-code arch=compute_20,code=sm_21

I don't know which compute architecture your card supports, but you should be able to find it somewhere online. Perhaps try the above first, and if that fails, try sm_20 instead of sm_21.
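
For reference, a complete invocation might look like this (the source file name is an assumption):

nvcc --generate-code arch=compute_20,code=sm_21 -o concurrent concurrent.cu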

Hmm, that actually made things worse. The kernels still do not overlap, and the GPU time has doubled.

Code generation has no effect on whether the device supports concurrent kernel execution. This question is a duplicate: your kernels launch N/256 = 40960 blocks each. The work distributor on a CC 2.0 device distributes all of the blocks from the first kernel before distributing any block from the second. Think of each of the 16 SMs in the M2090 as having a queue of thread blocks; the first kernel launch loads 2560 thread blocks into the queue of every SM, and all of them must drain before the second kernel is processed. Hence there is almost no overlap.
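
If the goal is simply to observe two kernels running concurrently, a minimal sketch (mine, not from the thread above) is to launch each kernel with far fewer blocks than the device can hold and cover the full index range with a grid-stride loop. With, say, 16 resident blocks per launch (an illustrative number, not a tuned one), the work distributor has room to co-schedule blocks from both streams. It reuses N, stream0/stream1, and the dev_* buffers from the question's code; kernel_strided is a hypothetical name:

__global__ void kernel_strided( int *a, int *b, int *c ) {
    // a small fixed grid walks the whole range in strides of blockDim.x*gridDim.x
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x;
         idx < N;
         idx += blockDim.x * gridDim.x) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

// 16 blocks per launch instead of 40960, so blocks from both streams can be
// resident at the same time and the overlap becomes visible in a timeline
kernel_strided<<<16,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 );
kernel_strided<<<16,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 );

Worth noting, too, that the deviceOverlap check near the top of the program reports copy/compute overlap; concurrent kernel support is reported by the separate cudaDeviceProp::concurrentKernels field.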