CUDA:cudaDeviceSynchronize返回错误代码30

CUDA:cudaDeviceSynchronize返回错误代码30,cuda,Cuda,下面的代码是关于SHA-1算法的。 内核中有一个大循环(HashRounds=0x40000,即262144次)。 现在的情况是: 1.在Debug模式下运行时,它会报告错误30;但是如果我减少循环次数,例如改为50000次,则不会报告错误。 2.在Release模式下运行时,一切正常;但是当我增大线程规模时,比如block=48、thread=192,它会报告同样的错误。 环境:GTX560Ti+Win8+Visual Studio 2012+Cuda5.5 恳求你的帮助 #include <stdio.h>

下面的代码是关于Sha-1算法的

内核中有一个大循环(HashRounds=0x40000,即262144次)。 现在的情况是:

1.在Debug(调试)模式下运行时,它会报告错误30。但是如果我减少循环次数,例如改为50000次,则不会报告错误。

2.在Release(发布)模式下运行时,一切正常。但是当我增大线程规模时,比如block=48、thread=192,它会报告同样的错误。

环境:GTX560Ti+Win8+Visual Studio 2012+Cuda5.5

恳求你的帮助

#include <stdio.h>
#include <string.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Short aliases for the integer types used by the SHA-1 code below.
typedef unsigned int    uint32;
typedef unsigned char   byte;
// Running SHA-1 state for one message.
//   state     - five chaining words; seeded by hash_initial() and updated by
//               SHA1Transform().
//   count     - message length in bits: count[0] is the low word, count[1]
//               receives the carry and the high bits (see hash_process()).
//   buffer    - holds the bytes of the current, not yet complete 64-byte block.
//   workspace - passed to SHA1Transform(), which does not use it in this file.
typedef struct {
    uint32 state[5];
    uint32 count[2];
    byte buffer[64];
    byte workspace[64]; // Temporary buffer.
} hash_context;

/* Rotate `value` left by `bits`. The expression shifts by (32 - bits), so it
   is only meaningful for 0 < bits < 32; this file uses 1, 5, 8, 24 and 30. */
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
/* Both macros read AND write through a variable named `block` (with a
   16-word member `l`) that must exist in the scope where they are expanded. */

/* blk0(i): reverses the byte order of word i of the block in place and
   yields the new value. */
#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
    |(rol(block->l[i],8)&0x00FF00FF))

/* blk(i): message-schedule step. The 16 words are used as a circular buffer
   (indices taken modulo 16); word i&15 is overwritten with the XOR of four
   schedule words rotated left by 1, and that value is yielded. */
#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
    ^block->l[(i+2)&15]^block->l[i&15],1))

/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
/* Each Rn(v,w,x,y,z,i) performs one round: it adds the round function of
   (w,x,y), the schedule word for index i, the round constant and rol(v,5)
   into z, then rotates w left by 30. R0 takes its word from blk0() (rounds
   0-15 in SHA1Transform), the others from blk(). */
#define R0(v,w,x,y,z,i) {z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);}
#define R1(v,w,x,y,z,i) {z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);}
#define R2(v,w,x,y,z,i) {z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);}
#define R3(v,w,x,y,z,i) {z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);}
#define R4(v,w,x,y,z,i) {z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);}

// Forward declarations; the definitions follow main() in this file.
// Host side: selects the device, launches cryptKernel and waits for it.
cudaError_t addWithCuda();
// Kernel: runs the repeated-hash loop in every launched thread.
__global__ void cryptKernel();
// Device helpers implementing the SHA-1 context operations.
__device__ void hash_initial(hash_context* context);
__device__ void hash_process( hash_context * context, byte * data, size_t len);
__device__ void SHA1Transform(uint32 state[5], byte workspace[64], byte buffer[64]);

// Program entry point: runs the GPU job, resets the device, then waits for a
// key press so the console window stays open.
// Returns 0 on success and 1 if either CUDA step reports an error.
int main()
{
    if (addWithCuda() != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // cudaDeviceReset must run before exit so that profiling and tracing
    // tools such as Nsight and Visual Profiler can show complete traces.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    printf("over");
    getchar();
    return 0;
}
// Host-side driver: selects device 0, launches cryptKernel with a single
// block of a single thread, and waits for the kernel to finish.
//
// Returns cudaSuccess when every step succeeded, otherwise the first CUDA
// error encountered. A diagnostic naming the failing step and the CUDA error
// string is written to stderr before returning.
cudaError_t addWithCuda()
{
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed: %s.  Do you have a CUDA-capable GPU installed?\n",
                cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    cryptKernel<<<1,1>>>();

    // A kernel launch returns nothing itself; launch-time errors are
    // retrieved with cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cryptKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // error that occurred while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d (%s) after launching cryptKernel!\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    return cudaSuccess;
}

// Kernel: every launched thread initialises its own SHA-1 context and feeds a
// 24-byte zero-filled buffer into it HashRounds (0x40000) times.
//
// The context is a thread-local variable and nothing is written back to
// memory visible outside the thread, so the kernel currently has no
// observable result.
//
// NOTE(review): the whole loop runs inside a single launch. On a GPU that
// also drives the display, a launch that runs too long can be aborted by the
// operating system's display watchdog, and the failure is then reported by
// the next synchronizing call. That is presumably the source of the error
// seen with unoptimised builds or larger launch sizes - confirm on the target
// machine.
__global__ void cryptKernel()
{
    byte RawPsw[24] = {'\0'};
    hash_context c;
    hash_initial(&c);

    const int HashRounds = 0x40000;
    for (int round = 0; round < HashRounds; round++)
    {
        hash_process(&c, RawPsw, 24);
    }
}

/* Runs the 80 SHA-1 rounds over one 64-byte block and adds the result into
 * state[0..4].
 *
 *   state     - the five chaining words; read at entry, updated at exit.
 *   workspace - not referenced anywhere in this function.
 *   buffer    - the 64-byte block. It is reinterpreted as sixteen 32-bit words
 *               and MODIFIED IN PLACE: blk0() rewrites each word during rounds
 *               0-15 and blk() keeps overwriting the words as a circular
 *               schedule buffer in rounds 16-79.
 *
 * NOTE(review): buffer is accessed through a uint32 pointer, so it presumably
 * has to be suitably aligned for 32-bit access - confirm for callers that
 * pass a pointer into caller-owned data.
 */
__device__ void SHA1Transform(uint32 state[5], byte workspace[64], byte buffer[64])
{
    uint32 a, b, c, d, e;
    typedef union {
        byte c[64];
        uint32 l[16];
    } CHAR64LONG16;
    /* `block` is the name the blk0()/blk() macros expect to find in scope. */
    CHAR64LONG16* block;
    block = (CHAR64LONG16*)buffer;
    /* Copy context->state[] to working vars */
    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    /* 4 rounds of 20 operations each. Loop unrolled. */
    /* The five working variables are passed in rotated order from one
       operation to the next instead of being shuffled after every step. */
    R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
    R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
    R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
    R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
    R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
    R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
    R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
    R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
    R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
    R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
    R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
    R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
    R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
    R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
    R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
    R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
    R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
    R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
    R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
    R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
    /* Add the working vars back into context.state[] */
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
}
/* Puts a hash context into its starting condition: the five state words are
 * set to the SHA-1 initialization constants and the bit counter is cleared.
 * buffer and workspace are left untouched. */
__device__ void hash_initial(hash_context* context)
{
    /* SHA1 initialization constants */
    const uint32 initialState[5] = {
        0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
    };

    for (int word = 0; word < 5; ++word) {
        context->state[word] = initialState[word];
    }

    context->count[1] = 0;
    context->count[0] = 0;
}
/* Run your data through this. */
/* Feeds `len` bytes from `data` into the hash context.
 *
 * The bit counter in context->count is advanced first. If the new bytes
 * complete the pending 64-byte block, that block is topped up in
 * context->buffer and transformed, and every further whole 64-byte block is
 * transformed directly from `data`. Any bytes left over are stored in
 * context->buffer for the next call.
 *
 * SHA1Transform() rewrites the block it is given, so `data` itself is altered
 * whenever a whole block is transformed straight from it. */
__device__ void hash_process( hash_context * context, byte * data, size_t len)
{
    const uint32 bitLen = ((uint32)len) << 3;

    /* Bytes already waiting in context->buffer from earlier calls. */
    unsigned int pending = (context->count[0] >> 3) & 63;
    /* Bytes of `data` that have been used up so far. */
    unsigned int consumed = 0;

    /* Advance the bit counter, carrying from the low word into the high one. */
    context->count[0] += bitLen;
    if (context->count[0] < bitLen) {
        context->count[1]++;
    }
    context->count[1] += (uint32)(len >> 29);

    if ((pending + len) > 63) {
        /* Complete the pending block and process it. */
        consumed = 64 - pending;
        memcpy(&context->buffer[pending], data, consumed);
        SHA1Transform(context->state, context->workspace, context->buffer);

        /* Process the remaining whole blocks straight from the input. */
        while (consumed + 63 < len) {
            SHA1Transform(context->state, context->workspace, &data[consumed]);
            consumed += 64;
        }
        pending = 0;
    }

    /* Keep the tail for the next call. */
    if (len > consumed) {
        memcpy(&context->buffer[pending], &data[consumed], len - consumed);
    }
}
#包括
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
typedef无符号整数uint32;
typedef无符号字符字节;
类型定义结构{
uint32状态[5];
uint32计数[2];
字节缓冲区[64];
字节工作区[64];//临时缓冲区。
}散列上下文;
#定义rol(值,位)((值)>(32-(位)))
/*blk0()和blk()执行初始展开*/
/*我从SSLeay那里得到了在圆函数中扩展的想法*/
#定义blk0(i)(块->l[i]=(rol(块->l[i],24)&0xFF00FF00)\
|(rol(块->l[i],8)和0x00FF00FF))
#定义blk(i)(块->l[i&15]=rol(块->l[(i+13)&15]^块->l[(i+8)&15]\
^区块->l[(i+2)和15]^区块->l[i+15],1))
/*(R0+R1)、R2、R3、R4是SHA1中使用的不同操作*/
#定义R0(v,w,x,y,z,i){z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);}
#定义R1(v,w,x,y,z,i){z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);}
#定义R2(v,w,x,y,z,i){z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);}
#定义R3(v,w,x,y,z,i){z+=((w,x)和y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);}
#定义R4(v,w,x,y,z,i){z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);}
cudaError_t addWithCuda();
__全局_uu; void cryptKernel();
__设备\无效哈希\初始值(哈希\上下文*上下文);
__设备无效哈希进程(哈希上下文*上下文,字节*数据,大小长度);
__设备无效SHA1转换(uint32状态[5],字节工作区[64],字节缓冲区[64]);
int main()
{
cudaError_t cudaStatus=addWithCuda();
if(cudaStatus!=cudaSuccess){
fprintf(stderr,“addWithCuda失败!”);
返回1;
}
//必须在退出之前调用cudaDeviceReset,以便进行分析和
//跟踪工具,如Nsight和visualprofiler,用于显示完整的跟踪。
cudaStatus=cudadeviceset();
if(cudaStatus!=cudaSuccess){
fprintf(stderr,“cudaDeviceReset失败!”);
返回1;
}
printf(“超过”);
getchar();
返回0;
}
//辅助函数,用于使用CUDA并行添加向量。
cudaError\u t addWithCuda()
{
cudaError\u t cudaStatus;
//选择要在哪个GPU上运行,在多GPU系统上更改此选项。
cudaStatus=cudaSetDevice(0);
if(cudaStatus!=cudaSuccess){
fprintf(stderr,“cudaSetDevice失败!是否安装了支持CUDA的GPU?”);
转到错误;
}
cryptokernel();
//检查启动内核时是否有任何错误
cudaStatus=cudaGetLastError();
if(cudaStatus!=cudaSuccess){
fprintf(stderr,“addKernel启动失败:%s\n”,cudaGetErrorString(cudaStatus));
转到错误;
}
//cudaDeviceSynchronize等待内核完成,然后返回
//在启动过程中遇到的任何错误。
cudaStatus=cudaDeviceSynchronize();
if(cudaStatus!=cudaSuccess){
fprintf(stderr,“cudaDeviceSynchronize在启动addKernel!\n后返回错误代码%d”,cudaStatus);
转到错误;
}
错误:
返回CUDA状态;
}
__全局_uu; void cryptKernel()
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
字节RawPsw[24]={'\0'};
hash_上下文c;
hash_首字母(&c);
常量int HashRounds=0x40000;
对于(int I=0;Istate[]到工作变量*/
a=状态[0];
b=状态[1];
c=状态[2];
d=状态[3];
e=状态[4];
/*4轮,每次20次。循环展开*/
R0(a,b,c,d,e,0);R0(e,a,b,c,d,1);R0(d,e,a,b,c,2);R0(c,d,e,a,b,3);
R0(b,c,d,e,a,4);R0(a,b,c,d,e,5);R0(e,a,b,c,d,6);R0(d,e,a,b,c,7);
R0(c,d,e,a,b,8);R0(b,c,d,e,a,9);R0(a,b,c,d,e,10);R0(e,a,b,c,d,11);
R0(d,e,a,b,c,12);R0(c,d,e,a,b,13);R0(b,c,d,e,a,14);R0(a,b,c,d,e,15);
R1(e,a,b,c,d,16);R1(d,e,a,b,c,17);R1(c,d,e,a,b,18);R1(b,c,d,e,a,19);
R2(a,b,c,d,e,20);R2(e,a,b,c,d,21);R2(d,e,a,b,c,22);R2(c,d,e,a,b,23);
R2(b,c,d,e,a,24);R2(a,b,c,d,e,25);R2(e,a,b,c,d,26);R2(d,e,a,b,c,27);
R2(c,d,e,a,b,28);R2(b,c,d,e,a,29);R2(a,b,c,d,e,30);R2(e,a,b,c,d,31);
R2(d,e,a,b,c,32);R2(c,d,e,a,b,33);R2(b,c,d,e,a,34);R2(a,b,c,d,e,35);
R2(e,a,b,c,d,36);R2(d,e,a,b,c,37);R2(c,d,e,a,b,38);R2(b,c,d,e,a,39);
R3(a,b,c,d,e,40);R3(e,a,b,c,d,41);R3(d,e,a,b,c,42);R3(c,d,e,a,b,43);
R3(b,c,d,e,a,44);R3(a,b,c,d,e,45);R3(e,a,b,c,d,46);R3(d,e,a,b,c,47);
R3(c,d,e,a,b,48);R3(b,c,d,e,a,49);R3(a,b,c,d,e,50);R3(e,a,b,c,d,51);
R3(d,e,a,b,c,52);R3(c,d,e,a,b,53);R3(b,c,d,e,a,54);R3(a,b,c,d,e,55);
R3(e,a,b,c,d,56);R3(d,e,a,b,c,57);R3(c,d,e,a,b,58);R3(b,c,d,e,a,59);
R4(a,b,c,d,e,60);R4(e,a,b,c,d,61);R4(d,e,a,b,c,62);R4(c,d,e,a,b,63);
R4(b,c,d,e,a,64);R4(a,b,c,d,e,65);R4(e,a,b,c,d,66);R4(d,e,a,b,c,67);
R4(c,d,e,a,b,68);R4(b,c,d,e,a,69);R4(a,b,c,d,e,70);R4(e,a,b,c,d,71);
R4(d,e,a,b,c,72);R4(c,d,e,a,b,73);R4(b,c,d,e,a,74);R4(a,b,c,d,e,75);
R4(e,a,b,c,d,76);R4(d,e,a,b,c,77);R4(c,d,e,a,b,78);R4(b,c,d,e,a,79);
/*将工作变量添加回上下文中。状态[]*/
状态[0]+=a;
状态[1]+=b;
状态[2]+=c;
状态[3]+=d;
状态[4]+=e;
}
__设备\无效哈希\首字母(哈希\上下文*上下文)
{
/*SHA1初始化常量*/
上下文->状态[0]=0x67452301;
上下文->状态[1]=0xEFCDAB89;
上下文->状态[2]=0x98BADCFE;
上下文->状态[3]=0x10325476;
上下文->状态[4]=0xC3D2E1F0;
上下文->计数[0]=上下文->计数[1]=0;
}
/*通过此文件运行您的数据*/
__设备\无效哈希\进程(