Function OpenCL编译器未能编译函数

Function OpenCL编译器未能编译函数,function,compiler-construction,opencl,Function,Compiler Construction,Opencl,我尝试在OpenCL内核上移植函数,但编译器给了我下一个错误: cvmsErrorCompilerFailure: LLVM compiler has failed to compile a function. 以下是我的内核代码: //#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable //#############################################################

我尝试在OpenCL内核上移植函数,但编译器给了我下一个错误:

  cvmsErrorCompilerFailure: LLVM compiler has failed to compile a function.
以下是我的内核代码:

//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

//##############################################################################
// Utils function
//##############################################################################
void mmemcpy(unsigned char *dst, const unsigned char *src, size_t len) {
    for (size_t i = 0; i < len; i++)
        dst[i] = src[i];
}


//##############################################################################
// Tree hashing constants definition
//##############################################################################
/* tweak word T[1]: bit field starting positions */
#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */

#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */

/* tweak word T[1]: flag bit definition(s) */
#define SKEIN_T1_FLAG_FIRST     (((unsigned long)  1 ) << SKEIN_T1_POS_FIRST)
#define SKEIN_T1_FLAG_FINAL     (((unsigned long)  1 ) << SKEIN_T1_POS_FINAL)
#define SKEIN_T1_FLAG_BIT_PAD   (((unsigned long)  1 ) << SKEIN_T1_POS_BIT_PAD)

//##############################################################################
// Skein macros
//##############################################################################
#ifndef RotL_64
#define RotL_64(x, N)    (((x) << (N)) | ((x) >> (64-(N))))
#endif

//##############################################################################
// Skein block
//##############################################################################
#ifndef SKEIN_USE_ASM
#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
#endif

#ifndef SKEIN_LOOP
#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
#endif

#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
#define KW_TWK_BASE     (0)
#define KW_KEY_BASE     (3)
#define ks              (kw + KW_KEY_BASE)                
#define ts              (kw + KW_TWK_BASE)

//##############################################################################
// Port settings
//##############################################################################
/* Platform settings */
#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */

/* Define platfor here */
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN

#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
/* here for big-endian */
#define SKEIN_NEED_SWAP (1)
#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
/* here for little-endian */
#define SKEIN_NEED_SWAP (0)
#if PLATFORM_MUST_ALIGN == 0              /* ok to use "fast" versions? */
#define Skein_Put64_LSB_First(dst08, src64, bCnt) mmemcpy(dst08, src64, bCnt)
#define Skein_Get64_LSB_First(dst64, src08, wCnt) mmemcpy(dst64, src08, 8*(wCnt))
#endif
#endif

//##############################################################################
// Skein.h
//##############################################################################
enum {   
    /* Skein_256 round rotation constants */
    R_256_0_0=14, R_256_0_1=16,
    R_256_1_0=52, R_256_1_1=57,
    R_256_2_0=23, R_256_2_1=40,
    R_256_3_0= 5, R_256_3_1=37,
    R_256_4_0=25, R_256_4_1=33,
    R_256_5_0=46, R_256_5_1=12,
    R_256_6_0=58, R_256_6_1=22,
    R_256_7_0=32, R_256_7_1=32,

    /* Skein_512 round rotation constants */
    R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
    R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
    R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
    R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
    R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
    R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
    R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
    R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,

    /* Skein1024 round rotation constants */
    R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
    R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
    R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
    R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
    R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
    R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
    R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
    R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
};

#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
#endif

#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((unsigned long) (hi32)) << 32))
#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)

//##############################################################################
// Skein structures
//##############################################################################
enum {
    SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
    SKEIN_FAIL            =      1,
    SKEIN_BAD_HASHLEN     =      2
};

#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */

#define  SKEIN_256_STATE_WORDS ( 4)
#define  SKEIN_512_STATE_WORDS ( 8)
#define  SKEIN1024_STATE_WORDS (16)
#define  SKEIN_MAX_STATE_WORDS (16)

#define  SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)

#define  SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)

#define  SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)

#define SKEIN_256_ROUNDS_TOTAL (72)          /* number of rounds for the different block sizes */
#define SKEIN_512_ROUNDS_TOTAL (72)
#define SKEIN1024_ROUNDS_TOTAL (80)

typedef struct {
    unsigned long  hashBitLen;                             /* size of hash result, in bits */
    unsigned long  bCnt;                                   /* current byte count in buffer b[] */
    unsigned long  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags */
} Skein_Ctxt_Hdr_t;

typedef struct {                                    /*  256-bit Skein hash context structure */
    Skein_Ctxt_Hdr_t h;                             /* common header context variables */
    unsigned long  X[SKEIN_256_STATE_WORDS];        /* chaining variables */
    unsigned char  b[SKEIN_256_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
} Skein_256_Ctxt_t;

//##############################################################################
// 256-bit Skein 
//##############################################################################
/*****************************  Skein_256 ******************************/
void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const unsigned *blkPtr, unsigned long blkCnt, unsigned long byteCntAdd) { /* do it in C */

    enum {
        WCNT = SKEIN_256_STATE_WORDS
    };
#undef  RCNT
#define RCNT (SKEIN_256_ROUNDS_TOTAL/8)

#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
#else
#define SKEIN_UNROLL_256 (0)
#endif

#if SKEIN_UNROLL_256
#if (RCNT % SKEIN_UNROLL_256)
#error "Invalid SKEIN_UNROLL_256"               /* sanity check on unroll count */
#endif
    unsigned long  r;
    unsigned long  kw[WCNT+4+RCNT*2];           /* key schedule words : chaining vars + tweak + "rotation"*/
#else
    unsigned long  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
#endif
    unsigned long  X0, X1, X2, X3;                        /* local copy of context vars, for speed */
    unsigned long  w[WCNT];                           /* local copy of input block */

   /* never call with blkCnt == 0! */
    if (!(blkCnt != 0))
        return;

    ts[0] = ctx->h.T[0];
    ts[1] = ctx->h.T[1];
    do  {
        /* this implementation only supports 2**64 input bytes (no carry out here) */
        ts[0] += byteCntAdd;                    /* update processed length */

        /* precompute the key schedule for this block */
        ks[0] = ctx->X[0];     
        ks[1] = ctx->X[1];
        ks[2] = ctx->X[2];
        ks[3] = ctx->X[3];
        ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;

        ts[2] = ts[0] ^ ts[1];

        Skein_Get64_LSB_First(w, blkPtr, WCNT);   /* get input block in little-endian format */

        X0 = w[0] + ks[0];                      /* do the first full key injection */
        X1 = w[1] + ks[1] + ts[0];
        X2 = w[2] + ks[2] + ts[1];
        X3 = w[3] + ks[3];

        blkPtr += SKEIN_256_BLOCK_BYTES;

        /* run the rounds */

// IN THIS MACRO WHEN I EXCHANGED X##p1 and X##p in RotL_64 for numbers 1, 2, THEN COMPILATION IS OK
#define Round256(p0, p1, p2, p3, ROT, rNum)                     \
X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \

#if SKEIN_UNROLL_256 == 0                       
#define R256(p0, p1, p2, p3, ROT, rNum)  /* fully unrolled */   \
Round256(p0, p1, p2, p3, ROT, rNum)

#define I256(R)                                                 \
X0   += ks[((R)+1) % 5];    /* inject the key schedule value */ \
X1   += ks[((R)+2) % 5] + ts[((R)+1) % 3];                      \
X2   += ks[((R)+3) % 5] + ts[((R)+2) % 3];                      \
X3   += ks[((R)+4) % 5] +     (R)+1;
#else                                       /* looping version */
#define R256(p0, p1, p2, p3, ROT, rNum)                         \
Round256(p0, p1, p2, p3, ROT, rNum)                             \

#define I256(R)                                                 \
X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
X1   += ks[r+(R)+1] + ts[r+(R)+0];                              \
X2   += ks[r+(R)+2] + ts[r+(R)+1];                              \
X3   += ks[r+(R)+3] +    r+(R)   ;                              \
ks[r + (R)+4    ]   = ks[r+(R)-1];     /* rotate key schedule */\
ts[r + (R)+2    ]   = ts[r+(R)-1];                              

        for (r = 1; r < 2*RCNT; r += 2*SKEIN_UNROLL_256)  /* loop thru it */
#endif  
        {    
#define R256_8_rounds(R)          \
R256(0,1,2,3,R_256_0,8*(R) + 1);  \
R256(0,3,2,1,R_256_1,8*(R) + 2);  \
R256(0,1,2,3,R_256_2,8*(R) + 3);  \
R256(0,3,2,1,R_256_3,8*(R) + 4);  \
I256(2*(R));                      \
R256(0,1,2,3,R_256_4,8*(R) + 5);  \
R256(0,3,2,1,R_256_5,8*(R) + 6);  \
R256(0,1,2,3,R_256_6,8*(R) + 7);  \
R256(0,3,2,1,R_256_7,8*(R) + 8);  \
I256(2*(R)+1);

            R256_8_rounds( 0);

#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))

#if   R256_Unroll_R( 1)
            R256_8_rounds( 1);
#endif
#if   R256_Unroll_R( 2)
            R256_8_rounds( 2);
#endif
#if   R256_Unroll_R( 3)
            R256_8_rounds( 3);
#endif
#if   R256_Unroll_R( 4)
            R256_8_rounds( 4);
#endif
#if   R256_Unroll_R( 5)
            R256_8_rounds( 5);
#endif
#if   R256_Unroll_R( 6)
            R256_8_rounds( 6);
#endif
#if   R256_Unroll_R( 7)
            R256_8_rounds( 7);
#endif
#if   R256_Unroll_R( 8)
            R256_8_rounds( 8);
#endif
#if   R256_Unroll_R( 9)
            R256_8_rounds( 9);
#endif
#if   R256_Unroll_R(10)
            R256_8_rounds(10);
#endif
#if   R256_Unroll_R(11)
            R256_8_rounds(11);
#endif
#if   R256_Unroll_R(12)
            R256_8_rounds(12);
#endif
#if   R256_Unroll_R(13)
            R256_8_rounds(13);
#endif
#if   R256_Unroll_R(14)
            R256_8_rounds(14);
#endif
#if  (SKEIN_UNROLL_256 > 14)
#error  "need more unrolling in Skein_256_Process_Block"
#endif
        }

        // WHEN I COMMENT NEXT 4 LINES, THEN COMPILATION IS OK
        ctx->X[0] = X0 ^ w[0]; 
        ctx->X[1] = X1 ^ w[1];
        ctx->X[2] = X2 ^ w[2];
        ctx->X[3] = X3 ^ w[3];

        ts[1] &= ~SKEIN_T1_FLAG_FIRST;
    }
    while (--blkCnt);
    ctx->h.T[0] = ts[0];
    ctx->h.T[1] = ts[1];
}

__kernel void update( __global Skein_256_Ctxt_t *gctx) {
    int glId = get_global_id(0);
    int lId = get_local_id(0);
    int grId = get_group_id(0);

    // TODO now is one hashState per block, try one hs per thread to accelerate computing
    gctx += grId;

    Skein_256_Ctxt_t ctx;
    ctx.h.hashBitLen = 256;
    Skein_256_Process_Block(&ctx, &ctx.b, 1, SKEIN_256_BLOCK_BYTES);
    *gctx = ctx;
}
/\pragma OPENCL扩展cl\u khr\u字节可寻址存储:启用
//##############################################################################
//Utils函数
//##############################################################################
无效mmemcpy(无符号字符*dst,常量无符号字符*src,大小长度){
对于(大小i=0;iX[1];
ks[2]=ctx->X[2];
ks[3]=ctx->X[3];
ks[4]=ks[0]^ks[1]^ks[2]^ks[3]^ks奇偶校验;
ts[2]=ts[0]^ts[1];
Skein_Get64_LSB_First(w,blkPtr,WCNT);/*以小端格式获取输入块*/
X0=w[0]+ks[0];/*进行第一次全键注入*/
X1=w[1]+ks[1]+ts[0];
X2=w[2]+ks[2]+ts[1];
X3=w[3]+ks[3];
blkPtr+=SKEIN_256_块_字节;
/*轮换*/
//在这个宏中,当我将RotL#u 64中的X#p1和X#p交换为数字1和2时,编译就可以了
#定义Round256(p0、p1、p2、p3、ROT、rNum)\
X##p0+=X##p1;X##p1=RotL#u64(X##p1,ROT##u0);X##p1^=X##p0\
X##p2+=X#p3;X#p3=RotL#U 64(X#p3,ROT#U 1);X#p3^=X#p2\
#如果SKEIN_UNROLL_256==0
#定义R256(p0、p1、p2、p3、ROT、rNum)/*完全展开*/\
第256轮(p0、p1、p2、p3、ROT、rNum)
#定义I256(R)\
X0+=ks[((R)+1)%5];/*注入密钥计划值*/\
X1+=ks[(R)+2)%5]+ts[(R)+1)%3]\
X2+=ks[(R)+3)%5]+ts[(R)+2)%3]\
X3+=ks[(R)+4)%5]+(R)+1;
#else/*循环版本*/
#定义R256(p0、p1、p2、p3、ROT、rNum)\
第256轮(p0、p1、p2、p3、ROT、rNum)\
#定义I256(R)\
X0+=ks[r+(r)+0];/*注入密钥计划值*/\
X1+=ks[r+(r)+1]+ts[r+(r)+0]\
X2+=ks[r+(r)+2]+ts[r+(r)+1]\
X3+=ks[r+(r)+3]+r+(r)\
ks[r+(r)+4]=ks[r+(r)-1];/*旋转键明细表*/\
ts[r+(r)+2]=ts[r+(r)-1];
对于(r=1;r<2*RCNT;r+=2*SKEIN_UNROLL_256)/*循环通过它*/
#恩迪夫
{    
#定义R256_8_轮(R)\
R256(0,1,2,3,R_256_0,8*(R)+1)\
R256(0,3,2,1,R_256_1,8*(R)+2)\
R256(0,1,2,3,R_256_2,8*(R)+3)\
R256(0,3,2,1,R_256_3,8*(R)+4)\
I256(2*(R))\
R256(0,1,2,3,R_256_4,8*(R)+5)\
R256(0,3,2,1,R_256_5,8*(R)+6)\
R256(0,1,2,3,R_256_6,8*(R)+7)\
R256(0,3,2,1,R_256_7,8*(R)+8)\
I256(2*(R)+1);
R256_8_轮(0);
#定义R256展开(NN)((展开256==0和展开256轮总数/8>(NN))(展开256>(NN)))
#如果R256展开(1)
R256_8_轮(1);
#恩迪夫
#如果R256展开(2)
R256_8_轮(2);
#恩迪夫
#如果R256展开(3)
R256_8_轮(3);
#恩迪夫
#如果R256展开(4)
R256_8_轮(4);
#恩迪夫
#如果R256展开(5)
R256_8_轮(5);
#恩迪夫
#如果R256展开(6)
R256_8_轮(6);
#恩迪夫
#如果R256展开(7)
R256_8_轮(7);
#恩迪夫
#如果R256展开(8)
R256_8_轮(8);
#恩迪夫
#如果R256展开(9)
R256_8_轮(9);
#恩迪夫
#如果R256展开(10)
R256_8_轮(10);
#恩迪夫
#如果R256展开(11)
R256_8_轮(11);
#恩迪夫
#如果R256展开(12)
R256_8_轮(12);
#恩迪夫
#如果R256展开(13)
R256_8_轮(13);
#恩迪夫
#如果R256展开(14)
R256_8_轮(14);
#恩迪夫
#如果(绞纱展开256>14)
#错误“Skein_256_进程_块中需要更多展开”
#恩迪夫
}
//当我注释接下来的4行时,编译就可以了
ctx->X[0]=X0^w[0];
ctx->X[1]=X1^w[1];
ctx->X[2]=X2^w[2];
ctx->X[3]=X3^w[3];
ts[1]&=~SKEIN_T1_FLAG_FIRST;
}
而(--blkCnt);
ctx->h.T[0]=ts[0];
ctx->h.T[1]=ts[1];
}
__内核无效更新(uu全局Skein_256_Ctxt_t*gctx){
int glId=get_global_id(0);
int lId=get_local_id(0);
int grId=get_group_id(0);
//TODO现在是每个块一个hashState,请尝试每个线程一个hs以加速计算
gctx+=网格;
Skein_256_Ctxt_t ctx;
ctx.h.hashBitLen=256;
Skein_256_进程块(&ctx,&ctx.b,1,Skein_256_块字节);
*gctx=ctx;
}

当我注释标记行(我用大写字母写了这一行)或更改值(不是两个)时,编译是正常的。有人能帮我吗?

在我看来,您似乎忘记指定要使用的内存类型了