2
我嘗試上的OpenCL內核移植功能,但編譯器給了我一個錯誤:的OpenCL編譯器未能編譯功能
cvmsErrorCompilerFailure: LLVM compiler has failed to compile a function.
這裏是我的內核代碼:
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
//##############################################################################
// Utils function
//##############################################################################
void mmemcpy(unsigned char *dst, const unsigned char *src, size_t len) {
for (size_t i = 0; i < len; i++)
dst[i] = src[i];
}
//##############################################################################
// Tree hashing constants definition
//##############################################################################
/* tweak word T[1]: bit field starting positions */
#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */
#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */
#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */
#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */
#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */
#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */
/* tweak word T[1]: flag bit definition(s) */
#define SKEIN_T1_FLAG_FIRST (((unsigned long) 1) << SKEIN_T1_POS_FIRST)
#define SKEIN_T1_FLAG_FINAL (((unsigned long) 1) << SKEIN_T1_POS_FINAL)
#define SKEIN_T1_FLAG_BIT_PAD (((unsigned long) 1) << SKEIN_T1_POS_BIT_PAD)
//##############################################################################
// Skein macros
//##############################################################################
#ifndef RotL_64
#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64-(N))))
#endif
//##############################################################################
// Skein block
//##############################################################################
#ifndef SKEIN_USE_ASM
#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
#endif
#ifndef SKEIN_LOOP
#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */
#endif
#define BLK_BITS (WCNT*64) /* some useful definitions for code here */
#define KW_TWK_BASE (0)
#define KW_KEY_BASE (3)
#define ks (kw + KW_KEY_BASE)
#define ts (kw + KW_TWK_BASE)
//##############################################################################
// Port settings
//##############################################################################
/* Platform settings */
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
/* Define platfor here */
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
/* here for big-endian */
#define SKEIN_NEED_SWAP (1)
#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
/* here for little-endian */
#define SKEIN_NEED_SWAP (0)
#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */
#define Skein_Put64_LSB_First(dst08, src64, bCnt) mmemcpy(dst08, src64, bCnt)
#define Skein_Get64_LSB_First(dst64, src08, wCnt) mmemcpy(dst64, src08, 8*(wCnt))
#endif
#endif
//##############################################################################
// Skein.h
//##############################################################################
enum {
/* Skein_256 round rotation constants */
R_256_0_0=14, R_256_0_1=16,
R_256_1_0=52, R_256_1_1=57,
R_256_2_0=23, R_256_2_1=40,
R_256_3_0= 5, R_256_3_1=37,
R_256_4_0=25, R_256_4_1=33,
R_256_5_0=46, R_256_5_1=12,
R_256_6_0=58, R_256_6_1=22,
R_256_7_0=32, R_256_7_1=32,
/* Skein_512 round rotation constants */
R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
/* Skein1024 round rotation constants */
R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
};
#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */
#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/
#endif
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((unsigned long) (hi32)) << 32))
#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
//##############################################################################
// Skein structures
//##############################################################################
enum {
SKEIN_SUCCESS = 0, /* return codes from Skein calls */
SKEIN_FAIL = 1,
SKEIN_BAD_HASHLEN = 2
};
#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */
#define SKEIN_256_STATE_WORDS (4)
#define SKEIN_512_STATE_WORDS (8)
#define SKEIN1024_STATE_WORDS (16)
#define SKEIN_MAX_STATE_WORDS (16)
#define SKEIN_256_STATE_BYTES (8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BYTES (8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BYTES (8*SKEIN1024_STATE_WORDS)
#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)
#define SKEIN_256_BLOCK_BYTES (8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_BLOCK_BYTES (8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_BLOCK_BYTES (8*SKEIN1024_STATE_WORDS)
#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */
#define SKEIN_512_ROUNDS_TOTAL (72)
#define SKEIN1024_ROUNDS_TOTAL (80)
typedef struct {
unsigned long hashBitLen; /* size of hash result, in bits */
unsigned long bCnt; /* current byte count in buffer b[] */
unsigned long T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */
} Skein_Ctxt_Hdr_t;
typedef struct { /* 256-bit Skein hash context structure */
Skein_Ctxt_Hdr_t h; /* common header context variables */
unsigned long X[SKEIN_256_STATE_WORDS]; /* chaining variables */
unsigned char b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
} Skein_256_Ctxt_t;
//##############################################################################
// 256-bit Skein
//##############################################################################
/***************************** Skein_256 ******************************/
void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const unsigned *blkPtr, unsigned long blkCnt, unsigned long byteCntAdd) { /* do it in C */
enum {
WCNT = SKEIN_256_STATE_WORDS
};
#undef RCNT
#define RCNT (SKEIN_256_ROUNDS_TOTAL/8)
#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
#else
#define SKEIN_UNROLL_256 (0)
#endif
#if SKEIN_UNROLL_256
#if (RCNT % SKEIN_UNROLL_256)
#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */
#endif
unsigned long r;
unsigned long kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
#else
unsigned long kw[WCNT+4]; /* key schedule words : chaining vars + tweak */
#endif
unsigned long X0, X1, X2, X3; /* local copy of context vars, for speed */
unsigned long w[WCNT]; /* local copy of input block */
/* never call with blkCnt == 0! */
if (!(blkCnt != 0))
return;
ts[0] = ctx->h.T[0];
ts[1] = ctx->h.T[1];
do {
/* this implementation only supports 2**64 input bytes (no carry out here) */
ts[0] += byteCntAdd; /* update processed length */
/* precompute the key schedule for this block */
ks[0] = ctx->X[0];
ks[1] = ctx->X[1];
ks[2] = ctx->X[2];
ks[3] = ctx->X[3];
ks[4] = ks[0]^ks[1]^ks[2]^ks[3]^SKEIN_KS_PARITY;
ts[2] = ts[0]^ts[1];
Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
X0 = w[0] + ks[0]; /* do the first full key injection */
X1 = w[1] + ks[1] + ts[0];
X2 = w[2] + ks[2] + ts[1];
X3 = w[3] + ks[3];
blkPtr += SKEIN_256_BLOCK_BYTES;
/* run the rounds */
// IN THIS MACRO WHEN I EXCHANGED X##p1 and X##p in RotL_64 for numbers 1, 2, THEN COMPILATION IS OK
#define Round256(p0, p1, p2, p3, ROT, rNum) \
X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
#if SKEIN_UNROLL_256 == 0
#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
Round256(p0, p1, p2, p3, ROT, rNum)
#define I256(R) \
X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \
X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \
X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \
X3 += ks[((R)+4) % 5] + (R)+1;
#else /* looping version */
#define R256(p0, p1, p2, p3, ROT, rNum) \
Round256(p0, p1, p2, p3, ROT, rNum) \
#define I256(R) \
X0 += ks[r+(R)+0]; /* inject the key schedule value */ \
X1 += ks[r+(R)+1] + ts[r+(R)+0]; \
X2 += ks[r+(R)+2] + ts[r+(R)+1]; \
X3 += ks[r+(R)+3] + r+(R) ; \
ks[r + (R)+4 ] = ks[r+(R)-1]; /* rotate key schedule */\
ts[r + (R)+2 ] = ts[r+(R)-1];
for (r = 1; r < 2*RCNT; r += 2*SKEIN_UNROLL_256) /* loop thru it */
#endif
{
#define R256_8_rounds(R) \
R256(0,1,2,3,R_256_0,8*(R) + 1); \
R256(0,3,2,1,R_256_1,8*(R) + 2); \
R256(0,1,2,3,R_256_2,8*(R) + 3); \
R256(0,3,2,1,R_256_3,8*(R) + 4); \
I256(2*(R)); \
R256(0,1,2,3,R_256_4,8*(R) + 5); \
R256(0,3,2,1,R_256_5,8*(R) + 6); \
R256(0,1,2,3,R_256_6,8*(R) + 7); \
R256(0,3,2,1,R_256_7,8*(R) + 8); \
I256(2*(R)+1);
R256_8_rounds(0);
#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
#if R256_Unroll_R(1)
R256_8_rounds(1);
#endif
#if R256_Unroll_R(2)
R256_8_rounds(2);
#endif
#if R256_Unroll_R(3)
R256_8_rounds(3);
#endif
#if R256_Unroll_R(4)
R256_8_rounds(4);
#endif
#if R256_Unroll_R(5)
R256_8_rounds(5);
#endif
#if R256_Unroll_R(6)
R256_8_rounds(6);
#endif
#if R256_Unroll_R(7)
R256_8_rounds(7);
#endif
#if R256_Unroll_R(8)
R256_8_rounds(8);
#endif
#if R256_Unroll_R(9)
R256_8_rounds(9);
#endif
#if R256_Unroll_R(10)
R256_8_rounds(10);
#endif
#if R256_Unroll_R(11)
R256_8_rounds(11);
#endif
#if R256_Unroll_R(12)
R256_8_rounds(12);
#endif
#if R256_Unroll_R(13)
R256_8_rounds(13);
#endif
#if R256_Unroll_R(14)
R256_8_rounds(14);
#endif
#if (SKEIN_UNROLL_256 > 14)
#error "need more unrolling in Skein_256_Process_Block"
#endif
}
// WHEN I COMMENT NEXT 4 LINES, THEN COMPILATION IS OK
ctx->X[0] = X0^w[0];
ctx->X[1] = X1^w[1];
ctx->X[2] = X2^w[2];
ctx->X[3] = X3^w[3];
ts[1] &= ~SKEIN_T1_FLAG_FIRST;
}
while (--blkCnt);
ctx->h.T[0] = ts[0];
ctx->h.T[1] = ts[1];
}
__kernel void update(__global Skein_256_Ctxt_t *gctx) {
int glId = get_global_id(0);
int lId = get_local_id(0);
int grId = get_group_id(0);
// TODO now is one hashState per block, try one hs per thread to accelerate computing
gctx += grId;
Skein_256_Ctxt_t ctx;
ctx.h.hashBitLen = 256;
Skein_256_Process_Block(&ctx, &ctx.b, 1, SKEIN_256_BLOCK_BYTES);
*gctx = ctx;
}
當我評論標記行(我用大寫字母寫這行)或更改值(不是兩個)編譯是好的。誰能幫我?
請包括來自編譯器的錯誤消息的詳細信息。 – grrussel 2011-04-19 10:11:44
在這種情況下,在另一個實現上嘗試相同的代碼可以提供有關該問題的更多詳細信息。您可能還想在主機中生成代碼,而不是在內核代碼中使用宏。 – 2011-04-19 15:07:39
沒有編譯器的詳細消息,只有我得到的是:** cvmsErrorCompilerFailure:LLVM編譯器無法編譯函數** – 2011-04-19 18:36:56