CUDA __threadfence() 同步与两个单独内核调用同步的性能比较

CUDA __threadfence() 同步与两个单独内核调用同步的性能比较,cuda,synchronization,Cuda,Synchronization,我试图理解如何使用 __threadfence(),因为它似乎是一个强大的同步原语,可以让不同的块一起工作,而不必经历结束内核和启动新内核的巨大麻烦。CUDA C编程指南中有一个例子(附录B.5),在SDK中的“threadFenceReduction”示例中得到了充实,因此它似乎是我们“应该”使用的东西 但是,当我尝试使用 __threadfence() 时,速度惊人地慢。有关示例,请参见下面的代码。据我所知,__threadfence() 应该在继续之前确保所有来自当前线程块的挂起内存传

我试图理解如何使用 __threadfence(),因为它似乎是一个强大的同步原语,可以让不同的块一起工作,而不必经历结束内核和启动新内核的巨大麻烦。CUDA C编程指南中有一个例子(附录B.5),在SDK中的“threadFenceReduction”示例中得到了充实,因此它似乎是我们“应该”使用的东西

但是,当我尝试使用 __threadfence() 时,速度惊人地慢。有关示例,请参见下面的代码。据我所知,__threadfence() 应该在继续之前确保所有来自当前线程块的挂起内存传输都已完成。我相信,内存延迟比一微秒要好一些,因此在GTX680上处理所包含代码中64KB内存传输的总时间应该在一微秒左右。相反,__threadfence() 指令似乎需要大约 20 微秒!与其使用 __threadfence() 进行同步,我可以直接结束内核,并启动一个全新的内核(在同一个默认流中,因此是同步的),所用时间还不到前者的三分之一!

这是怎么回事?我的代码中有我没有注意到的错误吗?或者 __threadfence() 真的比应有的速度慢 20 倍,并且比整个内核启动加清理还要慢 6 倍吗?

threadfence 内核运行 1000 次的时间:27.716831 ms
答案:120
仅运行前 3 行代码(包括 threadfence)1000 次的时间:25.962912 ms
不使用 threadfence、通过拆分为两个内核进行同步的时间:7.653344 ms
答案:120

#include "cuda.h"
#include <stdio.h>
__device__ unsigned int count=0;
__shared__ bool isLastBlockDone;
__device__ int scratch[16];
__device__ int junk[16000];
__device__ int answer;
__global__ void usethreadfence() // 就像 CUDA C 编程指南 B.5 中的代码示例一样
{
if(threadIdx.x==0)scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x; // 执行更多的内存写入,使内核变得不平凡
__threadfence();
if(threadIdx.x==0){
unsigned int value=atomicInc(&count,gridDim.x);
isLastBlockDone=(value==(gridDim.x-1));
}
__syncthreads();
if(isLastBlockDone&&threadIdx.x==0){
// 最后一个块对存储在 scratch[0..gridDim.x-1] 中的结果求和
int sum=0;

for(int i=0;i<…(原始代码自此处起在网页提取时被截断)

我已经在两个不同的卡上测试了用 CUDA 6.0 编译的代码:GT540M(Fermi 架构)和 K20c(Kepler 架构),结果如下

GT540M

Time for 1000 runs of the threadfence kernel: 303.373688 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 300.395416 ms
Synchronizing without threadfence, by splitting to two kernels: 597.729919 ms
Answer: 120
Kepler K20c

Time for 1000 runs of the threadfence kernel: 10.164096 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 8.808896 ms
Synchronizing without threadfence, by splitting to two kernels: 17.330784 ms
Answer: 120
我没有观察到 __threadfence() 相对于其他两种情况有任何特别缓慢的行为

这可以通过使用反汇编代码来证明

usethreadfence()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + blockIdx.x
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[blockIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = old count (atomicInc returns the old value); count = (old >= gridDim.x) ? 0 : old + 1
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/    @!P0 EXIT;                                                      if (!(isLastBlockDone && threadIdx.x == 0)) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;
    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */
    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................
    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................
justthreadfence()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + blockIdx.x
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[blockIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = old count (atomicInc returns the old value); count = (old >= gridDim.x) ? 0 : old + 1
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/    @!P0 EXIT;                                                      if (!(isLastBlockDone && threadIdx.x == 0)) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;
    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */
    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................
    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................
usetwokernels_1()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + blockIdx.x
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[blockIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = old count (atomicInc returns the old value); count = (old >= gridDim.x) ? 0 : old + 1
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/    @!P0 EXIT;                                                      if (!(isLastBlockDone && threadIdx.x == 0)) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;
    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */
    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................
    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................
usetwokernels_1()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + blockIdx.x
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[blockIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = old count (atomicInc returns the previous value); count = (count >= gridDim.x) ? 0 : count + 1
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/    @!P0 EXIT;                                                      if (!(isLastBlockDone && threadIdx.x == 0)) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;
    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */
    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................
    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................

可以看出,`justthreadfence()` 的指令严格包含在 `usethreadfence()` 的指令中,而 `usetwokernels_1()` 和 `usetwokernels_2()` 的指令实际上是 `justthreadfence()` 指令的一个划分。因此,时间上的差异可以归因于第二个内核的内核启动开销。

我已经在两块不同的显卡上测试了使用 CUDA 6.0 编译的代码:GT540M(Fermi 架构)和 K20c(Kepler 架构),结果如下:

GT540M

Time for 1000 runs of the threadfence kernel: 303.373688 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 300.395416 ms
Synchronizing without threadfence, by splitting to two kernels: 597.729919 ms
Answer: 120
Kepler K20c

Time for 1000 runs of the threadfence kernel: 10.164096 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 8.808896 ms
Synchronizing without threadfence, by splitting to two kernels: 17.330784 ms
Answer: 120
我没有观察到 `__threadfence()` 相比其他两种情况有任何特别缓慢的行为。

这可以通过使用反汇编代码来证明

usethreadfence()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + blockIdx.x
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[blockIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = old count (atomicInc returns the previous value); count = (count >= gridDim.x) ? 0 : count + 1
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/    @!P0 EXIT;                                                      if (!(isLastBlockDone && threadIdx.x == 0)) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;
    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */
    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................
    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................
justthreadfence()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + blockIdx.x
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[blockIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = old count (atomicInc returns the previous value); count = (count >= gridDim.x) ? 0 : count + 1
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/    @!P0 EXIT;                                                      if (!(isLastBlockDone && threadIdx.x == 0)) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;
    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */
    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................
    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................
usethreadfence()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + blockIdx.x  (i.e., &scratch[blockIdx.x])
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[blockIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = old count (atomicInc returns the old value); count = (old >= gridDim.x) ? 0 : old + 1
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/    @!P0 EXIT;                                                      if (!(isLastBlockDone && threadIdx.x == 0)) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;
    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */
    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................
    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................
usethreadfence()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + blockIdx.x  (i.e., &scratch[blockIdx.x])
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[blockIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = old count (atomicInc returns the old value); count = (old >= gridDim.x) ? 0 : old + 1
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/    @!P0 EXIT;                                                      if (!(isLastBlockDone && threadIdx.x == 0)) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;
    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */
    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................
    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................

可以看出,
justthreadfence()
的指令严格包含在
usethreadfence()
的指令中,而
usetwokernels_1()
和
usetwokernels_2()
的指令实际上是
justthreadfence()
指令的一个划分。因此,时间上的差异可以归因于第二个内核的内核启动开销。

我建议对每个版本独立计时(每次执行只运行其中一个版本)。您的内核执行时间可能会受到创建 CUDA 上下文所需时间的影响。有关 CUDA 上下文的详细信息,请参阅此问题/答案。您使用的是 CUDA 5.5 吗?如果是,能否改用 CUDA 5.0 重新测试?我一直在使用 CUDA 5.0。这些时间是这样得到的:把代码放入一个新建的 CUDA 5.0 Visual Studio 项目,保留所有默认设置,只是把编译目标改为 3.0 设备,并切换到 Release 模式。不过总的来说,我并不介意更换 CUDA 版本、换用不同的 CUDA 卡等等。我只是想了解如何编写高性能代码,却一直在这个问题上磕磕绊绊:一个本应是轻量级的同步原语,其开销似乎比人们预期的要高出几个数量级。我还没有在 Windows 上试过。在使用 CUDA 5.0 的 Linux 下,我始终得到的结果是,您的
__threadfence()
方法比您的双内核方法稍快一些(约 20%)。我已经在几种不同的机器/操作系统/GPU 上尝试过。我手头没有可以方便使用的、装有 GTX680 的 Windows 机器,但是我在我的 CUDA 5、Win7 64 位、Quadro 1000M(cc 2.1)笔记本电脑上试过您的代码,我包括