内联PTX CUDA中向量的简单添加
我尝试编写一段简单的代码,将向量 V1 与 V2 相加,并把结果保存在 V3 中。下面的 CUDA 版本工作正常,但我无法用内联 PTX 写出等价的代码,有人能帮忙吗?可以正常工作的 CUDA 内核如下:
// Plain CUDA C version of the float4 vector add: one thread per float4 element.
// Only threadIdx.x is used for indexing, so the launch must be a single block
// wide enough to cover the data; there is no bounds check.
__global__ void addKernelPTXv4(float4 *cc, const float4 *aa, const float4 *bb)
{
    const int idx = threadIdx.x;
    const float4 va = aa[idx];
    const float4 vb = bb[idx];
    float4 sum;
    sum.x = va.x + vb.x;
    sum.y = va.y + vb.y;
    sum.z = va.z + vb.z;
    sum.w = va.w + vb.w;
    cc[idx] = sum;
}
当我在运行 PTX 版本的代码之后打印向量 CC 时,它到处都是零。你们能告诉我代码哪里有问题吗?
//////////////加
在Visual Studio 2015中编译ptx/asm代码后,我检查ptx输出文件:
// PTX emitted for the first inline-asm attempt. The entry is the mangled name
// of addKernelPTXv4(float4*, float4 const*, float4 const*).
// .globl _Z14addKernelPTXv4P6float4PKS_S2_
.visible .entry _Z14addKernelPTXv4P6float4PKS_S2_(
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_0,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_1,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_2
)
{
// Registers are declared but the body does nothing: the asm statement was
// eliminated by the compiler (it was not marked volatile / had no observable
// outputs), leaving only the bare return.
.reg .b64 %rd<5>;
ret;
}
是……空的。那么问题到底出在哪里呢?
编译CUDA版本时,输出PTX为:
// PTX emitted for the plain CUDA C version of the kernel.
// .globl _Z14addKernelPTXv4P6float4PKS_S2_
.visible .entry _Z14addKernelPTXv4P6float4PKS_S2_(
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_0,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_1,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_2
)
{
.reg .f32 %f<21>;
.reg .b32 %r<2>;
.reg .b64 %rd<11>;
// Load the three pointer parameters: param_0 = cc, param_1 = aa, param_2 = bb.
ld.param.u64 %rd1, [_Z14addKernelPTXv4P6float4PKS_S2__param_0];
ld.param.u64 %rd2, [_Z14addKernelPTXv4P6float4PKS_S2__param_1];
ld.param.u64 %rd3, [_Z14addKernelPTXv4P6float4PKS_S2__param_2];
// Convert the generic pointers to global-address-space addresses.
cvta.to.global.u64 %rd4, %rd1;   // cc
cvta.to.global.u64 %rd5, %rd3;   // bb
cvta.to.global.u64 %rd6, %rd2;   // aa
mov.u32 %r1, %tid.x;             // i = threadIdx.x
mul.wide.s32 %rd7, %r1, 16;      // byte offset = i * 16 (sizeof(float4))
add.s64 %rd8, %rd6, %rd7;        // &aa[i]
add.s64 %rd9, %rd5, %rd7;        // &bb[i]
add.s64 %rd10, %rd4, %rd7;       // &cc[i]
ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd8];   // aa[i].{x,y,z,w}
ld.global.v4.f32 {%f5, %f6, %f7, %f8}, [%rd9];   // bb[i].{x,y,z,w}
// Component-wise sums (computed w,z,y,x; stored back in x,y,z,w order).
add.f32 %f11, %f4, %f8;
add.f32 %f14, %f3, %f7;
add.f32 %f17, %f2, %f6;
add.f32 %f20, %f1, %f5;
st.global.v4.f32 [%rd10], {%f20, %f17, %f14, %f11};   // cc[i] = aa[i] + bb[i]
ret;
}
为什么编译器把我的代码清除(优化)掉了?问题出在哪里?下面是修正后的代码:
// Working inline-PTX rewrite of the float4 vector add.
// Key differences from the failing attempt: the asm statement is `volatile`
// and declares a "memory" clobber, so the compiler may not delete it as dead
// code or reorder memory accesses around it.
__global__ void addKernelPTXv4(float4 *ccc, const float4 *aaa, const float4 *bbb)
{
asm volatile ("{ \n\t"
// Virtual PTX registers local to this block: two float4 inputs (aa0..aa3,
// bb0..bb3), the sum (cc0..cc3), the thread index, and three addresses.
".reg.f32 aa<4>, bb<4>, cc<4>; \n\t"
".reg.s32 rr0; \n\t"
".reg.s64 rrd<4>; \n\t"
// rr0 = threadIdx.x
"mov.s32 rr0, %tid.x; \n\t"
// rrd0 = rr0 * 16: byte offset of element i (sizeof(float4) == 16).
"mul.wide.s32 rrd0, rr0, 16; \n\t"
// %0 = ccc, %1 = aaa, %2 = bbb (the "l" input operands below).
"add.s64 rrd1, %0, rrd0; \n\t"
"add.s64 rrd2, %1, rrd0; \n\t"
"add.s64 rrd3, %2, rrd0; \n\t"
// NOTE(review): these ld.global/st.global use the raw parameter pointers
// without a cvta.to.global conversion (the compiler-generated PTX for the C
// version does convert) — this relies on generic and global addresses
// coinciding for these buffers; confirm on the target platform.
"ld.global.v4.f32 { aa0, aa1, aa2, aa3 }, [rrd2]; \n\t"
"ld.global.v4.f32 { bb0, bb1, bb2, bb3 }, [rrd3]; \n\t"
// Component-wise float adds: cc = aa + bb.
"add.f32 cc0, aa0, bb0; \n\t"
"add.f32 cc1, aa1, bb1; \n\t"
"add.f32 cc2, aa2, bb2; \n\t"
"add.f32 cc3, aa3, bb3; \n\t"
"st.global.v4.f32 [rrd1], { cc0, cc1, cc2, cc3 }; \n\t"
"} \n\t"
:
: "l"(ccc), "l"(aaa), "l"(bbb)
// "memory" clobber: the asm reads and writes memory not listed as operands.
: "memory"
);
}
汇编后:
// PTX emitted for the inline-asm kernel: the asm block is pasted verbatim,
// with %0/%1/%2 substituted by the parameter registers %rd1/%rd2/%rd3.
// .globl _Z14addKernelPTXv4P6float4PKS_S2_
.visible .entry _Z14addKernelPTXv4P6float4PKS_S2_(
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_0,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_1,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_2
)
{
.reg .b64 %rd<4>;
// Load the three pointer parameters (ccc, aaa, bbb).
ld.param.u64 %rd1, [_Z14addKernelPTXv4P6float4PKS_S2__param_0];
ld.param.u64 %rd2, [_Z14addKernelPTXv4P6float4PKS_S2__param_1];
ld.param.u64 %rd3, [_Z14addKernelPTXv4P6float4PKS_S2__param_2];
// inline asm
{
.reg.f32 aa<4>, bb<4>, cc<4>;
.reg.s32 rr0;
.reg.s64 rrd<4>;
mov.s32 rr0, %tid.x;
mul.wide.s32 rrd0, rr0, 16;
add.s64 rrd1, %rd1, rrd0;
add.s64 rrd2, %rd2, rrd0;
add.s64 rrd3, %rd3, rrd0;
// NOTE(review): unlike the compiler-generated version there is no
// cvta.to.global before these global accesses — the generic parameter
// addresses are used directly.
ld.global.v4.f32 { aa0, aa1, aa2, aa3 }, [rrd2];
ld.global.v4.f32 { bb0, bb1, bb2, bb3 }, [rrd3];
add.f32 cc0, aa0, bb0;
add.f32 cc1, aa1, bb1;
add.f32 cc2, aa2, bb2;
add.f32 cc3, aa3, bb3;
st.global.v4.f32 [rrd1], { cc0, cc1, cc2, cc3 };
}
// inline asm
ret;
}
// PTX emitted by the compiler for the plain C version (shown for comparison).
// .globl _Z14addKernelPTXv4P6float4PKS_S2_
.visible .entry _Z14addKernelPTXv4P6float4PKS_S2_(
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_0,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_1,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_2
)
{
.reg .f32 %f<21>;
.reg .b32 %r<2>;
.reg .b64 %rd<11>;
// Load parameters (param_0 = output, param_1/param_2 = inputs).
ld.param.u64 %rd1, [_Z14addKernelPTXv4P6float4PKS_S2__param_0];
ld.param.u64 %rd2, [_Z14addKernelPTXv4P6float4PKS_S2__param_1];
ld.param.u64 %rd3, [_Z14addKernelPTXv4P6float4PKS_S2__param_2];
// Generic -> global address-space conversion.
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
mov.u32 %r1, %tid.x;          // i = threadIdx.x
mul.wide.s32 %rd7, %r1, 16;   // byte offset = i * sizeof(float4)
add.s64 %rd8, %rd6, %rd7;
add.s64 %rd9, %rd5, %rd7;
add.s64 %rd10, %rd4, %rd7;
// Vectorized 128-bit loads, four scalar adds, one 128-bit store.
ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd8];
ld.global.v4.f32 {%f5, %f6, %f7, %f8}, [%rd9];
add.f32 %f11, %f4, %f8;
add.f32 %f14, %f3, %f7;
add.f32 %f17, %f2, %f6;
add.f32 %f20, %f1, %f5;
st.global.v4.f32 [%rd10], {%f20, %f17, %f14, %f11};
ret;
}
因此,从 CUDA 直接编译得到的 PTX 代码,比 VS2015 下内联 asm 版本生成的还要略短一些。

评论:
1. 这不是一个完整的回答,但请尝试将此代码与同一内核的 C 版本生成的 PTX 进行比较。
2. 请提供一个可验证的示例,即包含此内核的完整小程序。
3. 您把 cc 声明为仅输出操作数("=l")。由于您也在读取该值,也许应该用 "+l"?—— 我把 "=l" 改成了 "+l",结果还是一样,所以这没有帮助。
4. 请添加并使用 asm volatile。目前您的 asm 代码没有声明为 volatile,因此编译器可以自由地将它优化掉。
// Reference CUDA C version of the kernel (no inline PTX), kept for comparison
// with the PTX listing below. One thread handles one float4 element; only
// threadIdx.x is used, so a single-block launch must cover all elements,
// and there is no bounds check.
__global__ void addKernelPTXv4(float4 *c, const float4 *a, const float4 *b)
{
int i = threadIdx.x;
// Component-wise float4 add: c[i] = a[i] + b[i].
c[i].x = a[i].x + b[i].x;
c[i].y = a[i].y + b[i].y;
c[i].z = a[i].z + b[i].z;
c[i].w = a[i].w + b[i].w;
}
// PTX generated from the C kernel directly above.
// .globl _Z14addKernelPTXv4P6float4PKS_S2_
.visible .entry _Z14addKernelPTXv4P6float4PKS_S2_(
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_0,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_1,
.param .u64 _Z14addKernelPTXv4P6float4PKS_S2__param_2
)
{
.reg .f32 %f<21>;
.reg .b32 %r<2>;
.reg .b64 %rd<11>;
// Load parameters: param_0 = c (output), param_1 = a, param_2 = b.
ld.param.u64 %rd1, [_Z14addKernelPTXv4P6float4PKS_S2__param_0];
ld.param.u64 %rd2, [_Z14addKernelPTXv4P6float4PKS_S2__param_1];
ld.param.u64 %rd3, [_Z14addKernelPTXv4P6float4PKS_S2__param_2];
// Generic -> global address-space conversion for all three pointers.
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
mov.u32 %r1, %tid.x;          // i = threadIdx.x
mul.wide.s32 %rd7, %r1, 16;   // byte offset = i * sizeof(float4)
add.s64 %rd8, %rd6, %rd7;     // &a[i]
add.s64 %rd9, %rd5, %rd7;     // &b[i]
add.s64 %rd10, %rd4, %rd7;    // &c[i]
ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd8];
ld.global.v4.f32 {%f5, %f6, %f7, %f8}, [%rd9];
add.f32 %f11, %f4, %f8;
add.f32 %f14, %f3, %f7;
add.f32 %f17, %f2, %f6;
add.f32 %f20, %f1, %f5;
st.global.v4.f32 [%rd10], {%f20, %f17, %f14, %f11};   // c[i] = a[i] + b[i]
ret;
}