C++ 如何优化SIMD转置功能(8x4=>;4x8)?
我需要用AVX优化8x4和4x8浮点矩阵的转置。我用Agner Fog的 teal任务-构建BVH和sum min-max。转置用于每个循环的最后阶段(它们也通过多线程进行优化,但任务可能非常多) 代码现在看起来像:C++ 如何优化SIMD转置功能(8x4=>;4x8)?,c++,matrix,simd,avx,C++,Matrix,Simd,Avx,我需要用AVX优化8x4和4x8浮点矩阵的转置。我用Agner Fog的 teal任务-构建BVH和sum min-max。转置用于每个循环的最后阶段(它们也通过多线程进行优化,但任务可能非常多) 代码现在看起来像: void transpose(register Vec4f (&fin)[8], register Vec8f (&mat)[4]) { for (int i = 0;i < 8;i++) { fin[i] = lookup<28
void transpose(register Vec4f (&fin)[8], register Vec8f (&mat)[4]) {
for (int i = 0;i < 8;i++) {
fin[i] = lookup<28>(Vec4i(0, 8, 16, 24) + i, (float *)mat);
}
}
void转置(寄存器Vec4f(&fin)[8],寄存器Vec8f(&mat)[4]){
对于(int i=0;i<8;i++){
fin[i]=查找(Vec4i(0,8,16,24)+i,(float*)mat);
}
}
需要优化的变体。如何为SIMD优化此功能
我最近用vectorclass编写了自己的转置变体(4x8和8x4)。版本1.0
void transpose(register Vec4f(&fin)[8], register Vec8f(&mat)[4]) {
register Vec8f a00 = blend8f<0, 8, 1, 9, 2, 10, 3, 11>(mat[0], mat[1]);
register Vec8f a10 = blend8f<0, 8, 1, 9, 2, 10, 3, 11>(mat[2], mat[3]);
register Vec8f a01 = blend8f<4, 12, 5, 13, 6, 14, 7, 15>(mat[0], mat[1]);
register Vec8f a11 = blend8f<4, 12, 5, 13, 6, 14, 7, 15>(mat[2], mat[3]);
register Vec8f v0_1 = blend8f<0, 1, 8, 9, 2, 3, 10, 11>(a00, a10);
register Vec8f v2_3 = blend8f<4, 5, 12, 13, 6, 7, 14, 15>(a00, a10);
register Vec8f v4_5 = blend8f<0, 1, 8, 9, 2, 3, 10, 11>(a01, a11);
register Vec8f v6_7 = blend8f<4, 5, 12, 13, 6, 7, 14, 15>(a01, a11);
fin[0] = v0_1.get_low();
fin[1] = v0_1.get_high();
fin[2] = v2_3.get_low();
fin[3] = v2_3.get_high();
fin[4] = v4_5.get_low();
fin[5] = v4_5.get_high();
fin[6] = v6_7.get_low();
fin[7] = v6_7.get_high();
}
void transpose(register Vec8f(&fin)[4], register Vec4f(&mat)[8]) {
register Vec8f a0_1 = Vec8f(mat[0], mat[1]);
register Vec8f a2_3 = Vec8f(mat[2], mat[3]);
register Vec8f a4_5 = Vec8f(mat[4], mat[5]);
register Vec8f a6_7 = Vec8f(mat[6], mat[7]);
register Vec8f a00 = blend8f<0, 4, 8 , 12, 1, 5, 9 , 13>(a0_1, a2_3);
register Vec8f a10 = blend8f<0, 4, 8 , 12, 1, 5, 9 , 13>(a4_5, a6_7);
register Vec8f a01 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(a0_1, a2_3);
register Vec8f a11 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(a4_5, a6_7);
fin[0] = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(a00, a10);
fin[1] = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(a00, a10);
fin[2] = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(a01, a11);
fin[3] = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(a01, a11);
}
void转置(寄存器Vec4f(&fin)[8],寄存器Vec8f(&mat)[4]){
寄存器vec8fa00=blend8f(mat[0],mat[1]);
寄存器vec8fa10=blend8f(mat[2],mat[3]);
寄存器vec8fa01=blend8f(mat[0],mat[1]);
寄存器vec8fa11=blend8f(mat[2],mat[3]);
寄存器Vec8f v0_1=blend8f(a00,a10);
寄存器Vec8f v2_3=blend8f(a00,a10);
寄存器Vec8f v4_5=blend8f(a01,a11);
寄存器Vec8f v6_7=blend8f(a01,a11);
fin[0]=v0_1.获取_低();
fin[1]=v0_1.获得_高();
fin[2]=v2_3.get_low();
fin[3]=v2_3.get_high();
fin[4]=v4_5.get_low();
fin[5]=v4_5.获得_高();
fin[6]=v6_7.get_low();
fin[7]=v6_7.获取_高();
}
无效转置(寄存器Vec8f(&fin)[4],寄存器Vec4f(&mat)[8]){
寄存器vec8fa0_1=Vec8f(mat[0],mat[1]);
寄存器vec8fa2_3=Vec8f(mat[2],mat[3]);
寄存器Vec8f a4_5=Vec8f(mat[4],mat[5]);
寄存器Vec8f a6_7=Vec8f(mat[6],mat[7]);
寄存器vec8fa00=blend8f(a0_1,a2_3);
寄存器vec8fa10=blend8f(a4_5,a6_7);
寄存器vec8fa01=blend8f(a0_1,a2_3);
寄存器vec8fa11=blend8f(a4_5,a6_7);
翅片[0]=blend8f(a00,a10);
翅片[1]=blend8f(a00,a10);
翅片[2]=blend8f(a01,a11);
翅片[3]=blend8f(a01,a11);
}
需要2.0版 我没有使用vectorclass库的经验,但是通过简单地查看
查找
模板函数的源代码,您似乎在做一些效率极低的事情
我在下面用SSE/AVX内部函数提出了一个简单有效的解决方案。我不知道如何用vectorclass
库对其进行完整编码。但是,您可以使用转换运算符从类Vec4f
和Vec8f
中提取原始数据,如\uuuuum128
和\uuuuum256
。适当的构造函数允许您将原始结果转换回向量类
在带有内部函数的纯SSE中,在头
xmmintrin.h
中有一个宏\u MM\u TRANSPOSE4\u PS
。它将4x4浮点矩阵与单独128位寄存器中的每一行进行转换。如果您只有SSE(即没有AVX),那么只需调用此宏两次就可以了。
代码如下:
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) { \
__m128 tmp3, tmp2, tmp1, tmp0; \
tmp0 = _mm_shuffle_ps(row0, row1, 0x44); \
tmp2 = _mm_shuffle_ps(row0, row1, 0xEE); \
tmp1 = _mm_shuffle_ps(row2, row3, 0x44); \
tmp3 = _mm_shuffle_ps(row2, row3, 0xEE); \
row0 = _mm_shuffle_ps(tmp0, tmp1, 0x88); \
row1 = _mm_shuffle_ps(tmp0, tmp1, 0xDD); \
row2 = _mm_shuffle_ps(tmp2, tmp3, 0x88); \
row3 = _mm_shuffle_ps(tmp2, tmp3, 0xDD); \
}
void Transpose8x4(__m256 dst[4], __m128 src[8]) {
__m256 row0 = _mm256_setr_m128(src[0], src[4]);
__m256 row1 = _mm256_setr_m128(src[1], src[5]);
__m256 row2 = _mm256_setr_m128(src[2], src[6]);
__m256 row3 = _mm256_setr_m128(src[3], src[7]);
__m256 tmp3, tmp2, tmp1, tmp0;
tmp0 = _mm256_shuffle_ps(row0, row1, 0x44);
tmp2 = _mm256_shuffle_ps(row0, row1, 0xEE);
tmp1 = _mm256_shuffle_ps(row2, row3, 0x44);
tmp3 = _mm256_shuffle_ps(row2, row3, 0xEE);
row0 = _mm256_shuffle_ps(tmp0, tmp1, 0x88);
row1 = _mm256_shuffle_ps(tmp0, tmp1, 0xDD);
row2 = _mm256_shuffle_ps(tmp2, tmp3, 0x88);
row3 = _mm256_shuffle_ps(tmp2, tmp3, 0xDD);
dst[0] = row0; dst[1] = row1; dst[2] = row2; dst[3] = row3;
}
在AVX中,具有256位操作数的指令通常只对操作数的两半(称为通道)执行SSE等效运算。而且内在的\umm256\u shuffle\u ps
也不例外:它只是像它的\u mm
等价物一样洗牌两个128位通道。If表示如果我们将宏中的\u mm
前缀更改为\u mm256
前缀,它将转置两个4x4矩阵:一个位于四个256位寄存器的下通道,另一个位于四个256位寄存器的上通道。我们只需将生成的256位寄存器分成两半,并正确排序即可
产生的代码如下所示。我已经检查过它是否正常工作。它似乎只有12条指令,所以我想它会非常快
void Transpose4x8(__m128 dst[8], __m256 src[4]) {
__m256 row0 = src[0], row1 = src[1], row2 = src[2], row3 = src[3];
__m256 tmp3, tmp2, tmp1, tmp0;
tmp0 = _mm256_shuffle_ps(row0, row1, 0x44);
tmp2 = _mm256_shuffle_ps(row0, row1, 0xEE);
tmp1 = _mm256_shuffle_ps(row2, row3, 0x44);
tmp3 = _mm256_shuffle_ps(row2, row3, 0xEE);
row0 = _mm256_shuffle_ps(tmp0, tmp1, 0x88);
row1 = _mm256_shuffle_ps(tmp0, tmp1, 0xDD);
row2 = _mm256_shuffle_ps(tmp2, tmp3, 0x88);
row3 = _mm256_shuffle_ps(tmp2, tmp3, 0xDD);
dst[0] = _mm256_castps256_ps128(row0);
dst[1] = _mm256_castps256_ps128(row1);
dst[2] = _mm256_castps256_ps128(row2);
dst[3] = _mm256_castps256_ps128(row3);
dst[4] = _mm256_extractf128_ps(row0, 1);
dst[5] = _mm256_extractf128_ps(row1, 1);
dst[6] = _mm256_extractf128_ps(row2, 1);
dst[7] = _mm256_extractf128_ps(row3, 1);
}
更新逆换位的方法完全相同,只是有些事情的顺序相反。代码如下:
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) { \
__m128 tmp3, tmp2, tmp1, tmp0; \
tmp0 = _mm_shuffle_ps(row0, row1, 0x44); \
tmp2 = _mm_shuffle_ps(row0, row1, 0xEE); \
tmp1 = _mm_shuffle_ps(row2, row3, 0x44); \
tmp3 = _mm_shuffle_ps(row2, row3, 0xEE); \
row0 = _mm_shuffle_ps(tmp0, tmp1, 0x88); \
row1 = _mm_shuffle_ps(tmp0, tmp1, 0xDD); \
row2 = _mm_shuffle_ps(tmp2, tmp3, 0x88); \
row3 = _mm_shuffle_ps(tmp2, tmp3, 0xDD); \
}
void Transpose8x4(__m256 dst[4], __m128 src[8]) {
__m256 row0 = _mm256_setr_m128(src[0], src[4]);
__m256 row1 = _mm256_setr_m128(src[1], src[5]);
__m256 row2 = _mm256_setr_m128(src[2], src[6]);
__m256 row3 = _mm256_setr_m128(src[3], src[7]);
__m256 tmp3, tmp2, tmp1, tmp0;
tmp0 = _mm256_shuffle_ps(row0, row1, 0x44);
tmp2 = _mm256_shuffle_ps(row0, row1, 0xEE);
tmp1 = _mm256_shuffle_ps(row2, row3, 0x44);
tmp3 = _mm256_shuffle_ps(row2, row3, 0xEE);
row0 = _mm256_shuffle_ps(tmp0, tmp1, 0x88);
row1 = _mm256_shuffle_ps(tmp0, tmp1, 0xDD);
row2 = _mm256_shuffle_ps(tmp2, tmp3, 0x88);
row3 = _mm256_shuffle_ps(tmp2, tmp3, 0xDD);
dst[0] = row0; dst[1] = row1; dst[2] = row2; dst[3] = row3;
}
向量类库(VCL)使用模板元编程来确定用于排列和混合的最佳本质。然而,当涉及到排列和混合时,您通常仍然需要了解硬件的限制以获得最佳结果
我将Stgatilov已经很好的答案转换为使用VCL,它产生了理想的汇编(八次随机播放)。以下是函数:
void tran8x4_AVX(float *a, float *b) {
Vec8f tmp0, tmp1, tmp2, tmp3;
Vec8f row0, row1, row2, row3;
row0 = Vec8f().load(&a[8*0]);
row1 = Vec8f().load(&a[8*1]);
row2 = Vec8f().load(&a[8*2]);
row3 = Vec8f().load(&a[8*3]);
tmp0 = blend8f<0, 1, 8, 9, 4, 5, 12, 13>(row0, row1);
tmp2 = blend8f<2, 3, 10, 11, 6, 7, 14, 15>(row0, row1);
tmp1 = blend8f<0, 1, 8, 9, 4, 5, 12, 13>(row2, row3);
tmp3 = blend8f<2, 3, 10, 11, 6, 7, 14, 15>(row2, row3);
row0 = blend8f<0, 2, 8, 10, 4, 6, 12, 14>(tmp0, tmp1);
row1 = blend8f<1, 3, 9, 11, 5, 7, 13, 15>(tmp0, tmp1);
row2 = blend8f<0, 2, 8, 10, 4, 6, 12, 14>(tmp2, tmp3);
row3 = blend8f<1, 3, 9, 11, 5, 7, 13, 15>(tmp2, tmp3);
row0.get_low().store(&b[ 4*0]);
row1.get_low().store(&b[ 4*1]);
row2.get_low().store(&b[ 4*2]);
row3.get_low().store(&b[ 4*3]);
row0.get_high().store(&b[ 4*4]);
row1.get_high().store(&b[ 4*5]);
row2.get_high().store(&b[ 4*6]);
row3.get_high().store(&b[ 4*7]);
}
这是一个完整的测试
#include <stdio.h>
#include "vectorclass.h"
void tran8x4(float *a, float *b) {
for(int i=0; i<4; i++) {
for(int j=0; j<8; j++) {
b[j*4+i] = a[i*8+j];
}
}
}
void tran8x4_AVX(float *a, float *b) {
Vec8f tmp0, tmp1, tmp2, tmp3;
Vec8f row0, row1, row2, row3;
row0 = Vec8f().load(&a[8*0]);
row1 = Vec8f().load(&a[8*1]);
row2 = Vec8f().load(&a[8*2]);
row3 = Vec8f().load(&a[8*3]);
tmp0 = blend8f<0, 1, 8, 9, 4, 5, 12, 13>(row0, row1);
tmp2 = blend8f<2, 3, 10, 11, 6, 7, 14, 15>(row0, row1);
tmp1 = blend8f<0, 1, 8, 9, 4, 5, 12, 13>(row2, row3);
tmp3 = blend8f<2, 3, 10, 11, 6, 7, 14, 15>(row2, row3);
row0 = blend8f<0, 2, 8, 10, 4, 6, 12, 14>(tmp0, tmp1);
row1 = blend8f<1, 3, 9, 11, 5, 7, 13, 15>(tmp0, tmp1);
row2 = blend8f<0, 2, 8, 10, 4, 6, 12, 14>(tmp2, tmp3);
row3 = blend8f<1, 3, 9, 11, 5, 7, 13, 15>(tmp2, tmp3);
row0.get_low().store(&b[ 4*0]);
row1.get_low().store(&b[ 4*1]);
row2.get_low().store(&b[ 4*2]);
row3.get_low().store(&b[ 4*3]);
row0.get_high().store(&b[ 4*4]);
row1.get_high().store(&b[ 4*5]);
row2.get_high().store(&b[ 4*6]);
row3.get_high().store(&b[ 4*7]);
}
int main() {
float a[32], b1[32], b2[32];
for(int i=0; i<32; i++) a[i] = i;
for(int i=0; i<4; i++) {
for(int j=0; j<8; j++) {
printf("%2.0f ", a[i*8+j]);
} puts("");
}
tran8x4(a,b1);
tran8x4_AVX(a,b2);
puts("");
for(int i=0; i<8; i++) {
for(int j=0; j<4; j++) {
printf("%2.0f ", b1[i*4+j]);
} puts("");
}
puts("");
for(int i=0; i<8; i++) {
for(int j=0; j<4; j++) {
printf("%2.0f ", b2[i*4+j]);
} puts("");
}
}
#包括
#包括“vectorclass.h”
void tran8x4(浮动*a,浮动*b){
对于(inti=0;iSorry,但还需要反向转置。遗憾的是,我没有在主要问题中说过。@user2454034:添加了反向转置,实际上完全相同。