C++: How should I convert an __n128 into two __n64 registers (a uint8x8x2_t)?

I have an __n128 that I want to use as input to the vtbl2_u8 intrinsic, but it doesn't accept it. As far as I can tell, vreinterpret doesn't seem to have a variant that works on __n128, and these intrinsics seem to be picky about reinterpret casts. I'm more used to SSE2, so... any guidance on ARM NEON would be appreciated.

Tags: c++, arm, simd, neon

Edit: more specifically, can someone tell me why the following:
// Shuffle the 16 bytes of x according to the byte indices in mask
// (NEON analogue of SSSE3 PSHUFB / _mm_shuffle_epi8).
//
// vtbl2_u8 has no 128-bit (q-register) form: its table is a pair of 64-bit
// d-registers (uint8x8x2_t) and its index operand is a single d-register.
// The 128-bit input must therefore be split with vget_low_u8/vget_high_u8;
// copying raw n128_u64 values into the struct does not compile because
// __n64 has no constructor taking unsigned __int64 (the original error).
// On MSVC ARM, __n128 is the shared underlying type of all q-register
// vector types, so passing x/mask directly to the _u8 intrinsics is valid.
static __forceinline __n128 byteshuffle(
    _In_ const __n128& x,
    _In_ const __n128& mask)
{
    // Build the two-register lookup table from the halves of x.
    uint8x8x2_t in;
    in.val[0] = vget_low_u8(x);
    in.val[1] = vget_high_u8(x);

    // Look up each half of the result, then recombine into a q-register.
    const uint8x8_t lo = vtbl2_u8(in, vget_low_u8(mask));
    const uint8x8_t hi = vtbl2_u8(in, vget_high_u8(mask));
    return vcombine_u8(lo, hi);
}
does not compile? The error is "no suitable constructor exists to convert from 'const unsigned long long' to '__n64'" on both vtbl lines.

Answer: use the vreinterpret_X_Y family of intrinsics to take an existing register and "cast" its type to some other form so it can be passed to another intrinsic. For example, this code loads two 16-bit signed shorts in a single load as a 32-bit unsigned integer, but I have to use vreinterpret_s16_u32, because I don't actually want to treat the data as a uint32x2_t -- I want it to be an int16x4_t, which is exactly the same size in bytes (i.e. both map to a 64-bit d-register value).

Similarly, I used vtbl4_u8 for XMVectorPermute.

Note that the vtbl family is very powerful, but a bit involved to use. For "common" swizzle patterns I implemented template forms of XMVectorSwizzle and XMVectorPermute, so I can specialize the cases that don't need a full table lookup:
// General swizzle template: validates the four lane selectors at compile
// time, then defers to the runtime (vtbl-based) XMVectorSwizzle overload.
// Specializations below override the common patterns with cheaper NEON ops.
template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
inline XMVECTOR XMVectorSwizzle(FXMVECTOR V)
{
    // Each selector must name one of the four source lanes (0..3).
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

    // No specialization matched: fall back to the general table lookup.
    return XMVectorSwizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
}
// Specialized swizzles: patterns that map to dedicated NEON instructions
// instead of the general vtbl-based table lookup.
// Identity.
template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V)
{ return V; }
// Single-lane broadcasts (vdupq_lane).
// Fixed: the <0,0,0,0> declaration read "XMVECTORXMVectorSwizzle"
// (missing space between return type and name), a syntax error.
template<> inline XMVECTOR XMVectorSwizzle<0,0,0,0>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_low_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<1,1,1,1>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_low_f32(V), 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,2,2,2>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_high_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<3,3,3,3>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_high_f32(V), 1); }
// Pairwise reversals and half swaps (vrev64 / vget_low / vget_high / vcombine).
template<> inline XMVECTOR XMVectorSwizzle<1,0,3,2>(FXMVECTOR V)
{ return vrev64q_f32(V); }
template<> inline XMVECTOR XMVectorSwizzle<0,1,0,1>(FXMVECTOR V)
{ float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,2,3>(FXMVECTOR V)
{ float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,1,0>(FXMVECTOR V)
{ float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,3,2>(FXMVECTOR V)
{ float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<0,1,3,2>(FXMVECTOR V)
{ return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,2,3>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,1,0>(FXMVECTOR V)
{ return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,0,1>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,1,0>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); }
// Transpose / zip / unzip patterns.
template<> inline XMVECTOR XMVectorSwizzle<0,0,2,2>(FXMVECTOR V)
{ return vtrnq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,1,3,3>(FXMVECTOR V)
{ return vtrnq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,0,1,1>(FXMVECTOR V)
{ return vzipq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<2,2,3,3>(FXMVECTOR V)
{ return vzipq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,2,0,2>(FXMVECTOR V)
{ return vuzpq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,3,1,3>(FXMVECTOR V)
{ return vuzpq_f32(V,V).val[1]; }
// Lane rotations (vext).
template<> inline XMVECTOR XMVectorSwizzle<1,2,3,0>(FXMVECTOR V)
{ return vextq_f32(V, V, 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,0,1>(FXMVECTOR V)
{ return vextq_f32(V, V, 2); }
template<> inline XMVECTOR XMVectorSwizzle<3,0,1,2>(FXMVECTOR V)
{ return vextq_f32(V, V, 3); }
(The general swizzle template and its specializations are repeated below.)

Follow-up comment: that's an informative answer, but I think I still don't quite understand; I've edited my question -- could you take another look?

The key thing to note is that vtbl2_u8 does not operate on an __n128; it operates on two distinctly-typed __n64 registers. There is no full 128-bit version of the ARM NEON vtbl intrinsic. This is unlike, say, vadd_u8 (the 64-bit version) versus vaddq_u8 (the 128-bit version); that is, there is no vtblXq_Y intrinsic.

By the way, I found the ARM NEON intrinsics reference extremely useful when writing my ARM-NEON implementation. If you are using VS 2012, VS 2013, or VS 2015, take a look at DirectXMath in the Windows 8.x SDK that ships with the compiler. It is a fully inline header implementation, so you can see the C, SSE, and ARM-NEON versions of all functions side by side.
// DirectXMathVector.inl
// Runtime 4-lane swizzle: builds a byte-index vector from per-lane control
// words, then performs a two-register table lookup (vtbl2_u8) for each half
// of the result.  vtbl2_u8 only operates on 64-bit d-registers, hence the
// low/high split of V.
inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V,
    uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3)
{
    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );

    // Little-endian byte indices selecting each 32-bit lane of V.
    static const uint32_t ControlElement[ 4 ] =
    {
        0x03020100, // XM_SWIZZLE_X
        0x07060504, // XM_SWIZZLE_Y
        0x0B0A0908, // XM_SWIZZLE_Z
        0x0F0E0D0C, // XM_SWIZZLE_W
    };

    // The table feeding vtbl2_u8 must be uint8x8x2_t.  The original declared
    // int8x8x2_t and assigned float32x2_t halves directly, which only compiles
    // because MSVC ARM collapses all d-register types to __n64; the explicit
    // vreinterpret casts below are required by GCC/Clang and are free at runtime.
    uint8x8x2_t tbl;
    tbl.val[0] = vreinterpret_u8_f32( vget_low_f32(V) );
    tbl.val[1] = vreinterpret_u8_f32( vget_high_f32(V) );

    // Low half: lanes E0, E1.
    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0])
        | (((uint64_t)ControlElement[E1]) << 32) );
    const uint8x8_t rL = vtbl2_u8( tbl, vreinterpret_u8_u32(idx) );

    // High half: lanes E2, E3.
    idx = vcreate_u32( ((uint64_t)ControlElement[E2])
        | (((uint64_t)ControlElement[E3]) << 32) );
    const uint8x8_t rH = vtbl2_u8( tbl, vreinterpret_u8_u32(idx) );

    return vreinterpretq_f32_u8( vcombine_u8(rL, rH) );
}
// General swizzle template.  Lane selectors are checked at compile time;
// anything not handled by a specialization below goes through the runtime
// table-lookup implementation of XMVectorSwizzle.
template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
inline XMVECTOR XMVectorSwizzle(FXMVECTOR V)
{
    // Valid lane indices are 0..3 for a 4-lane vector.
    static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
    static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
    static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
    static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

    // Delegate to the general runtime overload.
    return XMVectorSwizzle(V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
}
// Specialized swizzles: patterns implementable with single NEON instructions
// rather than the general vtbl table lookup.
// Identity.
template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V)
{ return V; }
// Single-lane broadcasts (vdupq_lane).
// Fixed: the <0,0,0,0> declaration read "XMVECTORXMVectorSwizzle"
// (missing space between return type and name), a syntax error.
template<> inline XMVECTOR XMVectorSwizzle<0,0,0,0>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_low_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<1,1,1,1>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_low_f32(V), 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,2,2,2>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_high_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<3,3,3,3>(FXMVECTOR V)
{ return vdupq_lane_f32( vget_high_f32(V), 1); }
// Pairwise reversals and half swaps (vrev64 / vget_low / vget_high / vcombine).
template<> inline XMVECTOR XMVectorSwizzle<1,0,3,2>(FXMVECTOR V)
{ return vrev64q_f32(V); }
template<> inline XMVECTOR XMVectorSwizzle<0,1,0,1>(FXMVECTOR V)
{ float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,2,3>(FXMVECTOR V)
{ float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,1,0>(FXMVECTOR V)
{ float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,3,2>(FXMVECTOR V)
{ float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<0,1,3,2>(FXMVECTOR V)
{ return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,2,3>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,1,0>(FXMVECTOR V)
{ return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,0,1>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,1,0>(FXMVECTOR V)
{ return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); }
// Transpose / zip / unzip patterns.
template<> inline XMVECTOR XMVectorSwizzle<0,0,2,2>(FXMVECTOR V)
{ return vtrnq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,1,3,3>(FXMVECTOR V)
{ return vtrnq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,0,1,1>(FXMVECTOR V)
{ return vzipq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<2,2,3,3>(FXMVECTOR V)
{ return vzipq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,2,0,2>(FXMVECTOR V)
{ return vuzpq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,3,1,3>(FXMVECTOR V)
{ return vuzpq_f32(V,V).val[1]; }
// Lane rotations (vext).
template<> inline XMVECTOR XMVectorSwizzle<1,2,3,0>(FXMVECTOR V)
{ return vextq_f32(V, V, 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,0,1>(FXMVECTOR V)
{ return vextq_f32(V, V, 2); }
template<> inline XMVECTOR XMVectorSwizzle<3,0,1,2>(FXMVECTOR V)
{ return vextq_f32(V, V, 3); }