SSE:使用 reinterpret_cast<__m128*> 而不是 _mm_load_ps 我正在用C++编写一个简单的卷积函数,从非常基本的“滑动窗口”卷积和普通乘积开始(暂时不涉及FFT),再到SSE、AVX,甚至可能是OpenCL。不过,我在SSE上遇到了问题。我的代码如下所示: for (x = 0; x < SIZEX - KSIZEX + 1; ++x) { for (y = 0; y < SIZEY - KSIZEY + 1; ++y) { tmp = 0.0f; float fDPtmp = 0.0f; float *Kp = &K[0]; for (xi = 0; xi < KSIZEX; ++xi, Kp=Kp+4) { float *Cp = &C[(x+xi)*SIZEY + y]; __m128 *KpSSE = reinterpret_cast<__m128*>(&K); __m128 *CpSSE = reinterpret_cast<__m128*>(&C[(x + xi)*SIZEY + y]); __m128 DPtmp = _mm_dp_ps(*KpSSE, *CpSSE, 0xFF); _mm_store_ss(&fDPtmp, DPtmp); tmp += fDPtmp; } R[k] = tmp; ++k; } } __declspec(align(16)) float* ReadMatrix(string path) { streampos size; ifstream file(path, ios::in | ios::binary | ios::ate); if (file.is_open()) { size = file.tellg(); __declspec(align(16)) float *C = new float[size]; file.seekg(0, ios::beg); file.read(reinterpret_cast<char*>(&C[0]), size); file.close(); return C; } else cout << "Unable to open file" << endl; }
代码在y=1时崩溃,所以我觉得处理指针的方式可能有错误。有趣的是,如果我用_mm_set_ps替换reinterpret_cast,即SSE:重新解释铸造<__m128*>;而不是_mm_加载_ps 我在C++中编写一个简单的卷积函数的过程,从非常基本的“滑动窗口”卷积到正则积(现在没有FFT的东西),最多见,AVX,可能是OpenCL。不过,我遇到了SSE的问题。我的代码如下所示: for (x = 0; x < SIZEX - KSIZEX + 1; ++x) { for (y = 0; y < SIZEY - KSIZEY + 1; ++y) { tmp = 0.0f; float fDPtmp = 0.0f; float *Kp = &K[0]; for (xi = 0; xi < KSIZEX; ++xi, Kp=Kp+4) { float *Cp = &C[(x+xi)*SIZEY + y]; __m128 *KpSSE = reinterpret_cast<__m128*>(&K); __m128 *CpSSE = reinterpret_cast<__m128*>(&C[(x + xi)*SIZEY + y]); __m128 DPtmp = _mm_dp_ps(*KpSSE, *CpSSE, 0xFF); _mm_store_ss(&fDPtmp, DPtmp); tmp += fDPtmp; } R[k] = tmp; ++k; } } __declspec(align(16)) float* ReadMatrix(string path) { streampos size; ifstream file(path, ios::in | ios::binary | ios::ate); if (file.is_open()) { size = file.tellg(); __declspec(align(16)) float *C = new float[size]; file.seekg(0, ios::beg); file.read(reinterpret_cast<char*>(&C[0]), size); file.close(); return C; } else cout << "Unable to open file" << endl; },c++,x86,sse,simd,C++,X86,Sse,Simd,代码在y=1时崩溃,所以我觉得处理指针的方式可能有错误。有趣的是,如果我用_mm_set_ps替换reinterpret_cast,即 __m128 KpSSE = _mm_set_ps(Kp[0], Kp[1], Kp[2], Kp[3]); __m128 CpSSE = _mm_set_ps(Cp[0], Cp[1], Cp[2], Cp[3]); __m128 DPtmp = _mm_dp_ps(KpSSE, CpSSE, 0xFF); _mm_store_ss(&fDPtmp,
// Alternative to dereferencing __m128 pointers: each vector is assembled from four
// scalar reads, so no alignment requirement is placed on Kp or Cp.
// NOTE(review): _mm_set_ps takes its arguments highest lane first, so both vectors hold
// their elements in reversed order; the dot product is unaffected because both operands
// are reversed the same way.
__m128 KpSSE = _mm_set_ps(Kp[0], Kp[1], Kp[2], Kp[3]);
__m128 CpSSE = _mm_set_ps(Cp[0], Cp[1], Cp[2], Cp[3]);
// Mask 0xFF: multiply all four lanes and write the sum into every lane of the result.
__m128 DPtmp = _mm_dp_ps(KpSSE, CpSSE, 0xFF);
// Copy the lowest lane (the dot product) into the scalar fDPtmp.
_mm_store_ss(&fDPtmp, DPtmp);
整个程序就能正常运行,只是速度较慢,我认为这是那些复制操作造成的。
有人能告诉我我到底哪里做错了吗?
多谢各位,
Pat
更新:好的,正如Paul指出的,问题在于ReadMatrix(或者另一种解决方案是使用_mm_loadu_ps)。对于ReadMatrix(),它看起来是这样的:
// Sliding-window pass over C with kernel K: one output value is written to R[k] for
// every window position (x, y), and k advances once per position.
// Each inner step takes the 4-float dot product of one kernel row with one image row.
// NOTE(review): every load reads exactly 4 floats and Kp advances by 4 per row, so this
// presumably expects a kernel that is 4 floats wide — confirm against KSIZEY.
for (x = 0; x < SIZEX - KSIZEX + 1; ++x)
{
    for (y = 0; y < SIZEY - KSIZEY + 1; ++y)
    {
        tmp = 0.0f;
        float *Kp = &K[0];
        for (xi = 0; xi < KSIZEX; ++xi, Kp = Kp + 4)
        {
            float *Cp = &C[(x + xi)*SIZEY + y];
            // Cp moves one float (4 bytes) per y step, so it is 16-byte aligned for at
            // most one y in four. Dereferencing a __m128* there is an aligned load and
            // faults on the other positions (e.g. y == 1); an unaligned load is required.
            const __m128 imageRow = _mm_loadu_ps(Cp);
            // Load through Kp (the current kernel row), not &K, so that each xi uses
            // its own row of the kernel.
            const __m128 kernelRow = _mm_loadu_ps(Kp);
            // Mask 0xFF: multiply all four lanes and broadcast the sum; lane 0 is read back.
            tmp += _mm_cvtss_f32(_mm_dp_ps(kernelRow, imageRow, 0xFF));
        }
        R[k] = tmp;
        ++k;
    }
}
/**
 * Reads a whole binary file into a newly allocated float buffer.
 *
 * The file is opened in binary mode positioned at its end, so the byte length is
 * taken from tellg(); the buffer holds just enough floats to contain those bytes
 * (a trailing partial float, if any, is zero-padded).
 *
 * @param path  Path of the file to read.
 * @return Pointer to the buffer allocated with new[] (release it with delete[]),
 *         or nullptr if the file cannot be opened or its size cannot be determined.
 *
 * Alignment: an align(16) attribute on the function or on the local pointer does not
 * align the allocated storage, so it has been dropped. new[] only provides the
 * platform's default alignment; callers feeding this buffer to SSE should use
 * _mm_loadu_ps, or the allocation AND its matching deallocation should be switched
 * together to _mm_malloc / _mm_free.
 */
float* ReadMatrix(std::string path)
{
    std::ifstream file(path, std::ios::in | std::ios::binary | std::ios::ate);
    if (!file.is_open())
    {
        std::cout << "Unable to open file" << std::endl;
        return nullptr;  // previously control fell off the end here (undefined behaviour)
    }

    // Opened with ios::ate, so the current position is the file length in bytes.
    const std::streamoff byteCount = file.tellg();
    if (byteCount < 0)
    {
        std::cout << "Unable to determine file size" << std::endl;
        return nullptr;
    }

    // byteCount is in bytes, not floats: round up to whole floats instead of
    // allocating one float per byte.
    const std::size_t floatCount =
        (static_cast<std::size_t>(byteCount) + sizeof(float) - 1) / sizeof(float);
    float *C = new float[floatCount]();  // value-initialised so any padding is defined

    file.seekg(0, std::ios::beg);
    file.read(reinterpret_cast<char*>(C), byteCount);
    return C;  // the ifstream closes itself when it goes out of scope
}
__declspec(align(16)) float* ReadMatrix(string path)
{
streampos size;
ifstream file(path, ios::in | ios::binary | ios::ate);
if (file.is_open())
{
size = file.tellg();
__declspec(align(16)) float *C = new float[size];
file.seekg(0, ios::beg);
file.read(reinterpret_cast<char*>(&C[0]), size);
file.close();
return C;
}
else cout << "Unable to open file" << endl;
}
这并不会像你以为的那样起作用:
// NOTE: align(16) here applies to the pointer variable C itself, not to the buffer
// that ReadMatrix returns.
__declspec(align(16)) float *C = ReadMatrix("E:\\Code\\conv\\C.bin");
这里的对齐指令只是让指针本身(即 C)按 16 字节边界对齐,而不是它所指向的内容。
您需要修复 ReadMatrix,使其返回正确对齐的数据,或者像其他人已经建议的那样使用 _mm_loadu_ps。
不要使用 _mm_set_ps,因为它在底层会生成大量指令,而不像 _mm_loadu_ps 那样映射到单条指令。
更新
您在ReadMatrix中重复了相同的错误:
// NOTE: align(16) aligns only the pointer variable C; the storage returned by new[]
// gets no extra alignment from it.
__declspec(align(16)) float *C = new float[size];
同样,这不能保证数据的对齐,只能保证指针 C 本身的对齐。要修复此分配,可以使用 _mm_malloc 或 _aligned_malloc:
// _mm_malloc returns void*, which C++ does not convert to float* implicitly, hence the cast.
// Memory obtained this way must be released with _mm_free, not delete[] or free.
// NOTE(review): size is treated as a float count here; if it holds the byte length
// (as returned by tellg()), drop the sizeof factor.
float *C = static_cast<float*>(_mm_malloc(size * sizeof(*C), 16));
或
在 ReadMatrix 中,您无法保证 new 表达式返回正确对齐的指针。把结果赋给一个对齐的指针变量并没有用(我甚至不确定您的语法表示的是指针本身对齐,还是指针所指向的内容对齐)。
您需要使用 _mm_align、_mm_malloc 或其他对齐的分配工具。
您不能在此处使用 reinterpret_cast,而且我知道 _mm_loadu_ps 很慢。但还有另一种方法:展开循环,读取对齐的数据,并在对其执行操作之前通过移位和掩码把新值并入。这样既快速又正确。也就是说,您可以在内部循环中执行以下操作:
__m128i x = _mm_load_ps(p);
__m128i y = _mm_load_ps(p + sizeof(float));
__m128i z;
// do your operation on x 1st time this iteration here
z = _mm_slli_si128(y, sizeof(float) * 3);
x = _mm_srli_si128(x, sizeof(float));
x = _mm_or_si128(x, z);
// do your operation on x 2nd time this iteration here
z = _mm_slli_si128(y, sizeof(float) * 2);
x = _mm_srli_si128(x, sizeof(float) * 2);
x = _mm_or_si128(x, z);
// do your operation on x 3rd time this iteration here
z = _mm_slli_si128(y, sizeof(float));
x = _mm_srli_si128(x, sizeof(float) * 3);
x = _mm_or_si128(x, z);
// do your operation on x 4th time this iteration here
x = y; // don’t need to read in x next iteration, only y
loopCounter += 4 * sizeof(float);
您不能用指针指向 __m128。这没有意义,因为 __m128 会映射到某个 XMM[0-7] 寄存器。reinterpret_cast 是错误的,您应该使用 _mm_loadu_ps。
@UmNyobe:不,这段代码没问题,只要数据正确对齐就可以。嗨,Paul。非常感谢你的帮助。它还没有完全起作用(见原始帖子)。如果不会太麻烦的话,你(当然还有其他人)可以再看一看吗?当然——请看上面答案中的更新。Paul,这似乎有点邪门。我已经加上了修改(请参见上面的更新),但它仍然不起作用。在 y=1 时,它在 __m128 DPtmp = _mm_dp_ps(*KpSSE, *CpSSE, 0xFF); 处崩溃,这很奇怪,因为调试显示这两个 __m128 都已被正确填充。请你再看一看好吗?看起来 CpSSE 在 y 的第二次迭代中会未对齐,因为 y=1。@PaulR,对于长度不是 8/16 的倍数的数据,该如何使用 SSE 处理?
__m128i x = _mm_load_ps(p);
__m128i y = _mm_load_ps(p + sizeof(float));
__m128i z;
// do your operation on x 1st time this iteration here
z = _mm_slli_si128(y, sizeof(float) * 3);
x = _mm_srli_si128(x, sizeof(float));
x = _mm_or_si128(x, z);
// do your operation on x 2nd time this iteration here
z = _mm_slli_si128(y, sizeof(float) * 2);
x = _mm_srli_si128(x, sizeof(float) * 2);
x = _mm_or_si128(x, z);
// do your operation on x 3rd time this iteration here
z = _mm_slli_si128(y, sizeof(float));
x = _mm_srli_si128(x, sizeof(float) * 3);
x = _mm_or_si128(x, z);
// do your operation on x 4th time this iteration here
x = y; // don’t need to read in x next iteration, only y
loopCounter += 4 * sizeof(float);