SSE：使用reinterpret_cast<__m128*>而不是_mm_load_ps 我正在用C++编写一个简单的卷积函数，从非常基本的、使用普通乘积的“滑动窗口”卷积开始（暂时不涉及FFT），逐步扩展到SSE、AVX，可能还有OpenCL。不过，我在SSE上遇到了问题。我的代码如下所示： for (x = 0; x < SIZEX - KSIZEX + 1; ++x) { for (y = 0; y < SIZEY - KSIZEY + 1; ++y) { tmp = 0.0f; float fDPtmp = 0.0f; float *Kp = &K[0]; for (xi = 0; xi < KSIZEX; ++xi, Kp=Kp+4) { float *Cp = &C[(x+xi)*SIZEY + y]; __m128 *KpSSE = reinterpret_cast<__m128*>(&K); __m128 *CpSSE = reinterpret_cast<__m128*>(&C[(x + xi)*SIZEY + y]); __m128 DPtmp = _mm_dp_ps(*KpSSE, *CpSSE, 0xFF); _mm_store_ss(&fDPtmp, DPtmp); tmp += fDPtmp; } R[k] = tmp; ++k; } } __declspec(align(16)) float* ReadMatrix(string path) { streampos size; ifstream file(path, ios::in | ios::binary | ios::ate); if (file.is_open()) { size = file.tellg(); __declspec(align(16)) float *C = new float[size]; file.seekg(0, ios::beg); file.read(reinterpret_cast<char*>(&C[0]), size); file.close(); return C; } else cout << "Unable to open file" << endl; }_C++_X86_Sse_Simd

SSE：使用reinterpret_cast<__m128*>而不是_mm_load_ps 我正在用C++编写一个简单的卷积函数，从非常基本的、使用普通乘积的“滑动窗口”卷积开始（暂时不涉及FFT），逐步扩展到SSE、AVX，可能还有OpenCL。不过，我在SSE上遇到了问题。我的代码如下所示： for (x = 0; x < SIZEX - KSIZEX + 1; ++x) { for (y = 0; y < SIZEY - KSIZEY + 1; ++y) { tmp = 0.0f; float fDPtmp = 0.0f; float *Kp = &K[0]; for (xi = 0; xi < KSIZEX; ++xi, Kp=Kp+4) { float *Cp = &C[(x+xi)*SIZEY + y]; __m128 *KpSSE = reinterpret_cast<__m128*>(&K); __m128 *CpSSE = reinterpret_cast<__m128*>(&C[(x + xi)*SIZEY + y]); __m128 DPtmp = _mm_dp_ps(*KpSSE, *CpSSE, 0xFF); _mm_store_ss(&fDPtmp, DPtmp); tmp += fDPtmp; } R[k] = tmp; ++k; } } __declspec(align(16)) float* ReadMatrix(string path) { streampos size; ifstream file(path, ios::in | ios::binary | ios::ate); if (file.is_open()) { size = file.tellg(); __declspec(align(16)) float *C = new float[size]; file.seekg(0, ios::beg); file.read(reinterpret_cast<char*>(&C[0]), size); file.close(); return C; } else cout << "Unable to open file" << endl; }

c++ x86

SSE：使用reinterpret_cast<__m128*>而不是_mm_load_ps 我正在用C++编写一个简单的卷积函数，从非常基本的、使用普通乘积的“滑动窗口”卷积开始（暂时不涉及FFT），逐步扩展到SSE、AVX，可能还有OpenCL。不过，我在SSE上遇到了问题。我的代码如下所示： for (x = 0; x < SIZEX - KSIZEX + 1; ++x) { for (y = 0; y < SIZEY - KSIZEY + 1; ++y) { tmp = 0.0f; float fDPtmp = 0.0f; float *Kp = &K[0]; for (xi = 0; xi < KSIZEX; ++xi, Kp=Kp+4) { float *Cp = &C[(x+xi)*SIZEY + y]; __m128 *KpSSE = reinterpret_cast<__m128*>(&K); __m128 *CpSSE = reinterpret_cast<__m128*>(&C[(x + xi)*SIZEY + y]); __m128 DPtmp = _mm_dp_ps(*KpSSE, *CpSSE, 0xFF); _mm_store_ss(&fDPtmp, DPtmp); tmp += fDPtmp; } R[k] = tmp; ++k; } } __declspec(align(16)) float* ReadMatrix(string path) { streampos size; ifstream file(path, ios::in | ios::binary | ios::ate); if (file.is_open()) { size = file.tellg(); __declspec(align(16)) float *C = new float[size]; file.seekg(0, ios::beg); file.read(reinterpret_cast<char*>(&C[0]), size); file.close(); return C; } else cout << "Unable to open file" << endl; },c++,x86,sse,simd,C++,X86,Sse,Simd,代码在y=1时崩溃，所以我觉得处理指针的方式可能有错误。有趣的是，如果我用_mm_set_ps替换reinterpret_cast，即 __m128 KpSSE = _mm_set_ps(Kp[0], Kp[1], Kp[2], Kp[3]); __m128 CpSSE = _mm_set_ps(Cp[0], Cp[1], Cp[2], Cp[3]); __m128 DPtmp = _mm_dp_ps(KpSSE, CpSSE, 0xFF); _mm_store_ss(&fDPtmp,

代码在y=1时崩溃，所以我觉得处理指针的方式可能有错误。有趣的是，如果我用_mm_set_ps替换reinterpret_cast，即

// Variant that builds both operands from four scalar reads each instead of
// dereferencing a __m128*; Kp and Cp are only indexed as float arrays here.
// NOTE(review): _mm_set_ps takes its arguments highest lane first, so the
// lanes end up reversed relative to memory order. Both operands are reversed
// the same way, so the 4-element dot product is unaffected.
__m128 KpSSE = _mm_set_ps(Kp[0], Kp[1], Kp[2], Kp[3]);
__m128 CpSSE = _mm_set_ps(Cp[0], Cp[1], Cp[2], Cp[3]);
// Mask 0xFF: multiply all four lanes, sum them, and write the sum to every lane.
__m128 DPtmp = _mm_dp_ps(KpSSE, CpSSE, 0xFF);
// Copy the lowest lane (the dot product) out to the scalar fDPtmp.
_mm_store_ss(&fDPtmp, DPtmp);

整个程序运行得很好，但速度较慢，我将这归咎于所有的复制操作

有人能告诉我我到底做错了什么吗

多谢各位

Pat

更新：好的，正如Paul指出的，问题在于ReadMatrix（或者另一种解决方案是使用_mm_loadu_ps）。对于ReadMatrix（），它看起来是这样的：

// Sliding-window convolution: for every output position (x, y) accumulate,
// over KSIZEX kernel rows, a 4-float dot product of a kernel row with the
// matching 4 floats of C, and append the sum to R.
// C is indexed as C[row * SIZEY + column].
// NOTE(review): only one 4-float vector is used per kernel row and Kp steps
// by 4, so this presumably assumes KSIZEY == 4 — TODO confirm.
for (x = 0; x < SIZEX - KSIZEX + 1; ++x)
{
    for (y = 0; y < SIZEY - KSIZEY + 1; ++y)
    {           
        tmp = 0.0f;

        float fDPtmp = 0.0f;
        // Kp walks the kernel one 4-float row at a time (see the loop header below).
        float *Kp = &K[0];


        for (xi = 0; xi < KSIZEX; ++xi, Kp=Kp+4)
        {                               
            // Start of the 4-float window in row (x+xi) of C, at column y.
            // NOTE(review): Cp is computed but never used; CpSSE below
            // recomputes the same address.
            float *Cp = &C[(x+xi)*SIZEY + y];

            // NOTE(review): this casts &K, not Kp, so the advancing Kp is
            // never read and every xi iteration uses the same address. If K
            // is a pointer rather than an array, &K is the address of the
            // pointer variable itself — TODO confirm K's declaration.
            __m128 *KpSSE = reinterpret_cast<__m128*>(&K);
            // NOTE(review): this address moves by one float (4 bytes) per y
            // step, so it cannot stay 16-byte aligned; from y=1 onward the
            // dereference below is a misaligned access through a __m128*.
            __m128 *CpSSE = reinterpret_cast<__m128*>(&C[(x + xi)*SIZEY + y]);
            // Mask 0xFF: multiply all four lanes, sum them, broadcast the sum.
            __m128 DPtmp = _mm_dp_ps(*KpSSE, *CpSSE, 0xFF);
            // Extract the lowest lane (the dot product) into the scalar fDPtmp.
            _mm_store_ss(&fDPtmp, DPtmp);

            tmp += fDPtmp;
        }

        // One output value per (x, y); k is a running index into R.
        R[k] = tmp;
        ++k;
    }
}

// Reads the whole binary file at `path` into a buffer obtained from new[]
// and returns a pointer to it. Nothing in this function frees the buffer.
// If the file cannot be opened, a message is printed to cout.
//
// NOTE(review): __declspec(align(16)) on the return type and on the local
// pointer C does not align the storage that new[] returns; it only concerns
// the pointer object itself.
__declspec(align(16)) float* ReadMatrix(string path)
{
    streampos size;

    // ios::ate opens the stream positioned at the end, so tellg() below
    // yields the file length in bytes.
    ifstream file(path, ios::in | ios::binary | ios::ate);

    if (file.is_open())
    {
        size = file.tellg();
        // NOTE(review): size is a byte count, but it is used here as the
        // number of floats, so the buffer is sizeof(float) times larger than
        // the data that read() below fills.
        __declspec(align(16)) float *C = new float[size];
        file.seekg(0, ios::beg);
        // Raw byte copy of the file contents into the float buffer.
        file.read(reinterpret_cast<char*>(&C[0]), size);
        file.close();

        return C;
    }
    // NOTE(review): this branch falls off the end of a non-void function
    // without returning a value.
    else cout << "Unable to open file" << endl;
}

__declspec(align(16)) float* ReadMatrix(string path)
{
    streampos size;
    ifstream file(path, ios::in | ios::binary | ios::ate);
    if (file.is_open())
    {
        size = file.tellg();
        __declspec(align(16)) float *C = new float[size];
        file.seekg(0, ios::beg);
        file.read(reinterpret_cast<char*>(&C[0]), size);
        file.close();
        return C;
    }
    else cout << "Unable to open file" << endl;
}

这并不像你以为的那样起作用：
// NOTE(review): align(16) here applies to the pointer variable C itself,
// not to the buffer that ReadMatrix returns.
__declspec(align(16)) float *C = ReadMatrix("E:\\Code\\conv\\C.bin");

这里的对齐指令所做的只是把指针本身（即`C`）对齐到16字节边界，而不是指针所指向的内容。
您需要修复`ReadMatrix`，使其返回正确对齐的数据，或者像其他人已经建议的那样使用`_mm_loadu_ps`。
不要使用`_mm_set_ps`，因为它在底层会生成大量指令，而不像`_mm_loadu_ps`那样只映射到单条指令。
更新
您在ReadMatrix中重复了相同的错误：
// NOTE(review): align(16) applies to the pointer variable C, not to the
// storage returned by new[].
__declspec(align(16)) float *C = new float[size];

同样，这并不能保证数据的对齐，只能保证指针`C`本身的对齐。要修复这处分配，可以使用`_mm_malloc`或`_aligned_malloc`：
// Allocate the buffer itself on a 16-byte boundary. _mm_malloc returns void*,
// which C++ (unlike C) does not convert implicitly, hence the explicit cast.
// Memory obtained this way must be released with _mm_free, not delete[].
float *C = static_cast<float*>(_mm_malloc(size * sizeof(*C), 16));

或
在`ReadMatrix`中，您无法保证`new`表达式返回正确对齐的指针。把结果赋给一个带对齐声明的指针并没有用（我甚至不确定您的语法表示的是指针本身对齐，还是指针所指向的内容对齐）。
您需要使用`_mm_align`、`_mm_malloc`或其他某种对齐分配工具。
您不能在此处使用reinterpret_cast，我也知道_mm_loadu_ps很慢。但还有另一种方法：展开循环，读取对齐的数据，在对其进行运算之前通过移位和掩码把新值并入。这样既快速又正确。也就是说，您可以在内层循环中执行以下操作：
__m128i x = _mm_load_ps(p);
__m128i y = _mm_load_ps(p + sizeof(float));
__m128i z;

// do your operation on x 1st time this iteration here

z = _mm_slli_si128(y, sizeof(float) * 3);
x = _mm_srli_si128(x, sizeof(float));
x = _mm_or_si128(x, z);

// do your operation on x 2nd time this iteration here

z = _mm_slli_si128(y, sizeof(float) * 2);
x = _mm_srli_si128(x, sizeof(float) * 2);
x = _mm_or_si128(x, z);

// do your operation on x 3rd time this iteration here

z = _mm_slli_si128(y, sizeof(float));
x = _mm_srli_si128(x, sizeof(float) * 3);
x = _mm_or_si128(x, z);

// do your operation on x 4th time this iteration here

x = y; // don’t need to read in x next iteration, only y

loopCounter += 4 * sizeof(float);

您不能创建指向`__m128`的指针。这没有意义，因为`__m128`会映射到XMM[0-7]中的任意一个寄存器。`reinterpret_cast`是错误的，您应该使用`_mm_loadu_ps`。
@UmNyobe：不，代码没有问题，只要数据正确对齐就可以。 嗨，Paul。非常感谢你的帮助。它还没有完全起作用（见原帖）。如果不太麻烦的话，你（当然也包括其他人）能再看一下吗？ 当然——请看上面答案中的更新。 Paul，看来运气不太好。我加上了这些改动（见上面的更新），但它仍然不起作用。在y=1时，程序在`__m128 DPtmp = _mm_dp_ps(*KpSSE, *CpSSE, 0xFF)`处崩溃，这很奇怪，因为调试显示这两个`__m128`都已正确填充。能请你再看一下吗？ 看起来CpSSE在y的第二次迭代中会未对齐，因为y=1。 @PaulR，如果数据长度不是8/16的倍数，该如何用SSE处理？
__m128i x = _mm_load_ps(p);
__m128i y = _mm_load_ps(p + sizeof(float));
__m128i z;

// do your operation on x 1st time this iteration here

z = _mm_slli_si128(y, sizeof(float) * 3);
x = _mm_srli_si128(x, sizeof(float));
x = _mm_or_si128(x, z);

// do your operation on x 2nd time this iteration here

z = _mm_slli_si128(y, sizeof(float) * 2);
x = _mm_srli_si128(x, sizeof(float) * 2);
x = _mm_or_si128(x, z);

// do your operation on x 3rd time this iteration here

z = _mm_slli_si128(y, sizeof(float));
x = _mm_srli_si128(x, sizeof(float) * 3);
x = _mm_or_si128(x, z);

// do your operation on x 4th time this iteration here

x = y; // don’t need to read in x next iteration, only y

loopCounter += 4 * sizeof(float);