C++ 使用SSE计算矩阵积比使用直接算法要慢得多_C++_Matrix_Sse

C++ 使用SSE计算矩阵积比使用直接算法要慢得多

c++ matrix

C++ 使用SSE计算矩阵积比使用直接算法要慢得多,c++,matrix,sse,C++,Matrix,Sse,我想用直接算法乘以两个矩阵，一次： template <typename T> void multiplicate_straight(T ** A, T ** B, T ** C, int sizeX) { T ** D = AllocateDynamicArray2D<T>(sizeX, sizeX); transpose_matrix(B, D,sizeX); for(int i = 0; i < sizeX; i++) {

我想用直接算法乘以两个矩阵，一次：

/// Adds the product of two sizeX-by-sizeX matrices to C: for every (row, col)
/// the sum over k of A[row][k] * Bt[col][k] is accumulated into C[row][col].
///
/// C is accumulated into, not overwritten, so the caller decides its initial
/// contents. Bt is a temporary filled by transpose_matrix(B, Bt, sizeX) --
/// presumably the transpose of B, so the inner loop walks both operands
/// along a row (confirm against transpose_matrix).
///
/// @param A      left operand, indexed as A[row][k]
/// @param B      right operand, handed to transpose_matrix
/// @param C      accumulator, indexed as C[row][col]
/// @param sizeX  number of rows and columns of each matrix
template <typename T>
void multiplicate_straight(T ** A, T ** B, T ** C, int sizeX)
{
    T ** Bt = AllocateDynamicArray2D<T>(sizeX, sizeX);
    transpose_matrix(B, Bt, sizeX);

    for (int row = 0; row < sizeX; ++row)
    {
        T * const lhs = A[row];
        T * const out = C[row];
        for (int col = 0; col < sizeX; ++col)
        {
            T * const rhs = Bt[col];
            // Written straight into C on every step, exactly as before.
            for (int k = 0; k < sizeX; ++k)
                out[col] += lhs[k] * rhs[k];
        }
    }

    FreeDynamicArray2D<T>(Bt);
}

模板
无效乘法（T**A、T**B、T**C、int sizeX）
{
T**D=AllocatedDynamicArray2d（sizeX，sizeX）；
转置矩阵（B，D，sizeX）；
对于（int i=0；i


一次通过使用SSE函数。为此，我创建了两个函数：
/**
 * In-place element-wise product: A[i] = A[i] * B[i] for every i in [0, size).
 *
 * Elements are converted to double, multiplied two at a time with SSE2, and
 * converted back to T. A trailing element of an odd-length input is
 * multiplied with scalar code, so nothing past index size-1 is touched.
 *
 * No alignment is required of A or B, and no platform-specific alignment
 * attributes are used: the operands are packed with _mm_set_pd and the
 * result is written with an unaligned store.
 *
 * @param A     first operand; receives the products
 * @param B     second operand; only read
 * @param size  number of elements; values <= 0 leave A untouched
 */
template <typename T>
void SSE_vectormult(T * A, T * B, int size)
{
    int i = 0;
    for (; i + 1 < size; i += 2)
    {
        // _mm_set_pd takes the high lane first, so element i ends up in lane 0.
        const __m128d a = _mm_set_pd(static_cast<double>(A[i+1]), static_cast<double>(A[i]));
        const __m128d b = _mm_set_pd(static_cast<double>(B[i+1]), static_cast<double>(B[i]));

        double product[2];
        _mm_storeu_pd(product, _mm_mul_pd(a, b));

        A[i] = static_cast<T>(product[0]);
        A[i+1] = static_cast<T>(product[1]);
    }

    // Odd length: one element is left over after the last full pair.
    if (i < size)
    {
        A[i] = static_cast<T>(static_cast<double>(A[i]) * static_cast<double>(B[i]));
    }
}

模板
无效SSE_向量（T*A、T*B、整数大小）
{
__m128da；
__m128d-b；
__m128d-c；
#ifdef linux
双A2[2]，B2[2]，C[2]uuuuu属性（对齐（16））；
#恩迪夫
#ifdef_WIN32
__declspec（align（16））双A2[2]，B2[2]，C[2]；
#恩迪夫
对于（int i=0；i//std:：cout我认为这应该与SSE的第一个循环做相同的事情，假设sizeX是2的倍数，并且内存是16字节对齐的
通过展开循环并使用多个临时变量（在最后添加到一起），您可能会获得更高的性能。您也可以尝试AVX和新的
模板
无效乘法2（T**A、T**B、T**C、int sizeX）
{
T**D=AllocatedDynamicArray2d（sizeX，sizeX）；
转置矩阵（B，D，sizeX）；
对于（int i=0；i
在标量代码中使用转置是正确的想法，但在使用SSE时，您并不需要确切的转置
下面以单精度float（SGEMM）为例。使用SSE时，您希望一次计算四个点积。您要计算C=A*B
。让我们看看8x8矩阵。假设B
是：
(0   1  2  3) ( 4  5  6  7)
(8   9 10 11) (12 13 14 15) 
(16 17 18 19) (20 21 22 23)
(24 25 26 27) (28 29 30 31)
(32 33 34 35) (36 37 38 39)
(40 41 42 43) (44 45 46 47)
(48 49 50 51) (52 53 54 55)
(56 57 58 59) (60 61 62 63)

因此，对于SSE，您可以：
C[0][0] C[0][1] C[0][2] C[0][3] = 
A[0][0]*(0 1 2 3) + A[0][1]*(8 9 10 11) + A[0][2]*(16 17 18 19)...+ A[0][7]*(56 57 58 59)

这会一次得到四个点积。问题是你必须向下移动B
中的一列，而这些值不在同一条缓存行中。如果每个宽度为4的列条带在内存中是连续的，那就更好了。因此，与其对每个元素进行转置，不如按宽度为4的条带进行转置，像这样：
(0  1  2  3)( 8  9 10 11)(16 17 18 19)(24 25 26 27)(32 33 34 35)(40 41 42 43)(48 49 50 51)(56 57 58 59)
(4  5  6  7)(12 13 14 15)(20 21 22 23)(28 29 30 31)(36 37 38 39)(44 45 46 47)(52 53 54 55)(60 61 62 63)

如果将括号中的四个值视为一个单位，这相当于将8x2矩阵转换为2x8矩阵。请注意，B
的每个宽度为4的列条带在内存中是连续的。这对缓存友好得多。对于8x8矩阵，这实际上不是问题，但对于1024x1024矩阵来说就是问题了。请参阅下面的代码了解如何执行此操作。对于AVX，转置宽度为8的条带（这意味着对于8x8矩阵，您无需做任何事情）。对于double类型，条带宽度在SSE下为2，在AVX下为4
假设矩阵适合缓存，这应该比标量代码快四倍。但是，对于大型矩阵，此方法仍然会受到内存限制，因此您的SSE代码可能不会比标量代码快很多（但不应该更糟）
但是，如果使用循环平铺并在平铺中重新排列矩阵（适合二级缓存），而不是整个矩阵，则矩阵乘法会受到计算限制，而不是内存限制，即使对于不适合三级缓存的非常大的矩阵也是如此。这是另一个主题
编辑：下面是一些（未经测试的）代码，可用来与标量代码进行比较。我将循环展开了2次
/**
 * Repacks the row-major N-by-N matrix A into column strips stored one after
 * another in B.
 *
 * Strip s covers columns [s*strip_width, s*strip_width + width), where width
 * is strip_width for every strip except possibly the last, which holds the
 * remaining N % strip_width columns. Inside a strip the rows follow each
 * other, each contributing `width` consecutive floats. The strip starting at
 * column k therefore begins at B[k*N], and for a full strip element
 * (row i, column k + j) lands at B[k*N + i*strip_width + j].
 *
 * @param A            source matrix, N*N floats, row-major
 * @param B            destination, N*N floats; must not overlap A
 * @param N            matrix dimension; nothing is written when N <= 0
 * @param strip_width  columns per strip; nothing is written when <= 0
 */
void transpose_matrix_strip(const float* A, float* B, const int N, const int strip_width) {
    if (N <= 0 || strip_width <= 0) {
        return;
    }
    float* out = B;
    int k = 0;
    while (k < N) {
        // The last strip is narrower when N is not a multiple of strip_width.
        const int width = (N - k < strip_width) ? (N - k) : strip_width;
        for (int i = 0; i < N; i++) {
            const float* src = A + static_cast<size_t>(N)*i + k;
            for (int j = 0; j < width; j++) {
                *out++ = src[j];
            }
        }
        k += width;
    }
}

/**
 * Single-precision matrix product C = A * B for row-major sizeX-by-sizeX
 * matrices. C is overwritten, not accumulated into.
 *
 * Columns are processed eight at a time (two SSE registers of four floats).
 * B is first repacked with transpose_matrix_strip so that the eight values
 * needed for one step of the inner loop are contiguous and 16-byte aligned.
 * Columns left over when sizeX is not a multiple of eight are computed with
 * scalar code straight from B. If the scratch buffer cannot be allocated,
 * the scalar path is used for every column.
 *
 * @param A      left operand, sizeX*sizeX floats
 * @param B      right operand, sizeX*sizeX floats
 * @param C      result, sizeX*sizeX floats; no alignment required
 * @param sizeX  matrix dimension; nothing happens when sizeX <= 0
 */
void SGEMM_SSE(const float *A, const float *B, float *C, const int sizeX) {
    if (sizeX <= 0) {
        return;
    }
    const int simd_width = 4;
    const int unroll = 2;
    const int strip_width = simd_width*unroll;
    const size_t n = static_cast<size_t>(sizeX);

    // Number of leading columns that form complete strips of width eight.
    int vec_cols = sizeX - sizeX % strip_width;

    float *D = nullptr;
    if (vec_cols > 0) {
        D = static_cast<float*>(_mm_malloc(sizeof(float)*n*n, 16));
        if (D) {
            transpose_matrix_strip(B, D, sizeX, strip_width); // transpose B in strips of width eight
        } else {
            vec_cols = 0; // no scratch memory: compute everything with the scalar loop
        }
    }

    for (int i = 0; i < sizeX; i++) {
        const float *a_row = A + n*i;
        float *c_row = C + n*i;

        for (int j = 0; j < vec_cols; j += strip_width) {
            // The strip that starts at column j begins at D[j*sizeX].
            const float *strip = D + n*j;
            __m128 out_v1 = _mm_setzero_ps();
            __m128 out_v2 = _mm_setzero_ps();
            // Eight dot products at once: broadcast A[i][g] and multiply it
            // with row g of the strip.
            for (int g = 0; g < sizeX; g++) {
                const float *row = strip + static_cast<size_t>(strip_width)*g;
                const __m128 a = _mm_set1_ps(a_row[g]);
                out_v1 = _mm_add_ps(out_v1, _mm_mul_ps(a, _mm_load_ps(row)));
                out_v2 = _mm_add_ps(out_v2, _mm_mul_ps(a, _mm_load_ps(row + simd_width)));
            }
            // C carries no alignment guarantee, hence the unaligned stores.
            _mm_storeu_ps(c_row + j, out_v1);
            _mm_storeu_ps(c_row + j + simd_width, out_v2);
        }

        // Columns that do not fill a complete strip.
        for (int j = vec_cols; j < sizeX; j++) {
            float sum = 0.0f;
            for (int g = 0; g < sizeX; g++) {
                sum += a_row[g]*B[n*g + j];
            }
            c_row[j] = sum;
        }
    }

    if (D) {
        _mm_free(D);
    }
}

void SGEMM_SSE（常量浮点*A、常量浮点*B、浮点*C、常量整数sizeX）{
const int simd_width=4；
常数int展开=2；
const int strip_WITH=simd_WITH*展开
浮点数*D=（浮点数*）\u mm\u malloc（浮点数）*sizeX*sizeX，16）；
转置矩阵带（B，D，sizeX，带宽度）；//在宽度为8的带中转置B
对于（int i=0；i对于（int n=0；n看起来你陷入了通常的陷阱：数据移动太多，计算太少。你有一条实际的算术指令用于9+内存访问。这是一个1/9的比率。如果你想要任何一种像样的加速，你需要这个比率至少是2/1。@Mystical:那么有没有可能to重写函数以提高速度，或者我的问题不适合这样做？矩阵乘法可以用具有高计算/内存访问率的方式完成。但这并不容易。适用于这两种方法的一个小建议是跳过转置，并在第二个矩阵的第一维（而不是第二维）上求和。即：C[i] [j]+=A[i][g]*D[g][j]；@Matt:但是这
(0  1  2  3)( 8  9 10 11)(16 17 18 19)(24 25 26 27)(32 33 34 35)(40 41 42 43)(48 49 50 51)(56 57 58 59)
(4  5  6  7)(12 13 14 15)(20 21 22 23)(28 29 30 31)(36 37 38 39)(44 45 46 47)(52 53 54 55)(60 61 62 63)

/**
 * Repacks the row-major N-by-N matrix A into column strips stored one after
 * another in B.
 *
 * Strip s covers columns [s*strip_width, s*strip_width + width), where width
 * is strip_width for every strip except possibly the last, which holds the
 * remaining N % strip_width columns. Inside a strip the rows follow each
 * other, each contributing `width` consecutive floats. The strip starting at
 * column k therefore begins at B[k*N], and for a full strip element
 * (row i, column k + j) lands at B[k*N + i*strip_width + j].
 *
 * @param A            source matrix, N*N floats, row-major
 * @param B            destination, N*N floats; must not overlap A
 * @param N            matrix dimension; nothing is written when N <= 0
 * @param strip_width  columns per strip; nothing is written when <= 0
 */
void transpose_matrix_strip(const float* A, float* B, const int N, const int strip_width) {
    if (N <= 0 || strip_width <= 0) {
        return;
    }
    float* out = B;
    int k = 0;
    while (k < N) {
        // The last strip is narrower when N is not a multiple of strip_width.
        const int width = (N - k < strip_width) ? (N - k) : strip_width;
        for (int i = 0; i < N; i++) {
            const float* src = A + static_cast<size_t>(N)*i + k;
            for (int j = 0; j < width; j++) {
                *out++ = src[j];
            }
        }
        k += width;
    }
}

/**
 * Single-precision matrix product C = A * B for row-major sizeX-by-sizeX
 * matrices. C is overwritten, not accumulated into.
 *
 * Columns are processed eight at a time (two SSE registers of four floats).
 * B is first repacked with transpose_matrix_strip so that the eight values
 * needed for one step of the inner loop are contiguous and 16-byte aligned.
 * Columns left over when sizeX is not a multiple of eight are computed with
 * scalar code straight from B. If the scratch buffer cannot be allocated,
 * the scalar path is used for every column.
 *
 * @param A      left operand, sizeX*sizeX floats
 * @param B      right operand, sizeX*sizeX floats
 * @param C      result, sizeX*sizeX floats; no alignment required
 * @param sizeX  matrix dimension; nothing happens when sizeX <= 0
 */
void SGEMM_SSE(const float *A, const float *B, float *C, const int sizeX) {
    if (sizeX <= 0) {
        return;
    }
    const int simd_width = 4;
    const int unroll = 2;
    const int strip_width = simd_width*unroll;
    const size_t n = static_cast<size_t>(sizeX);

    // Number of leading columns that form complete strips of width eight.
    int vec_cols = sizeX - sizeX % strip_width;

    float *D = nullptr;
    if (vec_cols > 0) {
        D = static_cast<float*>(_mm_malloc(sizeof(float)*n*n, 16));
        if (D) {
            transpose_matrix_strip(B, D, sizeX, strip_width); // transpose B in strips of width eight
        } else {
            vec_cols = 0; // no scratch memory: compute everything with the scalar loop
        }
    }

    for (int i = 0; i < sizeX; i++) {
        const float *a_row = A + n*i;
        float *c_row = C + n*i;

        for (int j = 0; j < vec_cols; j += strip_width) {
            // The strip that starts at column j begins at D[j*sizeX].
            const float *strip = D + n*j;
            __m128 out_v1 = _mm_setzero_ps();
            __m128 out_v2 = _mm_setzero_ps();
            // Eight dot products at once: broadcast A[i][g] and multiply it
            // with row g of the strip.
            for (int g = 0; g < sizeX; g++) {
                const float *row = strip + static_cast<size_t>(strip_width)*g;
                const __m128 a = _mm_set1_ps(a_row[g]);
                out_v1 = _mm_add_ps(out_v1, _mm_mul_ps(a, _mm_load_ps(row)));
                out_v2 = _mm_add_ps(out_v2, _mm_mul_ps(a, _mm_load_ps(row + simd_width)));
            }
            // C carries no alignment guarantee, hence the unaligned stores.
            _mm_storeu_ps(c_row + j, out_v1);
            _mm_storeu_ps(c_row + j + simd_width, out_v2);
        }

        // Columns that do not fill a complete strip.
        for (int j = vec_cols; j < sizeX; j++) {
            float sum = 0.0f;
            for (int g = 0; g < sizeX; g++) {
                sum += a_row[g]*B[n*g + j];
            }
            c_row[j] = sum;
        }
    }

    if (D) {
        _mm_free(D);
    }
}