Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/c/62.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C 为什么在这个矩阵乘法代码中_mm512_store_pd超慢?_C_Performance_Gcc_Matrix_Avx512 - Fatal编程技术网

C 为什么在这个矩阵乘法代码中_mm512_store_pd超慢?

C 为什么在这个矩阵乘法代码中_mm512_store_pd超慢?,c,performance,gcc,matrix,avx512,C,Performance,Gcc,Matrix,Avx512,我正在玩avx512和矩阵乘法,但我一定是做错了什么,因为当我尝试使用_mm512_store_pd存储结果时,我的性能很差 以下是相关的代码片段,首先是我正在使用的数据结构以及我如何初始化它: typedef struct { double* values; int nb_l; int nb_c; } matrix; matrix* alloc_matrix(int nb_l, int nb_c){ matrix* tmp_m

我正在玩avx512和矩阵乘法,但我一定是做错了什么,因为当我尝试使用_mm512_store_pd存储结果时,我的性能很差

以下是相关的代码片段,首先是我正在使用的数据结构以及我如何初始化它:


/*
 * Dense matrix of doubles held in one contiguous buffer.
 * The multiplication code indexes it row-major: element (l, c) lives at
 * values[l * nb_c + c].
 */
typedef struct {
        double* values; /* nb_l * nb_c elements */
        int nb_l;       /* number of lines (rows) */
        int nb_c;       /* number of columns */
} matrix;

/*
 * Allocate an nb_l x nb_c matrix whose value buffer is 64-byte aligned, so
 * that aligned 512-bit loads/stores can be used on it. The values are left
 * uninitialized.
 *
 * Returns NULL when a dimension is negative, when the byte size would
 * overflow size_t, or when an allocation fails. On success the caller owns
 * both the struct and its buffer; each must be released with free().
 */
matrix* alloc_matrix(int nb_l, int nb_c){
        const size_t alignment = 64;
        /* largest byte count that can still be rounded up without wrapping */
        const size_t max_bytes = (size_t)-1 - (alignment - 1);

        if (nb_l < 0 || nb_c < 0)
                return NULL;

        size_t count = (size_t)nb_l;
        if (nb_c != 0 && count > max_bytes / sizeof(double) / (size_t)nb_c)
                return NULL;
        count *= (size_t)nb_c;

        /* C11 aligned_alloc wants a size that is a multiple of the alignment;
         * round up, and never ask for 0 bytes so values is always usable. */
        size_t bytes = (count * sizeof(double) + alignment - 1) / alignment * alignment;
        if (bytes == 0)
                bytes = alignment;

        matrix* tmp_matrix = malloc(sizeof *tmp_matrix);
        if (tmp_matrix == NULL)
                return NULL;

        tmp_matrix->values = aligned_alloc(alignment, bytes);
        if (tmp_matrix->values == NULL){
                free(tmp_matrix); /* do not leak the header on partial failure */
                return NULL;
        }
        tmp_matrix->nb_l = nb_l;
        tmp_matrix->nb_c = nb_c;
        return tmp_matrix;
}
下面是我如何将代码中其他地方初始化的两个矩阵相乘:

/*
 * AVX-512 product of A and B, accumulated into a freshly zeroed matrix of
 * A->nb_l lines and B->nb_c columns, which is returned.
 *
 * Every element of A is broadcast and multiplied with the line of B selected
 * by that element's column; the products are added, 8 doubles at a time, to
 * the result line selected by that element's line.
 *
 * NOTE(review): the inner loop steps by 8 with no tail handling and uses the
 * aligned load/store intrinsics, so it appears to require B->nb_c to be a
 * multiple of 8 and 64-byte aligned buffers. It also does not check that
 * A->nb_c equals B->nb_l. The closing brace of this function is missing from
 * this excerpt.
 */
matrix* mult_matrix(matrix* A, matrix* B){
        /* avx512 */
        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        double* res_ptr; // start of the current line in res_matrix
        double* B_ptr; // start of the current line in B

        __m512d A_broadcast, B_l_8, res_ptr_8;
        // A is walked as a flat array: idx_A / A->nb_c is the line, idx_A % A->nb_c the column
        for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
                // broadcast the current value of A into all eight lanes
                A_broadcast = _mm512_set1_pd(A->values[idx_A]);
                res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
                B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
                for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B+=8){
                        B_l_8 = _mm512_load_pd(&B_ptr[offset_B]);
                        res_ptr_8 = _mm512_load_pd(&res_ptr[offset_B]);
                        // res[offset_B .. offset_B+7] += A value * B[offset_B .. offset_B+7];
                        // the result line is re-read and re-written for every element of A
                        _mm512_store_pd(
                                &res_ptr[offset_B] , 
                                _mm512_fmadd_pd(A_broadcast, B_l_8, res_ptr_8)
                                );
                }
        }
        return res_matrix;
矩阵*mult_矩阵(矩阵*A,矩阵*B){
/*avx512*/
矩阵*res_矩阵=零矩阵(A->nb_l,B->nb_c);
double*res\u ptr;//res\u矩阵中当前行的开始索引
double*B_ptr;//B中当前行的开始索引
__m512d A_广播、B_l_8、res_ptr_8;
对于(无符号整数idx_A=0;idx_Anb_l*A->nb_c;idx_A++){
//广播A的当前值八次
A_broadcast=_mm512_set1_pd(A->values[idx_A]);
res_ptr=res_矩阵->值+(idx_A/A->nb_c)*B->nb_c;
B_ptr=B->values+(idx_A%A->nb_c)*B->nb_c;
对于(无符号整数偏移量_B=0;偏移量_Bnb_c;偏移量_B+=8){
B_l_8=_mm512_load_pd(&B_ptr[offset_B]);
res_ptr_8=_mm512_load_pd(&res_ptr[offset_B]);
_mm512_商店_pd(
&res_ptr[偏移量B],
_mm512_fmadd_pd(A_广播、B_l_8、res_ptr_8)
);
}
}
返回res_矩阵;
结果还可以,但_mm512_store_pd占用了约90%的执行时间,实际上,此avx512代码仅比其非avx版本快

我已经尝试了我能想到的一切,但是我不明白为什么我在这段代码中有如此令人失望的表现。你知道吗

谢谢

编辑1

这是非avx代码

        /*
         * Scalar (non-AVX) body of the product: same traversal as the AVX
         * version, one double at a time. The enclosing function header is not
         * part of this excerpt; A and B are its matrix* parameters.
         */
        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        double* res_ptr; // start of the current line in res_matrix
        double* B_ptr; // start of the current line in B

        // A is walked as a flat array: idx_A / A->nb_c is the line, idx_A % A->nb_c the column
        for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
                res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c; 
                B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c; 
                for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B++){                    
                        // accumulate A[line, column] * B[column, offset_B] into res[line, offset_B]
                        res_ptr[offset_B] += A->values[idx_A] * B_ptr[offset_B];
                }
        }
        return res_matrix;



matrix*res\u matrix=zero\u matrix(A->nb\u l,B->nb\u c);
double*res\u ptr;//res\u矩阵中当前行的开始索引
double*B_ptr;//B中当前行的开始索引
对于(无符号整数idx_A=0;idx_Anb_l*A->nb_c;idx_A++){
res_ptr=res_矩阵->值+(idx_A/A->nb_c)*B->nb_c;
B_ptr=B->values+(idx_A%A->nb_c)*B->nb_c;
对于(无符号整数偏移量B=0;偏移量Bnb\c;偏移量B++){
res_ptr[offset_B]+=A->values[idx_A]*B_ptr[offset_B];
}
}
返回res_矩阵;
所有矩阵均为512x512随机矩阵,每次乘法重复50次,运行时间取平均值

最后,为了测试我的代码的avx和非avx版本,下面的代码段应该是可以的

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
#include <immintrin.h>

/*
 * Dense matrix of doubles held in one contiguous buffer.
 * The multiplication code indexes it row-major: element (l, c) lives at
 * values[l * nb_c + c].
 */
typedef struct {
        double* values; /* nb_l * nb_c elements */
        int nb_l;       /* number of lines (rows) */
        int nb_c;       /* number of columns */
} matrix;

/*
 * Allocate an nb_l x nb_c matrix whose value buffer is 64-byte aligned, so
 * that aligned 512-bit loads/stores can be used on it. The values are left
 * uninitialized.
 *
 * Returns NULL when a dimension is negative, when the byte size would
 * overflow size_t, or when an allocation fails. On success the caller owns
 * both the struct and its buffer; each must be released with free().
 */
matrix* alloc_matrix(int nb_l, int nb_c){
        const size_t alignment = 64;
        /* largest byte count that can still be rounded up without wrapping */
        const size_t max_bytes = (size_t)-1 - (alignment - 1);

        if (nb_l < 0 || nb_c < 0)
                return NULL;

        size_t count = (size_t)nb_l;
        if (nb_c != 0 && count > max_bytes / sizeof(double) / (size_t)nb_c)
                return NULL;
        count *= (size_t)nb_c;

        /* C11 aligned_alloc wants a size that is a multiple of the alignment;
         * round up, and never ask for 0 bytes so values is always usable. */
        size_t bytes = (count * sizeof(double) + alignment - 1) / alignment * alignment;
        if (bytes == 0)
                bytes = alignment;

        matrix* tmp_matrix = malloc(sizeof *tmp_matrix);
        if (tmp_matrix == NULL)
                return NULL;

        tmp_matrix->values = aligned_alloc(alignment, bytes);
        if (tmp_matrix->values == NULL){
                free(tmp_matrix); /* do not leak the header on partial failure */
                return NULL;
        }
        tmp_matrix->nb_l = nb_l;
        tmp_matrix->nb_c = nb_c;
        return tmp_matrix;
}

/*
 * Release a matrix and its value buffer, then reset the caller's pointer to
 * NULL so it cannot be used or freed again by mistake.
 *
 * to_free: address of the matrix pointer to release. A NULL address or a
 *          NULL matrix pointer is accepted and does nothing.
 */
void free_matrix(matrix** to_free){
        if (to_free == NULL || *to_free == NULL)
                return;
        free((*to_free)->values);
        free(*to_free);
        *to_free = NULL;
}

/*
 * Allocate an nb_l x nb_c matrix and set every element to 0.0.
 *
 * Returns NULL when a dimension is negative or the allocation fails;
 * otherwise the caller owns the returned matrix.
 */
matrix* zero_matrix(int nb_l, int nb_c){
        if (nb_l < 0 || nb_c < 0)
                return NULL;

        matrix* z_matrix = alloc_matrix(nb_l, nb_c);
        if (z_matrix == NULL)
                return NULL;
        if (z_matrix->values == NULL){
                /* header was allocated but the value buffer was not */
                free(z_matrix);
                return NULL;
        }

        /* size_t count: the int product nb_l * nb_c can overflow for large matrices */
        const size_t count = (size_t)nb_l * (size_t)nb_c;
        for (size_t idx = 0; idx < count; idx++){
                z_matrix->values[idx] = 0.0;
        }
        return z_matrix;
}
/*
 * Allocate an nb_l x nb_c matrix filled with pseudo-random values in
 * [-max_abs_val, max_abs_val].
 *
 * The first successful call seeds rand() from the microsecond part of the
 * current time; later calls reuse that seed.
 *
 * Returns NULL when a dimension is negative or the allocation fails;
 * otherwise the caller owns the returned matrix.
 */
matrix* rand_matrix(int nb_l, int nb_c, double max_abs_val){
        // static storage is zero-initialized, so tv_sec == 0 means "not seeded yet"
        static struct timeval seed;

        if (nb_l < 0 || nb_c < 0)
                return NULL;

        matrix* rnd_matrix = alloc_matrix(nb_l, nb_c);
        if (rnd_matrix == NULL)
                return NULL;
        if (rnd_matrix->values == NULL){
                /* header was allocated but the value buffer was not */
                free(rnd_matrix);
                return NULL;
        }

        if (seed.tv_sec == 0){ // tv_sec is not expected to be zero after gettimeofday, whereas tv_usec could be
                gettimeofday(&seed, NULL);
                srand((unsigned) seed.tv_usec);
        }

        /* size_t count: the int product nb_l * nb_c can overflow for large matrices */
        const size_t count = (size_t)nb_l * (size_t)nb_c;
        for (size_t idx = 0; idx < count; idx++){
                // map rand() from [0, RAND_MAX] to [-1, 1], then scale
                rnd_matrix->values[idx] = max_abs_val * ((double)rand() / RAND_MAX * 2.0 - 1.0);
        }

        return rnd_matrix;
}

/*
 * AVX-512 matrix product: returns a newly allocated A * B with A->nb_l lines
 * and B->nb_c columns.
 *
 * Each element A[l, k] is broadcast and multiplied with line k of B; the
 * products are accumulated into line l of the result, 8 doubles per step.
 *
 * Returns NULL when A or B is NULL, when a dimension is negative, when
 * A->nb_c differs from B->nb_l, or when the result cannot be allocated.
 * The caller owns the returned matrix.
 *
 * The vector path uses aligned loads/stores and therefore relies on the
 * value buffers being 64-byte aligned, as alloc_matrix provides.
 */
matrix* mult_matrix_avx(matrix* A, matrix* B){
        if (A == NULL || B == NULL)
                return NULL;
        if (A->nb_l < 0 || A->nb_c < 0 || B->nb_c < 0 || A->nb_c != B->nb_l)
                return NULL;

        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        if (res_matrix == NULL)
                return NULL;

        const size_t nb_lines = (size_t)A->nb_l;
        const size_t nb_inner = (size_t)A->nb_c;
        const size_t nb_cols = (size_t)B->nb_c;
        if (nb_lines == 0 || nb_cols == 0)
                return res_matrix;

        /* Lines start on a 64-byte boundary only when their length is a
         * multiple of 8 doubles. Otherwise aligned 512-bit accesses would
         * fault and the last step would run past the line, so the whole line
         * goes through the scalar loop instead. */
        const size_t vec_cols = (nb_cols % 8 == 0) ? nb_cols : 0;

        for (size_t line = 0; line < nb_lines; line++){
                double* res_ptr = res_matrix->values + line * nb_cols; // current line of the result
                for (size_t k = 0; k < nb_inner; k++){
                        const double a_val = A->values[line * nb_inner + k];
                        const double* B_ptr = B->values + k * nb_cols; // current line of B
                        const __m512d A_broadcast = _mm512_set1_pd(a_val); // A[line, k] in all eight lanes

                        size_t offset_B = 0;
                        for (; offset_B < vec_cols; offset_B += 8){
                                const __m512d B_l_8 = _mm512_load_pd(&B_ptr[offset_B]);
                                const __m512d res_ptr_8 = _mm512_load_pd(&res_ptr[offset_B]);
                                _mm512_store_pd(&res_ptr[offset_B] , _mm512_fmadd_pd(A_broadcast, B_l_8, res_ptr_8));
                        }
                        /* scalar path: runs only when the line length is not a multiple of 8 */
                        for (; offset_B < nb_cols; offset_B++){
                                res_ptr[offset_B] += a_val * B_ptr[offset_B];
                        }
                }
        }
        return res_matrix;
}

/*
 * Scalar (non-AVX-512) matrix product: returns a newly allocated A * B with
 * A->nb_l lines and B->nb_c columns.
 *
 * Uses the same traversal as mult_matrix_avx: each A[l, k] scales line k of B
 * and the result is accumulated into line l of the output.
 *
 * Returns NULL when A or B is NULL, when a dimension is negative, when
 * A->nb_c differs from B->nb_l, or when the result cannot be allocated.
 * The caller owns the returned matrix.
 */
matrix* mult_matrix(matrix* A, matrix* B){
        /* non avx512 */
        if (A == NULL || B == NULL)
                return NULL;
        if (A->nb_l < 0 || A->nb_c < 0 || B->nb_c < 0 || A->nb_c != B->nb_l)
                return NULL;

        matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
        if (res_matrix == NULL)
                return NULL;

        const size_t nb_lines = (size_t)A->nb_l;
        const size_t nb_inner = (size_t)A->nb_c;
        const size_t nb_cols = (size_t)B->nb_c;
        if (nb_lines == 0 || nb_cols == 0)
                return res_matrix;

        for (size_t line = 0; line < nb_lines; line++){
                double* res_ptr = res_matrix->values + line * nb_cols; // current line of the result
                for (size_t k = 0; k < nb_inner; k++){
                        const double a_val = A->values[line * nb_inner + k];
                        const double* B_ptr = B->values + k * nb_cols; // current line of B
                        for (size_t offset_B = 0; offset_B < nb_cols; offset_B++){
                                res_ptr[offset_B] += a_val * B_ptr[offset_B];
                        }
                }
        }
        return res_matrix;
}
/* Milliseconds elapsed between two gettimeofday() samples. */
static double elapsed_ms(const struct timeval* start, const struct timeval* end){
        return (double)(end->tv_sec - start->tv_sec) * 1000.0 +
                (double)(end->tv_usec - start->tv_usec) / 1000.0;
}

/*
 * Benchmark driver: multiplies two random 512x512 matrices 50 times with the
 * AVX-512 version, then 50 times with the scalar version, and prints the
 * average time per multiplication in milliseconds.
 *
 * Each timed loop includes allocating and freeing the result, so both
 * versions carry the same overhead.
 *
 * Returns 0 on success, EXIT_FAILURE if a matrix could not be produced.
 */
int main(int argc, char *argv[]){
        (void)argc;
        (void)argv;

        enum { DIM = 512, NB_RUNS = 50 };

        struct timeval before;
        struct timeval after;
        int status = EXIT_FAILURE;

        matrix* A = rand_matrix(DIM, DIM, 5);
        matrix* B = rand_matrix(DIM, DIM, 5);
        if (A == NULL || B == NULL){
                fprintf(stderr, "matrix allocation failed\n");
                goto out;
        }

        gettimeofday(&before, NULL);
        for (int j = 0; j < NB_RUNS; j++){
                matrix* C = mult_matrix_avx(A, B);
                if (C == NULL){
                        fprintf(stderr, "avx multiplication failed\n");
                        goto out;
                }
                free_matrix(&C); // we will measure the same overhead here and in the non avx version
        }
        gettimeofday(&after, NULL);
        // floating-point average in real milliseconds (the label says "ms")
        printf("avx %lf ms\n", elapsed_ms(&before, &after) / NB_RUNS);

        gettimeofday(&before, NULL);
        for (int j = 0; j < NB_RUNS; j++){
                matrix* C = mult_matrix(A, B);
                if (C == NULL){
                        fprintf(stderr, "non avx multiplication failed\n");
                        goto out;
                }
                free_matrix(&C);
        }
        gettimeofday(&after, NULL);
        printf("non avx %lf ms\n", elapsed_ms(&before, &after) / NB_RUNS);

        status = 0;
out:
        if (A != NULL)
                free_matrix(&A);
        if (B != NULL)
                free_matrix(&B);
        return status;
}

#包括
#包括
#包括
#包括
#包括
#包括
类型定义结构{
双*值;
国际nb_l;
国际nb_c;
}基质;
矩阵*alloc_矩阵(int nb_l,int nb_c){
矩阵*tmp_矩阵=(矩阵*)malloc(矩阵);
tmp_矩阵->值=(双*)对齐_分配(64,sizeof(双)*nb_l*nb_c);
tmp_矩阵->nb_l=nb_l;
tmp_矩阵->nb_c=nb_c;
返回tmp_矩阵;
}
无空洞矩阵(矩阵**至无空洞){
自由((*至自由)->值);
免费(*至免费);
}
矩阵*zero_矩阵(int nb_l,int nb_c){
矩阵*z_矩阵;
z_矩阵=alloc_矩阵(nb_l,nb_c);
对于(int idx=0;idx值[idx]=0.0;
}
返回z_矩阵;
}
矩阵*rand_矩阵(整数nb_l,整数nb_c,双倍最大值){
static struct timeval seed;//静态变量在初始化时归零
矩阵*rnd_矩阵;
rnd_矩阵=alloc_矩阵(nb_l,nb_c);
如果(seed.tv_sec==0){//ts_sec在gettimeofday之后永远不会为零,而tv_usec可以
gettimeofday(&种子,NULL);
srand((未签名)seed.tv_usec);
}
对于(int idx=0;idx值[idx]=max_abs_val*((双)rand()/rand_max*2.0-1.0);
}
返回rnd_矩阵;
}
矩阵*mult\u矩阵*avx(矩阵*A,矩阵*B){
/*pas trop mal en avx512*/
矩阵*res_矩阵=零矩阵(A->nb_l,B->nb_c);
double*res\u ptr;//res\u矩阵中当前行的开始索引
double*B_ptr;//B中当前行的开始索引
__m512d A_广播、B_l_8、res_ptr_8;
对于(无符号整数idx_A=0;idx_Anb_l*A->nb_c;idx_A++){
A_broadcast=_mm512_set1_pd(A->values[idx_A]);//广播A的当前值八次
res_ptr=res_矩阵->值+(idx_A/A->nb_c)*B->nb_c;
B_ptr=B->values+(idx_A%A->nb_c)*B->nb_c;
对于(无符号整数偏移量_B=0;偏移量_Bnb_c;偏移量_B+=8){
B_l_8=_mm512_load_pd(&B_ptr[offset_B]);
res_ptr_8=_mm512_load_pd(&res_ptr[offset_B]);
_mm512_存储_pd(&res_ptr[offset_B],_mm512_fmadd_pd(A_广播,B_l_8,res_ptr_8));
}
}
返回res_矩阵;
}
矩阵*mult_矩阵(矩阵*A,矩阵*B){
/*非avx512*/
矩阵*res_矩阵=零矩阵(A->nb_l,B->nb_c);
double*res\u ptr;//res\u矩阵中当前行的开始索引
double*B_ptr;//B中当前行的开始索引
对于(无符号整数idx_A=0;idx_Anb_l*A->nb_c;idx_A++){
res_ptr=res_矩阵->值+(idx_A/A->nb_c)*B->nb_c;
B_ptr=B->values+(idx_A%A->nb_c)*B->nb_c;
对于(无符号整数偏移量B=0;偏移量Bnb\c;偏移量B+
        /*
         * Naive product: each result element is the dot product of one line
         * of A with one column of B, accumulated in a local and stored once.
         * The enclosing function header is not part of this excerpt; A and B
         * are its matrix* parameters.
         */
        matrix* res_matrix = alloc_matrix(A->nb_l, B->nb_c);
        double tmp;
        for (unsigned int res_l=0; res_l < res_matrix->nb_l; res_l++){
                for (unsigned int res_c=0; res_c < res_matrix->nb_c; res_c++){
                        tmp = 0.0;
                        for (int offset = 0; offset < A->nb_c; offset++){
                                // B is row-major with lines of B->nb_c elements, so
                                // B[offset, res_c] is at offset * B->nb_c + res_c
                                // (using A->nb_c is only right for square matrices)
                                tmp += A->values[res_l * A->nb_c + offset] *
                                        B->values[offset * B->nb_c + res_c];
                        }
                        res_matrix->values[res_l * res_matrix->nb_c + res_c] = tmp;
                }

        }
        return res_matrix;
        /*
         * AVX-512 product that keeps the running sum for 8 adjacent result
         * columns in a vector variable and stores it once per tile, instead
         * of loading and storing the result on every multiply-add.
         * The enclosing function header and return statement are not part of
         * this excerpt; A and B are its matrix* parameters.
         * NOTE(review): res_c advances by 8 with no tail handling and the
         * load/store are the aligned variants, so this appears to require
         * B->nb_c to be a multiple of 8 and 64-byte aligned buffers.
         */
        matrix* res_matrix = alloc_matrix(A->nb_l, B->nb_c);
        __m512d res_ptr_8;

        for (unsigned int res_l=0; res_l < res_matrix->nb_l; res_l++){
                for (unsigned int res_c=0; res_c < res_matrix->nb_c; res_c+=8){
                        // compute values from res_matrix[res_l, res_c] to [res_l, res_c+7]
                        res_ptr_8 = _mm512_set1_pd(0.0);
                        for (unsigned int offset_A_c = 0; offset_A_c < A->nb_c; offset_A_c++){
                                // On line res_l of A, pick the values one at a time,
                                // at coordinates A[res_l, offset_A_c].
                                // Broadcast that value into all eight lanes of a 512-bit vector,
                                // multiply it with the 8 values of B at coordinates
                                // [offset_A_c, res_c] to [offset_A_c, res_c + 7],
                                // and add the products to the accumulator.
                                res_ptr_8 = _mm512_fmadd_pd(
                                        _mm512_set1_pd(A->values[res_l * A->nb_c + offset_A_c]),
                                        _mm512_load_pd(&B->values[offset_A_c * B->nb_c + res_c]),
                                        res_ptr_8);
                        }
                        // single store of the finished 8-wide tile
                        _mm512_store_pd(&res_matrix->values[res_l*res_matrix->nb_c + res_c] , res_ptr_8);
                }
        }
        /*
         * Blocked AVX-512 product: computes NB_L_STRIDE result lines at once
         * for each 8-column tile, so every vector loaded from B is reused for
         * NB_L_STRIDE multiply-adds before the next load.
         * res_matrix, A and B are declared outside this excerpt.
         * NOTE(review): lines are processed in groups of NB_L_STRIDE and
         * columns in groups of 8 with no tail handling, so this appears to
         * require nb_l to be a multiple of NB_L_STRIDE and nb_c a multiple
         * of 8.
         */
        #define NB_L_STRIDE 8
        // one accumulator per result line of the block, plus the shared vector from B
        __m512d res_ptr_8[NB_L_STRIDE], B_ptr_8;
        for (unsigned int res_l=0; res_l < res_matrix->nb_l; res_l+=NB_L_STRIDE){
                for (unsigned int res_c=0; res_c < res_matrix->nb_c; res_c+=8){
                        // reset the accumulators for this tile
                        for(unsigned int i=0; i<NB_L_STRIDE; i++)
                                res_ptr_8[i] = _mm512_setzero_pd();
                        for (unsigned int offset_A_c = 0; offset_A_c < A->nb_c; offset_A_c++){
                                // Compute values from res_matrix[res_l + i, res_c] to [res_l + i, res_c+7].
                                // Load the 8 values of B at coordinates [offset_A_c, res_c]
                                // to [offset_A_c, res_c + 7] once.
                                // Then, for each line res_l + i of the block, broadcast
                                // A[res_l + i, offset_A_c] into all eight lanes and
                                // multiply-add it with that vector into accumulator i.
                                B_ptr_8 = _mm512_load_pd(&B->values[offset_A_c * B->nb_c + res_c]);
                                for(unsigned int i=0; i<NB_L_STRIDE; i++)
                                        res_ptr_8[i] = _mm512_fmadd_pd(
                                                _mm512_set1_pd(A->values[(res_l +i) * A->nb_c + offset_A_c]),
                                                B_ptr_8,
                                                res_ptr_8[i]);

                        }
                        // store each finished 8-wide tile once
                        for(unsigned int i=0; i<NB_L_STRIDE; i++)
                                _mm512_store_pd(
                                        &res_matrix->values[(res_l + i)*res_matrix->nb_c + res_c] ,
                                        res_ptr_8[i]);
                }
        }