C 为什么在这个矩阵乘法代码中_mm512_store_pd超慢?
我正在玩avx512和矩阵乘法,但我一定是做错了什么,因为当我尝试使用_mm512_store_pd存储结果时,我的性能很差 以下是相关的代码片段,首先是我正在使用的数据结构以及我如何初始化它:C 为什么在这个矩阵乘法代码中_mm512_store_pd超慢?,c,performance,gcc,matrix,avx512,C,Performance,Gcc,Matrix,Avx512,我正在玩avx512和矩阵乘法,但我一定是做错了什么,因为当我尝试使用_mm512_store_pd存储结果时,我的性能很差 以下是相关的代码片段,首先是我正在使用的数据结构以及我如何初始化它: typedef struct { double* values; int nb_l; int nb_c; } matrix; matrix* alloc_matrix(int nb_l, int nb_c){ matrix* tmp_m
typedef struct {
double* values;
int nb_l;
int nb_c;
} matrix;
matrix* alloc_matrix(int nb_l, int nb_c){
matrix* tmp_matrix = (matrix*)malloc(sizeof(matrix));
tmp_matrix->values = (double*)aligned_alloc(64, sizeof(double) * nb_l * nb_c);
tmp_matrix->nb_l = nb_l;
tmp_matrix->nb_c = nb_c;
return tmp_matrix;
}
下面是我如何将代码中其他地方初始化的两个矩阵相乘:
matrix* mult_matrix(matrix* A, matrix* B){
/* avx512 */
matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
double* res_ptr; // start index of the current line in res_matrix
double* B_ptr; // start index of the current line in B
__m512d A_broadcast, B_l_8, res_ptr_8;
for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
// broadcast current value of A eight times
A_broadcast = _mm512_set1_pd(A->values[idx_A]);
res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B+=8){
B_l_8 = _mm512_load_pd(&B_ptr[offset_B]);
res_ptr_8 = _mm512_load_pd(&res_ptr[offset_B]);
_mm512_store_pd(
&res_ptr[offset_B] ,
_mm512_fmadd_pd(A_broadcast, B_l_8, res_ptr_8)
);
}
}
return res_matrix;
矩阵*mult_矩阵(矩阵*A,矩阵*B){
/*avx512*/
矩阵*res_矩阵=零矩阵(A->nb_l,B->nb_c);
double*res\u ptr;//res\u矩阵中当前行的开始索引
double*B_ptr;//B中当前行的开始索引
__m512d A_广播、B_l_8、res_ptr_8;
对于(无符号整数idx_A=0;idx_Anb_l*A->nb_c;idx_A++){
//广播A的当前值八次
A_broadcast=_mm512_set1_pd(A->values[idx_A]);
res_ptr=res_矩阵->值+(idx_A/A->nb_c)*B->nb_c;
B_ptr=B->values+(idx_A%A->nb_c)*B->nb_c;
对于(无符号整数偏移量_B=0;偏移量_Bnb_c;偏移量_B+=8){
B_l_8=_mm512_load_pd(&B_ptr[offset_B]);
res_ptr_8=_mm512_load_pd(&res_ptr[offset_B]);
_mm512_商店_pd(
&res_ptr[偏移量B],
_mm512_fmadd_pd(A_广播、B_l_8、res_ptr_8)
);
}
}
返回res_矩阵;
结果还可以,但_mm512 _store _pd占用了约90%的执行时间,实际上,此avx512代码仅比其非avx版本快
我已经尝试了我能想到的一切,但是我不明白为什么我在这段代码中有如此令人失望的表现。你知道吗
谢谢
编辑1
这是非avx代码
matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
double* res_ptr; // start index of the current line in res_matrix
double* B_ptr; // start index of the current line in B
for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B++){
res_ptr[offset_B] += A->values[idx_A] * B_ptr[offset_B];
}
}
return res_matrix;
matrix*res\u matrix=zero\u matrix(A->nb\u l,B->nb\u c);
double*res\u ptr;//res\u矩阵中当前行的开始索引
double*B_ptr;//B中当前行的开始索引
对于(无符号整数idx_A=0;idx_Anb_l*A->nb_c;idx_A++){
res_ptr=res_矩阵->值+(idx_A/A->nb_c)*B->nb_c;
B_ptr=B->values+(idx_A%A->nb_c)*B->nb_c;
对于(无符号整数偏移量B=0;偏移量Bnb\c;偏移量B++){
res_ptr[offset_B]+=A->values[idx_A]*B_ptr[offset_B];
}
}
返回res_矩阵;
所有矩阵均为512x512随机矩阵,每次乘法重复50次,运行时间取平均值
最后,为了测试我的代码的avx和非avx版本,下面的代码段应该是可以的
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
#include <immintrin.h>
typedef struct {
double* values;
int nb_l;
int nb_c;
} matrix;
matrix* alloc_matrix(int nb_l, int nb_c){
matrix* tmp_matrix = (matrix*)malloc(sizeof(matrix));
tmp_matrix->values = (double*)aligned_alloc(64, sizeof(double) * nb_l * nb_c);
tmp_matrix->nb_l = nb_l;
tmp_matrix->nb_c = nb_c;
return tmp_matrix;
}
void free_matrix(matrix** to_free){
free((*to_free)->values);
free(*to_free);
}
matrix* zero_matrix(int nb_l, int nb_c){
matrix* z_matrix;
z_matrix = alloc_matrix(nb_l, nb_c);
for (int idx=0; idx < nb_l * nb_c; idx++){
z_matrix->values[idx] = 0.0;
}
return z_matrix;
}
matrix* rand_matrix(int nb_l, int nb_c, double max_abs_val){
static struct timeval seed; //static variables are zeroed at initialization
matrix* rnd_matrix;
rnd_matrix = alloc_matrix(nb_l, nb_c);
if (seed.tv_sec == 0){ //ts_sec will never be zero after gettimeofday, whereas tv_usec could
gettimeofday(&seed, NULL);
srand((unsigned) seed.tv_usec);
}
for (int idx=0; idx < nb_l * nb_c; idx++){
rnd_matrix->values[idx] = max_abs_val * ((double)rand() / RAND_MAX * 2.0 - 1.0);
}
return rnd_matrix;
}
matrix* mult_matrix_avx(matrix* A, matrix* B){
/* pas trop mal en avx512 */
matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
double* res_ptr; // start index of the current line in res_matrix
double* B_ptr; // start index of the current line in B
__m512d A_broadcast, B_l_8, res_ptr_8;
for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
A_broadcast = _mm512_set1_pd(A->values[idx_A]); // broadcast current value of A eight times
res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B+=8){
B_l_8 = _mm512_load_pd(&B_ptr[offset_B]);
res_ptr_8 = _mm512_load_pd(&res_ptr[offset_B]);
_mm512_store_pd(&res_ptr[offset_B] , _mm512_fmadd_pd(A_broadcast, B_l_8, res_ptr_8));
}
}
return res_matrix;
}
matrix* mult_matrix(matrix* A, matrix* B){
/* non avx512 */
matrix* res_matrix = zero_matrix(A->nb_l, B->nb_c);
double* res_ptr; // start index of the current line in res_matrix
double* B_ptr; // start index of the current line in B
for (unsigned int idx_A = 0; idx_A < A->nb_l * A-> nb_c; idx_A++){
res_ptr = res_matrix->values + (idx_A / A->nb_c) * B->nb_c;
B_ptr = B->values + (idx_A % A->nb_c) * B->nb_c;
for (unsigned int offset_B = 0; offset_B < B->nb_c; offset_B++){
res_ptr[offset_B] += A->values[idx_A] * B_ptr[offset_B];
}
}
return res_matrix;
}
int main(int argc, char *argv[]){
struct timeval before;
struct timeval after;
matrix* A = rand_matrix(512, 512, 5);
matrix* B = rand_matrix(512, 512, 5);
matrix *C;
gettimeofday(&before, NULL);
for (int j=0; j<50;j++){
C = mult_matrix_avx(A, B);
free_matrix(&C); // we will measure the same overhead here and in the non avx version
}
gettimeofday(&after, NULL);
double delta = ((after.tv_sec - before.tv_sec) * 1000000 +
(after.tv_usec - before.tv_usec))/50;
printf("avx %lf ms\n", delta);
gettimeofday(&before, NULL);
for (int j=0; j<50;j++){
C = mult_matrix(A, B);
free_matrix(&C);
}
gettimeofday(&after, NULL);
delta = ((after.tv_sec - before.tv_sec) * 1000000 +
(after.tv_usec - before.tv_usec))/50;
printf("non avx %lf ms\n", delta);
free_matrix(&A);
free_matrix(&B);
return 0;
}
#包括
#包括
#包括
#包括
#包括
#包括
类型定义结构{
双*值;
国际nb_l;
国际nb_c;
}基质;
矩阵*alloc_矩阵(int nb_l,int nb_c){
矩阵*tmp_矩阵=(矩阵*)malloc(矩阵);
tmp_矩阵->值=(双*)对齐_分配(64,sizeof(双)*nb_l*nb_c);
tmp_矩阵->nb_l=nb_l;
tmp_矩阵->nb_c=nb_c;
返回tmp_矩阵;
}
无空洞矩阵(矩阵**至无空洞){
自由((*至自由)->值);
免费(*至免费);
}
矩阵*zero_矩阵(int nb_l,int nb_c){
矩阵*z_矩阵;
z_矩阵=alloc_矩阵(nb_l,nb_c);
对于(int idx=0;idx值[idx]=0.0;
}
返回z_矩阵;
}
矩阵*rand_矩阵(整数nb_l,整数nb_c,双倍最大值){
static struct timeval seed;//静态变量在初始化时归零
矩阵*rnd_矩阵;
rnd_矩阵=alloc_矩阵(nb_l,nb_c);
如果(seed.tv_sec==0){//ts_sec在gettimeofday之后永远不会为零,而tv_usec可以
gettimeofday(&种子,NULL);
srand((未签名)seed.tv_usec);
}
对于(int idx=0;idx值[idx]=max_abs_val*((双)rand()/rand_max*2.0-1.0);
}
返回rnd_矩阵;
}
矩阵*mult\u矩阵*avx(矩阵*A,矩阵*B){
/*pas trop mal en avx512*/
矩阵*res_矩阵=零矩阵(A->nb_l,B->nb_c);
double*res\u ptr;//res\u矩阵中当前行的开始索引
double*B_ptr;//B中当前行的开始索引
__m512d A_广播、B_l_8、res_ptr_8;
对于(无符号整数idx_A=0;idx_Anb_l*A->nb_c;idx_A++){
A_broadcast=_mm512_set1_pd(A->values[idx_A]);//广播A的当前值八次
res_ptr=res_矩阵->值+(idx_A/A->nb_c)*B->nb_c;
B_ptr=B->values+(idx_A%A->nb_c)*B->nb_c;
对于(无符号整数偏移量_B=0;偏移量_Bnb_c;偏移量_B+=8){
B_l_8=_mm512_load_pd(&B_ptr[offset_B]);
res_ptr_8=_mm512_load_pd(&res_ptr[offset_B]);
_mm512_存储_pd(&res_ptr[offset_B],_mm512_fmadd_pd(A_广播,B_l_8,res_ptr_8));
}
}
返回res_矩阵;
}
矩阵*mult_矩阵(矩阵*A,矩阵*B){
/*非avx512*/
矩阵*res_矩阵=零矩阵(A->nb_l,B->nb_c);
double*res\u ptr;//res\u矩阵中当前行的开始索引
double*B_ptr;//B中当前行的开始索引
对于(无符号整数idx_A=0;idx_Anb_l*A->nb_c;idx_A++){
res_ptr=res_矩阵->值+(idx_A/A->nb_c)*B->nb_c;
B_ptr=B->values+(idx_A%A->nb_c)*B->nb_c;
对于(无符号整数偏移量B=0;偏移量Bnb\c;偏移量B+
matrix* res_matrix = alloc_matrix(A->nb_l, B->nb_c);
double tmp;
for (unsigned int res_l=0; res_l < res_matrix->nb_l; res_l++){
for (unsigned int res_c=0; res_c < res_matrix->nb_c; res_c++){
tmp = 0.0;
for (int offset = 0; offset < A->nb_c; offset++){
tmp += A->values[res_l * A->nb_c + offset] *
B->values[offset * A->nb_c + res_c];
}
res_matrix->values[res_l * res_matrix->nb_c + res_c] = tmp;
}
}
return res_matrix;
matrix* res_matrix = alloc_matrix(A->nb_l, B->nb_c);
__m512d res_ptr_8;
for (unsigned int res_l=0; res_l < res_matrix->nb_l; res_l++){
for (unsigned int res_c=0; res_c < res_matrix->nb_c; res_c+=8){
// compute values from res_matrix[res_l, res_c] to [res_l, res_c+7]
res_ptr_8 = _mm512_set1_pd(0.0);
for (unsigned int offset_A_c = 0; offset_A_c < A->nb_c; offset_A_c++){
// on the res_l th line of A pick values one at a time
// at coordinates A[res_l, offset_A_c].
// Broadcast this value eight times into a mm512 vector
// and perform a dot product with the 8 values found in
// B from coordinates [offset_A_c, res_c] to [offset_A_c, res_c + 7]
res_ptr_8 = _mm512_fmadd_pd(
_mm512_set1_pd(A->values[res_l * A->nb_c + offset_A_c]),
_mm512_load_pd(&B->values[offset_A_c * B->nb_c + res_c]),
res_ptr_8);
}
_mm512_store_pd(&res_matrix->values[res_l*res_matrix->nb_c + res_c] , res_ptr_8);
}
}
#define NB_L_STRIDE 8
__m512d res_ptr_8[NB_L_STRIDE], B_ptr_8;
for (unsigned int res_l=0; res_l < res_matrix->nb_l; res_l+=NB_L_STRIDE){
for (unsigned int res_c=0; res_c < res_matrix->nb_c; res_c+=8){
for(unsigned int i=0; i<NB_L_STRIDE; i++)
res_ptr_8[i] = _mm512_setzero_pd();
for (unsigned int offset_A_c = 0; offset_A_c < A->nb_c; offset_A_c++){
// compute values from res_matrix[res_l, res_c] to [res_l, res_c+7]
// on the res_l th line of A pick values one at a time
// at coordinates A[res_l, offset_A_c].
// Broadcast this value eight times into a mm512 vector
// and perform a dot product with the 8 values found in
// B from coordinates [offset_A_c, res_c] to [offset_A_c, res_c + 7]
B_ptr_8 = _mm512_load_pd(&B->values[offset_A_c * B->nb_c + res_c]);
for(unsigned int i=0; i<NB_L_STRIDE; i++)
res_ptr_8[i] = _mm512_fmadd_pd(
_mm512_set1_pd(A->values[(res_l +i) * A->nb_c + offset_A_c]),
B_ptr_8,
res_ptr_8[i]);
}
for(unsigned int i=0; i<NB_L_STRIDE; i++)
_mm512_store_pd(
&res_matrix->values[(res_l + i)*res_matrix->nb_c + res_c] ,
res_ptr_8[i]);
}
}