C++ AVX中的6元素双精度向量矩阵向量乘法
标签:c++, c, optimization, avx
我需要以双精度执行以下操作(即后文代码所计算的 y·(QK·x);原文此处的示意图已缺失):这些数字表示各个值在内存中的存储方式。我想用AVX实现这一点。如果我先将[QK]的列填充为8个元素,然后用[x]和[QK]执行矩阵向量乘法,最后再执行点积,这样做是否最好?
编辑:好的,所以我决定实现一个带有填充向量的32位浮点版本,如下所示:
// Single-precision AVX evaluation of ans[0] = y . (QK * x).
// QK is read as six columns spaced 8 floats apart; x supplies six scalars.
// NOTE(review): every 8-float load covers two lanes beyond the six rows used
// by the scalar reference - presumably QK and y are zero-padded there; confirm.
// NOTE(review): _mm256_load_ps is the aligned load form, so QK and y are
// expected to be 32-byte aligned - confirm against the allocation site.
// Perform matrix vector multiplication of QK*x
// Load first four columns of QK into 4 ymm registers
ymm0 = _mm256_load_ps((float *)(QK));
ymm1 = _mm256_load_ps((float *)(QK+8));
ymm2 = _mm256_load_ps((float *)(QK+16));
ymm3 = _mm256_load_ps((float *)(QK+24));
// Broadcast each of the first four values of x across a full register
ymm4 = _mm256_broadcast_ss((float *)(x));
ymm5 = _mm256_broadcast_ss((float *)(x+1));
ymm6 = _mm256_broadcast_ss((float *)(x+2));
ymm7 = _mm256_broadcast_ss((float *)(x+3));
// Scale each column by its x value in place - frees up ymm4,ymm5,ymm6,ymm7
ymm0 = _mm256_mul_ps(ymm0, ymm4);
ymm1 = _mm256_mul_ps(ymm1, ymm5);
ymm2 = _mm256_mul_ps(ymm2, ymm6);
ymm3 = _mm256_mul_ps(ymm3, ymm7);
// Add the four scaled columns pairwise into ymm0; this frees up ymm1,ymm2,and ymm3
ymm0 = _mm256_add_ps(ymm0, ymm1);
ymm2 = _mm256_add_ps(ymm2, ymm3);
ymm0 = _mm256_add_ps(ymm0, ymm2);
// Load the last two columns of QK
ymm1 = _mm256_load_ps((float *)(QK+32));
ymm2 = _mm256_load_ps((float *)(QK+40));
// Broadcast the last two values of x
ymm6 = _mm256_broadcast_ss((float *)(x+4));
ymm7 = _mm256_broadcast_ss((float *)(x+5));
// Scale the last two columns in place
ymm1 = _mm256_mul_ps(ymm1, ymm6);
ymm2 = _mm256_mul_ps(ymm2, ymm7);
// Accumulate them; this frees up every register except for ymm0
ymm0 = _mm256_add_ps(ymm0, ymm1);
ymm0 = _mm256_add_ps(ymm0, ymm2);
// QK*x is now held in ymm0 (one row per lane)
// Calculate dot product of y*(QK*x)
// Load y
ymm1 = _mm256_load_ps((float *)(y));
// Dot product = element-wise multiply followed by a horizontal sum
// Multiply in place
ymm0 = _mm256_mul_ps(ymm0, ymm1);
// Horizontal sum: fold the upper 128-bit half onto the lower half (8 -> 4 lanes)
__m128 xmm1 = _mm256_extractf128_ps(ymm0, 1);
__m128 xmm2 = _mm256_extractf128_ps(ymm0, 0);
xmm2 = _mm_add_ps(xmm1, xmm2);
// Bring the upper two partial sums down to the low lanes and add (4 -> 2 lanes)
xmm1 = _mm_movehl_ps(xmm1, xmm2);
xmm2 = _mm_add_ps(xmm1, xmm2);
// Move lane 1 into lane 0 and add the final pair (2 -> 1 lane)
xmm1 = _mm_shuffle_ps(xmm2, xmm2, 1);
xmm2 = _mm_add_ss(xmm1, xmm2);
// Extract the scalar total from lane 0
ans[0] = _mm_cvtss_f32(xmm2);
目前,它的运行速度大约是下面这段标量代码的3倍:
// Scalar reference for the AVX snippet: for each row r in 0..5, form
// sum over columns c of QK[8*c + r]*x[c], multiply by y[r], and add the six
// row terms together. Columns are 8 elements apart; only rows 0..5 are read.
ans[0] = (QK[0]*x[0]+QK[8]*x[1]+QK[16]*x[2]+QK[24]*x[3]+QK[32]*x[4]+QK[40]*x[5])*y[0]+
(QK[1]*x[0]+QK[9]*x[1]+QK[17]*x[2]+QK[25]*x[3]+QK[33]*x[4]+QK[41]*x[5])*y[1]+
(QK[2]*x[0]+QK[10]*x[1]+QK[18]*x[2]+QK[26]*x[3]+QK[34]*x[4]+QK[42]*x[5])*y[2]+
(QK[3]*x[0]+QK[11]*x[1]+QK[19]*x[2]+QK[27]*x[3]+QK[35]*x[4]+QK[43]*x[5])*y[3]+
(QK[4]*x[0]+QK[12]*x[1]+QK[20]*x[2]+QK[28]*x[3]+QK[36]*x[4]+QK[44]*x[5])*y[4]+
(QK[5]*x[0]+QK[13]*x[1]+QK[21]*x[2]+QK[29]*x[3]+QK[37]*x[4]+QK[45]*x[5])*y[5];
对于5亿次迭代,标准C版本的运行时间约为9秒,单精度AVX版本的运行时间约为3.5秒。如果我把最后的水平求和注释掉,它只需0.5秒左右。水平求和确实严重拖累了性能。
(回答)我编写了能高效完成这项工作的代码。使用AVX时,单线程速度几乎提高了4倍(这是在AVX上使用double所能达到的最佳效果)。以下是在一个6x6矩阵上分别对2000、32000和4000000个六分量的x、y向量运行的结果。这些规模大致对应于我的系统上的L2、L3以及远大于L3的缓存大小(每个向量占用48字节)。编辑:我在末尾添加了使用float的说明(和代码)。单线程AVX的加速比接近8倍。
i5-3317U (2 core ivy bridge)
compiled with: g++ m6.cpp -o m6 -O3 -mavx -fopenmp
nvec 2000, repeat 10000, 1 thread : time scalar/SIMD 3.95
nvec 32000, repeat 1000, 1 thread : time scalar/SIMD 3.53
nvec 4000000, repeat 10, 1 thread : time scalar/SIMD 3.28
1 thread for both the SIMD and non-SIMD functions
nvec 2000, repeat 10000, 2 threads: time scalar/SIMD 5.96
nvec 32000, repeat 1000, 2 threads: time scalar/SIMD 5.88
nvec 4000000, repeat 10, 2 threads: time scalar/SIMD 4.52
2 threads on the SIMD function vs. 1 thread on the non-SIMD function
compiled with g++ m6.cpp -o m6 -O3 -msse4.2 -fopenmp
nvec 2000, repeat 10000, 1 thread: time scalar/SIMD 2.15
nvec 32000, repeat 1000, 1 thread: time scalar/SIMD 2.06
nvec 4000000, repeat 10, 1 thread: time scalar/SIMD 2.00
基本算法是对x和y向量做SIMD,而不是对6x6矩阵做SIMD。一个关键点是,x和y向量必须存储为由"数组结构(SoA)块"组成的数组。这种布局也称为"数组结构的数组"(AoSoA)或混合SoA。您可以在"为便携式SIMD编程扩展类C语言"(Extending a C-like Language for Portable SIMD Programming)一文中了解更多信息。
下面是代码。函数aos2aosoa将由六分量向量组成的数组(AoS)转换为由SoA块组成的数组。如果希望充分利用SIMD,应用程序应该直接以这种形式生成向量(而不是事后转换)。此代码使用Agner Fog的vectorclass,它只是一些头文件。这段代码也适用于SSE(但加速比只有预期的2倍)。
一个小提醒:存放x、y向量的数组以及结果数组的大小必须是4的倍数;但向量的数量本身不必是4的倍数。
像这样编译
g++ m6.cpp -o m6 -O3 -mavx -fopenmp //system with AVX
g++ m6.cpp -o m6 -O3 -msse4.2 -fopenmp //system with SSE
在visual studio中,在编译器命令行选项中放置/arch:AVX
代码:
#include <stdio.h>
#include <omp.h>
#include "vectorclass.h"
#include <stdlib.h>
// Scalar reference: returns the bilinear form x^T * M * y for one pair of
// six-component vectors. M is indexed as a row-major 6x6 matrix
// (entry (row, col) lives at M[6*row + col]).
double prod_scalar(double *x, double *M, double *y) {
    double acc = 0.0f;
    for (int row = 0; row < 6; ++row) {
        const double *Mrow = M + 6*row;
        for (int col = 0; col < 6; ++col) {
            // Same association as the vector kernels: (x * M) * y.
            acc += x[row] * Mrow[col] * y[col];
        }
    }
    return acc;
}
void prod_block4(double *x, double *M, double *y, double *result) {
Vec4d sum4 = 0.0f;
for(int i=0; i<6; i++) {
Vec4d x4 = Vec4d().load(&x[4*i]);
for(int j=0; j<6; j++) {
Vec4d y4 = Vec4d().load(&y[4*j]);
sum4 += x4*M[i*6 + j]*y4;
}
}
sum4.store(result);
}
void prod_block4_unroll2(double *x, double *M, double *y, double *result) {
Vec4d sum4_1 = 0.0f;
Vec4d sum4_2 = 0.0f;
Vec4d yrow[6];
for(int i=0; i<6; i++) {
yrow[i] = Vec4d().load(&y[4*i]);
}
for(int i=0; i<6; i++) {
Vec4d x4 = Vec4d().load(&x[4*i]);
for(int j=0; j<6; j+=2) {
sum4_1 += x4*M[i*6 + j]*yrow[j];
sum4_2 += x4*M[i*6 + j+1]*yrow[j+1];
}
}
sum4_1 += sum4_2;
sum4_1.store(result);
}
// Runs the scalar kernel over nvec vector pairs stored back to back
// (six doubles per vector) and writes one result per pair.
void loop_scalar(double *x, double *M, double *y, double *result, const int nvec) {
    // #pragma omp parallel for   (left disabled, as in the original listing)
    for (int v = 0; v < nvec; ++v) {
        const int base = 6*v;
        result[v] = prod_scalar(x + base, M, y + base);
    }
}
// Runs the unrolled four-wide kernel over nvec/4 blocks. Each block consumes
// 24 doubles of x and of y and produces four results. When nvec is not a
// multiple of four the trailing nvec%4 vectors are not processed.
void loop_block4(double *x, double *M, double *y, double *result, const int nvec) {
    const int nblocks = nvec / 4;
    // #pragma omp parallel for   (left disabled, as in the original listing)
    for (int b = 0; b < nblocks; ++b) {
        const int in_off = 24*b;
        // prod_block4 is the non-unrolled alternative with the same signature.
        prod_block4_unroll2(x + in_off, M, y + in_off, result + 4*b);
    }
}
// Transposes one block of four six-component vectors: input element
// in[6*v + comp] (vector-major) is written to out[4*comp + v]
// (component-major). Reads and writes exactly 24 doubles.
void aos2soa(double *in, double *out) {
    for (int comp = 0; comp < 6; ++comp) {
        double *dst = out + 4*comp;
        for (int v = 0; v < 4; ++v) {
            dst[v] = in[6*v + comp];
        }
    }
}
// Converts nvec/4 consecutive blocks of four vectors from vector-major to
// component-major order, one 24-double block at a time. Vectors beyond the
// last full block of four are not converted.
void aos2aosoa(double *in, double *out, const int nvec) {
    const int nblocks = nvec / 4;
    for (int b = 0; b < nblocks; ++b) {
        const int off = 24*b;
        aos2soa(in + off, out + off);
    }
}
// Returns the accumulated absolute difference between two result arrays of
// length nvec (0 when they match exactly, or when nvec <= 0).
// The magnitude of each difference is summed rather than the signed value:
// with signed accumulation, errors of opposite sign cancel and a mismatch
// could be reported as 0.
double compare_results(double *r1, double * r2, const int nvec) {
    double out = 0.0;
    for (int i = 0; i < nvec; i++) {
        const double d = r1[i] - r2[i];
        out += (d < 0.0) ? -d : d;
    }
    return out;
}
// Benchmarks the scalar kernel against the four-wide SIMD kernel.
//   nvec   - number of six-component x/y vector pairs to process
//   repeat - number of timed passes over the whole data set
// Allocates 32-byte aligned buffers, fills M with a fixed pattern and x/y with
// rand() values in [0,1], converts x/y to the blocked layout, times both
// kernels, and prints the timings, their ratio and the result difference.
// If any allocation fails an error is printed to stderr and nothing is run.
void loop(const int nvec, const int repeat) {
    // Do the size arithmetic in size_t so large nvec cannot overflow int.
    const size_t nelem = (size_t)nvec * 6;
    double *M = (double*)_mm_malloc(6*6*sizeof(double),32);
    double *x_aos = (double*)_mm_malloc(nelem*sizeof(double),32);
    double *y_aos = (double*)_mm_malloc(nelem*sizeof(double),32);
    double *x_aosoa = (double*)_mm_malloc(nelem*sizeof(double),32);
    double *y_aosoa = (double*)_mm_malloc(nelem*sizeof(double),32);
    double *results_scalar = (double*)_mm_malloc((size_t)nvec*sizeof(double),32);
    double *results_vector = (double*)_mm_malloc((size_t)nvec*sizeof(double),32);

    // The largest configuration needs several hundred MB; bail out cleanly
    // instead of dereferencing a null pointer.
    if (!M || !x_aos || !y_aos || !x_aosoa || !y_aosoa || !results_scalar || !results_vector) {
        fprintf(stderr, "loop: allocation failed for nvec %d\n", nvec);
        double *bufs[7] = { M, x_aos, y_aos, x_aosoa, y_aosoa, results_scalar, results_vector };
        for (int b = 0; b < 7; b++) {
            if (bufs[b]) _mm_free(bufs[b]);
        }
        return;
    }

    // M[j*6 + i] = i*6 + j: the stored matrix is the transpose of 0..35.
    for(int i=0; i<6; i++) {
        for(int j=0; j<6; j++) {
            M[j*6 + i] = i*6 + j;
        }
    }
    for(int i=0; i<(6*nvec); i++) {
        double r1 = (double)rand()/RAND_MAX;
        double r2 = (double)rand()/RAND_MAX;
        x_aos[i] = r1;
        y_aos[i] = r2;
    }
    aos2aosoa(x_aos, x_aosoa, nvec);
    aos2aosoa(y_aos, y_aosoa, nvec);

    double dtime;
    dtime = omp_get_wtime();
    for(int i=0; i<repeat; i++) {
        loop_scalar(x_aos, M, y_aos, results_scalar, nvec);
    }
    dtime = omp_get_wtime() - dtime;
    printf("time scalar %f\n", dtime);
    double dtime_old = dtime;

    dtime = omp_get_wtime();
    for(int i=0; i<repeat; i++) {
        loop_block4(x_aosoa, M, y_aosoa, results_vector, nvec);
    }
    dtime = omp_get_wtime() - dtime;
    printf("time vector %f\n", dtime);
    printf("time scalar/vector %f\n", dtime_old/dtime);

    // The SIMD path only writes whole blocks of four results, so compare only
    // those; any tail of results_vector was never written.
    const int ncompared = (nvec/4)*4;
    printf("difference %f\n", compare_results(results_scalar, results_vector, ncompared));

    _mm_free(M);
    _mm_free(x_aos);
    _mm_free(y_aos);
    _mm_free(x_aosoa);
    _mm_free(y_aosoa);
    _mm_free(results_scalar);
    _mm_free(results_vector);
}
// Runs the benchmark for three problem sizes. The byte counts below are the
// combined x and y input (2 arrays * 6 doubles * 8 bytes per vector); the
// original notes associate them with L2, L3 and larger-than-L3 working sets.
int main() {
    const int nveca[3] = {
        2000,       // 2000*2*6*8 = 192kb
        32000,      // 32000*2*6*8 = 3Mb
        4*1000000   // 366Mb
    };
    const int nrepeat[3] = { 10000, 1000, 10 };

    for (int c = 0; c < 3; ++c) {
        printf("nvec %d, repeat %d\n", nveca[c], nrepeat[c]);
        loop(nveca[c], nrepeat[c]);
        printf("\n");
    }
}
下面是float而不是double的代码
#include <stdio.h>
#include <omp.h>
#include "vectorclass.h"
#include <stdlib.h>
// Rounds x down to a multiple of s by masking off the low bits; the mask form
// is only correct when s is a power of two. Not referenced by the code visible
// in this listing.
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
// Scalar reference: returns the bilinear form x^T * M * y for one pair of
// six-component vectors. M is indexed as a row-major 6x6 matrix
// (entry (row, col) lives at M[6*row + col]).
float prod_scalar(float *x, float *M, float *y) {
    float acc = 0.0f;
    for (int row = 0; row < 6; ++row) {
        const float *Mrow = M + 6*row;
        for (int col = 0; col < 6; ++col) {
            // Same association as the vector kernels: (x * M) * y.
            acc += x[row] * Mrow[col] * y[col];
        }
    }
    return acc;
}
// Scalar x^T * M * y with the column loop unrolled by two: even and odd
// columns accumulate into separate sums that are added at the end.
// M is indexed row-major as M[6*row + col].
float prod_scalar_unroll2(float *x, float *M, float *y) {
    float even_acc = 0.0f;
    float odd_acc = 0.0f;
    for (int row = 0; row < 6; ++row) {
        const float *Mrow = M + 6*row;
        for (int col = 0; col < 6; col += 2) {
            even_acc += x[row] * Mrow[col] * y[col];
            odd_acc += x[row] * Mrow[col + 1] * y[col + 1];
        }
    }
    return even_acc + odd_acc;
}
void prod_block4(float *x, float *M, float *y, float *result) {
Vec8f sum4 = 0.0f;
for(int i=0; i<6; i++) {
Vec8f x4 = Vec8f().load(&x[8*i]);
for(int j=0; j<6; j++) {
Vec8f y4 = Vec8f().load(&y[8*j]);
sum4 += x4*M[i*6 + j]*y4;
}
}
sum4.store(result);
}
void prod_block4_unroll2(float *x, float *M, float *y, float *result) {
Vec8f sum4_1 = 0.0f;
Vec8f sum4_2 = 0.0f;
Vec8f yrow[6];
for(int i=0; i<6; i++) {
yrow[i] = Vec8f().load(&y[8*i]);
}
for(int i=0; i<6; i++) {
Vec8f x4 = Vec8f().load(&x[8*i]);
for(int j=0; j<6; j+=2) {
sum4_1 += x4*M[i*6 + j]*yrow[j];
sum4_2 += x4*M[i*6 + j+1]*yrow[j+1];
}
}
sum4_1 += sum4_2;
sum4_1.store(result);
}
// Runs the scalar kernel over nvec vector pairs stored back to back
// (six floats per vector) and writes one result per pair.
void loop_scalar(float *x, float *M, float *y, float *result, const int nvec) {
    // #pragma omp parallel for   (left disabled, as in the original listing)
    for (int v = 0; v < nvec; ++v) {
        const int base = 6*v;
        // prod_scalar_unroll2 is a drop-in alternative with the same signature.
        result[v] = prod_scalar(x + base, M, y + base);
    }
}
// Runs the unrolled eight-wide kernel over nvec/8 blocks. Each block consumes
// 48 floats of x and of y and produces eight results. When nvec is not a
// multiple of eight the trailing nvec%8 vectors are not processed.
// The original listing carried several disabled OpenMP experiments (parallel
// for with static schedules, manual per-thread start/end ranges); the loop
// runs single-threaded here, exactly as it did with those commented out.
void loop_SIMD(float *x, float *M, float *y, float *result, const int nvec) {
    const int N = nvec/8;
    for (int b = 0; b < N; ++b) {
        const int in_off = 48*b;
        prod_block4_unroll2(x + in_off, M, y + in_off, result + 8*b);
    }
}
// Transposes one block of eight six-component vectors: input element
// in[6*v + comp] (vector-major) is written to out[8*comp + v]
// (component-major). Reads and writes exactly 48 floats.
void aos2soa(float *in, float *out) {
    for (int comp = 0; comp < 6; ++comp) {
        float *dst = out + 8*comp;
        for (int v = 0; v < 8; ++v) {
            dst[v] = in[6*v + comp];
        }
    }
}
// Converts nvec/8 consecutive blocks of eight vectors from vector-major to
// component-major order, one 48-float block at a time. Vectors beyond the
// last full block of eight are not converted.
void aos2aosoa(float *in, float *out, const int nvec) {
    const int nblocks = nvec / 8;
    for (int b = 0; b < nblocks; ++b) {
        const int off = 48*b;
        aos2soa(in + off, out + off);
    }
}
// Returns the accumulated absolute difference between two result arrays of
// length nvec (0 when they match exactly, or when nvec <= 0).
// The magnitude of each difference is summed rather than the signed value:
// with signed accumulation, errors of opposite sign cancel and a mismatch
// could be reported as 0.
float compare_results(float *r1, float * r2, const int nvec) {
    float out = 0.0f;
    for (int i = 0; i < nvec; i++) {
        const float d = r1[i] - r2[i];
        out += (d < 0.0f) ? -d : d;
    }
    return out;
}
// Benchmarks the scalar kernel against the eight-wide SIMD kernel.
//   nvec   - number of six-component x/y vector pairs to process
//   repeat - number of timed passes over the whole data set
// Allocates 64-byte aligned buffers, fills M with a fixed pattern and x/y with
// rand() values in [0,1], converts x/y to the blocked layout, times both
// kernels, and prints the timings, their ratio and the result difference.
// If any allocation fails an error is printed to stderr and nothing is run.
void loop(const int nvec, const int repeat) {
    // Do the size arithmetic in size_t so large nvec cannot overflow int.
    const size_t nelem = (size_t)nvec * 6;
    float *M = (float*)_mm_malloc(6*6*sizeof(float),64);
    float *x_aos = (float*)_mm_malloc(nelem*sizeof(float),64);
    float *y_aos = (float*)_mm_malloc(nelem*sizeof(float),64);
    float *x_aosoa = (float*)_mm_malloc(nelem*sizeof(float),64);
    float *y_aosoa = (float*)_mm_malloc(nelem*sizeof(float),64);
    float *results_scalar = (float*)_mm_malloc((size_t)nvec*sizeof(float),64);
    float *results_SIMD = (float*)_mm_malloc((size_t)nvec*sizeof(float),64);

    // The largest configuration needs a few hundred MB; bail out cleanly
    // instead of dereferencing a null pointer.
    if (!M || !x_aos || !y_aos || !x_aosoa || !y_aosoa || !results_scalar || !results_SIMD) {
        fprintf(stderr, "loop: allocation failed for nvec %d\n", nvec);
        float *bufs[7] = { M, x_aos, y_aos, x_aosoa, y_aosoa, results_scalar, results_SIMD };
        for (int b = 0; b < 7; b++) {
            if (bufs[b]) _mm_free(bufs[b]);
        }
        return;
    }

    // M[j*6 + i] = i*6 + j: the stored matrix is the transpose of 0..35.
    for(int i=0; i<6; i++) {
        for(int j=0; j<6; j++) {
            M[j*6 + i] = i*6 + j;
        }
    }
    for(int i=0; i<(6*nvec); i++) {
        float r1 = (float)rand()/RAND_MAX;
        float r2 = (float)rand()/RAND_MAX;
        x_aos[i] = r1;
        y_aos[i] = r2;
    }
    aos2aosoa(x_aos, x_aosoa, nvec);
    aos2aosoa(y_aos, y_aosoa, nvec);

    // omp_get_wtime() returns a double wall-clock reading; keeping it in a
    // float would round the absolute timestamps before they are subtracted
    // and wreck the measured interval, so the timers stay double.
    double dtime;
    dtime = omp_get_wtime();
    for(int i=0; i<repeat; i++) {
        loop_scalar(x_aos, M, y_aos, results_scalar, nvec);
    }
    dtime = omp_get_wtime() - dtime;
    printf("time scalar %f\n", dtime);
    double dtime_old = dtime;

    dtime = omp_get_wtime();
    for(int i=0; i<repeat; i++) {
        loop_SIMD(x_aosoa, M, y_aosoa, results_SIMD, nvec);
    }
    dtime = omp_get_wtime() - dtime;
    printf("time SIMD %f\n", dtime);
    printf("time scalar/SIMD %f\n", dtime_old/dtime);

    // The SIMD path only writes whole blocks of eight results, so compare only
    // those; any tail of results_SIMD was never written.
    const int ncompared = (nvec/8)*8;
    printf("difference %f\n", compare_results(results_scalar, results_SIMD, ncompared));

    _mm_free(M);
    _mm_free(x_aos);
    _mm_free(y_aos);
    _mm_free(x_aosoa);
    _mm_free(y_aosoa);
    _mm_free(results_scalar);
    _mm_free(results_SIMD);
}
// Runs the float benchmark for three problem sizes. The byte counts below are
// the combined x and y input (2 arrays * 6 floats * 4 bytes per vector).
// NOTE(review): the original listing carried the double-precision size notes
// (192kb / 3Mb / 366Mb, tagged L2 / L3) unchanged; the figures here are
// recomputed for float.
int main() {
    const int nveca[3] = {
        2000,       // 2000*2*6*4 = 96 kB
        32000,      // 32000*2*6*4 = ~1.5 MB
        5*1000000   // 5000000*2*6*4 = 240 MB
    };
    const int nrepeat[3] = { 10000, 1000, 100 };

    for (int c = 0; c < 3; ++c) {
        printf("nvec %d, repeat %d\n", nveca[c], nrepeat[c]);
        loop(nveca[c], nrepeat[c]);
        printf("\n");
    }
}
#include <stdio.h>
#include <omp.h>
#include "vectorclass.h"
#include <stdlib.h>
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
float prod_scalar(float *x, float *M, float *y) {
float sum = 0.0f;
for(int i=0; …(代码片段在此处被截断;以下为原帖的评论)
评论:如果是Windows 7,您可以使用8个ymm寄存器:6个用于矩阵元素,1个用于向量a,1个用于向量b。在Linux上则有16个可用的ymm寄存器。
评论:哎呀,我应该说明这些是双精度的。我会编辑这个问题。
评论:@huseyintugrulbuyukisik 您在x86上有8个ymm寄存器,在x64上有16个。
评论:除非有人花时间亲自实现并做性能分析,否则很难明确回答您的问题。
评论:您是否考虑过将QK行的前4个元素和y行的前4个元素加载到AVX寄存器中,并将最后两个元素加载到SSE寄存器中?这可能会省去填充的麻烦,我看不到任何缺点。别忘了AVX寄存器的低128位与SSE寄存器是同一组寄存器(互为别名)!
评论:我认为这是对混合使用AVX和SSE所带来的性能惩罚的误解。只有当编译后的代码混用了旧的SSE指令和新的VEX编码指令时,才会产生这种惩罚。这由编译器负责处理,即如果选择AVX作为目标体系结构,编译器会对所有指令使用VEX编码方案。只有在调用使用传统SSE指令的旧预编译库时,您才需要担心它。因此,请放心同时使用128位和256位的内部函数。(原文引用的链接已缺失,第13.6节)
评论:有趣的方法。但仍然存在您提到的问题(只需要结果矩阵的对角线分量)。我还认为在加载6x6的QK矩阵时仍然会出现对齐问题……我会进一步研究。今晚晚些时候我会把昨天所做的工作添加到问题中。
评论:我知道现在只能获取对角线分量。我会尽量在稍后或明天发布解决方案。
评论:好的,我编辑了我的答案以展示基本算法。我会尝试发布一些示例代码,并在接下来的几天内整理有关如何安排内存的部分(如果不够清楚的话)。
评论:听起来不错。我将在接下来的几天内看一下。
评论:我更新了(更改了)答案。使用double时,AVX相对于非SIMD代码得到了大约4倍的加速(取决于向量的数量)。我认为这与人们所期望的差不多。我验证了SIMD函数得到的结果与非SIMD函数相同。
#include <stdio.h>
#include <omp.h>
#include "vectorclass.h"
#include <stdlib.h>
// Rounds x down to a multiple of s by masking off the low bits; the mask form
// is only correct when s is a power of two. Not referenced by the code visible
// in this listing.
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
// Scalar reference: returns the bilinear form x^T * M * y for one pair of
// six-component vectors. M is indexed as a row-major 6x6 matrix
// (entry (row, col) lives at M[6*row + col]).
float prod_scalar(float *x, float *M, float *y) {
    float acc = 0.0f;
    for (int row = 0; row < 6; ++row) {
        const float *Mrow = M + 6*row;
        for (int col = 0; col < 6; ++col) {
            // Same association as the vector kernels: (x * M) * y.
            acc += x[row] * Mrow[col] * y[col];
        }
    }
    return acc;
}
// Scalar x^T * M * y with the column loop unrolled by two: even and odd
// columns accumulate into separate sums that are added at the end.
// M is indexed row-major as M[6*row + col].
float prod_scalar_unroll2(float *x, float *M, float *y) {
    float even_acc = 0.0f;
    float odd_acc = 0.0f;
    for (int row = 0; row < 6; ++row) {
        const float *Mrow = M + 6*row;
        for (int col = 0; col < 6; col += 2) {
            even_acc += x[row] * Mrow[col] * y[col];
            odd_acc += x[row] * Mrow[col + 1] * y[col + 1];
        }
    }
    return even_acc + odd_acc;
}
void prod_block4(float *x, float *M, float *y, float *result) {
Vec8f sum4 = 0.0f;
for(int i=0; i<6; i++) {
Vec8f x4 = Vec8f().load(&x[8*i]);
for(int j=0; j<6; j++) {
Vec8f y4 = Vec8f().load(&y[8*j]);
sum4 += x4*M[i*6 + j]*y4;
}
}
sum4.store(result);
}
void prod_block4_unroll2(float *x, float *M, float *y, float *result) {
Vec8f sum4_1 = 0.0f;
Vec8f sum4_2 = 0.0f;
Vec8f yrow[6];
for(int i=0; i<6; i++) {
yrow[i] = Vec8f().load(&y[8*i]);
}
for(int i=0; i<6; i++) {
Vec8f x4 = Vec8f().load(&x[8*i]);
for(int j=0; j<6; j+=2) {
sum4_1 += x4*M[i*6 + j]*yrow[j];
sum4_2 += x4*M[i*6 + j+1]*yrow[j+1];
}
}
sum4_1 += sum4_2;
sum4_1.store(result);
}
// Runs the scalar kernel over nvec vector pairs stored back to back
// (six floats per vector) and writes one result per pair.
void loop_scalar(float *x, float *M, float *y, float *result, const int nvec) {
    // #pragma omp parallel for   (left disabled, as in the original listing)
    for (int v = 0; v < nvec; ++v) {
        const int base = 6*v;
        // prod_scalar_unroll2 is a drop-in alternative with the same signature.
        result[v] = prod_scalar(x + base, M, y + base);
    }
}
// Runs the unrolled eight-wide kernel over nvec/8 blocks. Each block consumes
// 48 floats of x and of y and produces eight results. When nvec is not a
// multiple of eight the trailing nvec%8 vectors are not processed.
// The original listing carried several disabled OpenMP experiments (parallel
// for with static schedules, manual per-thread start/end ranges); the loop
// runs single-threaded here, exactly as it did with those commented out.
void loop_SIMD(float *x, float *M, float *y, float *result, const int nvec) {
    const int N = nvec/8;
    for (int b = 0; b < N; ++b) {
        const int in_off = 48*b;
        prod_block4_unroll2(x + in_off, M, y + in_off, result + 8*b);
    }
}
// Transposes one block of eight six-component vectors: input element
// in[6*v + comp] (vector-major) is written to out[8*comp + v]
// (component-major). Reads and writes exactly 48 floats.
void aos2soa(float *in, float *out) {
    for (int comp = 0; comp < 6; ++comp) {
        float *dst = out + 8*comp;
        for (int v = 0; v < 8; ++v) {
            dst[v] = in[6*v + comp];
        }
    }
}
// Converts nvec/8 consecutive blocks of eight vectors from vector-major to
// component-major order, one 48-float block at a time. Vectors beyond the
// last full block of eight are not converted.
void aos2aosoa(float *in, float *out, const int nvec) {
    const int nblocks = nvec / 8;
    for (int b = 0; b < nblocks; ++b) {
        const int off = 48*b;
        aos2soa(in + off, out + off);
    }
}
// Returns the accumulated absolute difference between two result arrays of
// length nvec (0 when they match exactly, or when nvec <= 0).
// The magnitude of each difference is summed rather than the signed value:
// with signed accumulation, errors of opposite sign cancel and a mismatch
// could be reported as 0.
float compare_results(float *r1, float * r2, const int nvec) {
    float out = 0.0f;
    for (int i = 0; i < nvec; i++) {
        const float d = r1[i] - r2[i];
        out += (d < 0.0f) ? -d : d;
    }
    return out;
}
// Benchmarks the scalar kernel against the eight-wide SIMD kernel.
//   nvec   - number of six-component x/y vector pairs to process
//   repeat - number of timed passes over the whole data set
// Allocates 64-byte aligned buffers, fills M with a fixed pattern and x/y with
// rand() values in [0,1], converts x/y to the blocked layout, times both
// kernels, and prints the timings, their ratio and the result difference.
// If any allocation fails an error is printed to stderr and nothing is run.
void loop(const int nvec, const int repeat) {
    // Do the size arithmetic in size_t so large nvec cannot overflow int.
    const size_t nelem = (size_t)nvec * 6;
    float *M = (float*)_mm_malloc(6*6*sizeof(float),64);
    float *x_aos = (float*)_mm_malloc(nelem*sizeof(float),64);
    float *y_aos = (float*)_mm_malloc(nelem*sizeof(float),64);
    float *x_aosoa = (float*)_mm_malloc(nelem*sizeof(float),64);
    float *y_aosoa = (float*)_mm_malloc(nelem*sizeof(float),64);
    float *results_scalar = (float*)_mm_malloc((size_t)nvec*sizeof(float),64);
    float *results_SIMD = (float*)_mm_malloc((size_t)nvec*sizeof(float),64);

    // The largest configuration needs a few hundred MB; bail out cleanly
    // instead of dereferencing a null pointer.
    if (!M || !x_aos || !y_aos || !x_aosoa || !y_aosoa || !results_scalar || !results_SIMD) {
        fprintf(stderr, "loop: allocation failed for nvec %d\n", nvec);
        float *bufs[7] = { M, x_aos, y_aos, x_aosoa, y_aosoa, results_scalar, results_SIMD };
        for (int b = 0; b < 7; b++) {
            if (bufs[b]) _mm_free(bufs[b]);
        }
        return;
    }

    // M[j*6 + i] = i*6 + j: the stored matrix is the transpose of 0..35.
    for(int i=0; i<6; i++) {
        for(int j=0; j<6; j++) {
            M[j*6 + i] = i*6 + j;
        }
    }
    for(int i=0; i<(6*nvec); i++) {
        float r1 = (float)rand()/RAND_MAX;
        float r2 = (float)rand()/RAND_MAX;
        x_aos[i] = r1;
        y_aos[i] = r2;
    }
    aos2aosoa(x_aos, x_aosoa, nvec);
    aos2aosoa(y_aos, y_aosoa, nvec);

    // omp_get_wtime() returns a double wall-clock reading; keeping it in a
    // float would round the absolute timestamps before they are subtracted
    // and wreck the measured interval, so the timers stay double.
    double dtime;
    dtime = omp_get_wtime();
    for(int i=0; i<repeat; i++) {
        loop_scalar(x_aos, M, y_aos, results_scalar, nvec);
    }
    dtime = omp_get_wtime() - dtime;
    printf("time scalar %f\n", dtime);
    double dtime_old = dtime;

    dtime = omp_get_wtime();
    for(int i=0; i<repeat; i++) {
        loop_SIMD(x_aosoa, M, y_aosoa, results_SIMD, nvec);
    }
    dtime = omp_get_wtime() - dtime;
    printf("time SIMD %f\n", dtime);
    printf("time scalar/SIMD %f\n", dtime_old/dtime);

    // The SIMD path only writes whole blocks of eight results, so compare only
    // those; any tail of results_SIMD was never written.
    const int ncompared = (nvec/8)*8;
    printf("difference %f\n", compare_results(results_scalar, results_SIMD, ncompared));

    _mm_free(M);
    _mm_free(x_aos);
    _mm_free(y_aos);
    _mm_free(x_aosoa);
    _mm_free(y_aosoa);
    _mm_free(results_scalar);
    _mm_free(results_SIMD);
}
// Runs the float benchmark for three problem sizes. The byte counts below are
// the combined x and y input (2 arrays * 6 floats * 4 bytes per vector).
// NOTE(review): the original listing carried the double-precision size notes
// (192kb / 3Mb / 366Mb, tagged L2 / L3) unchanged; the figures here are
// recomputed for float.
int main() {
    const int nveca[3] = {
        2000,       // 2000*2*6*4 = 96 kB
        32000,      // 32000*2*6*4 = ~1.5 MB
        5*1000000   // 5000000*2*6*4 = 240 MB
    };
    const int nrepeat[3] = { 10000, 1000, 100 };

    for (int c = 0; c < 3; ++c) {
        printf("nvec %d, repeat %d\n", nveca[c], nrepeat[c]);
        loop(nveca[c], nrepeat[c]);
        printf("\n");
    }
}