C 使用-ffast math编译AVX代码时出现错误？_C_Linux_Gcc_Simd_Fast Math

C 使用-ffast math编译AVX代码时出现错误？

c linux gcc

C 使用-ffast math编译AVX代码时出现错误？,c,linux,gcc,simd,fast-math,C,Linux,Gcc,Simd,Fast Math,我正在尝试使用GCCs内置simd支持编写一些内核。我有以下代码对AVX dot产品内核进行基准测试： #include <time.h> #include <stdio.h> #include <assert.h> #include <stdint.h> #include <stdlib.h> #include <unistd.h> // define rtdsc instruction static __inline_

我正在尝试使用GCCs内置simd支持编写一些内核。我有以下代码对AVX dot产品内核进行基准测试：

#include <time.h>
#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

// define rtdsc instruction
static __inline__ uint64_t tick(void) {
    uint32_t hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ( (uint64_t)lo)|( ((uint64_t)hi)<<32 );
}

// AVX dot product 
float avx_dot(float* __restrict__ ans, float* __restrict__ A, float* __restrict__ B, int N, ssize_t nprod, ssize_t shift) {
    assert(N % 32 == 0 && "N not divisible by 32");
    const int VECTOR_SIZE = 8;

    typedef float vec
        __attribute__ ((vector_size (sizeof(float) * VECTOR_SIZE)));

    N /= VECTOR_SIZE;

    for (ssize_t ii=0; ii < nprod; ii++) {
        vec *Av = (vec*)A;
        vec *Bv = (vec*)(B + ii*shift);

        vec temp[4] = {0,0,0,0};
        for(int jj = 0; jj < N; jj += 4) {
            temp[0] += Av[jj+0] * Bv[jj+0];
            temp[1] += Av[jj+1] * Bv[jj+1];
            temp[2] += Av[jj+2] * Bv[jj+2];
            temp[3] += Av[jj+3] * Bv[jj+3];
        }

        union {
            vec   tempv;
            float tempf[VECTOR_SIZE];
        };

        tempv = temp[0] + temp[1] + temp[2] + temp[3];

        ans[ii] = 0;
        for(int jj = 0; jj < VECTOR_SIZE; ++jj) {
            ans[ii] += tempf[jj];
        }
    }
}

int main(int argc, const char *argv[]) {
    const ssize_t NITER   = 1000;
    const ssize_t DECIM   = atoi(argv[2]);
    const ssize_t DOTPROD = atoi(argv[3]);
    ssize_t size = atoi(argv[1]);

    float* A; posix_memalign((void**)&A, 128, size*sizeof(float));
    float* B; posix_memalign((void**)&B, 128, (size+(DOTPROD-1)*DECIM)*sizeof(float));

    srand(time(NULL));
    for (ssize_t ii=0; ii < size;                   ii++) A[ii] = rand();
    for (ssize_t ii=0; ii < size+(DOTPROD-1)*DECIM; ii++) B[ii] = rand();

    printf("# size: %i  nproducts: %i  shift: %i\n", size, DOTPROD, DECIM);
    printf("# iter  answer  cycles  seconds  samprate\n");
    float results[DOTPROD];
    for (ssize_t ii=0; ii < NITER; ii++) {
        uint64_t beg = tick();
        avx_dot(results, A, B, size, DOTPROD, DECIM);
        uint64_t end = tick();

        float ans = 0;
        for (ssize_t jj=0; jj < DOTPROD; jj++) {
            ans += results[jj];
        }

        double    CLOCK   = 3300e6; 
        uint64_t cycles   = end-beg;
        double   seconds  = (double)cycles/CLOCK;
        double   samprate = (size*DOTPROD)/seconds;

        printf("%-5zd %f %lli %.3e %e\n", ii, ans, (unsigned long long)cycles, seconds, samprate);
    }

    return 0;
}

我第一次访问avx_dot内部的temp时遇到SEGFULT。但是，当使用以下工具进行编译时：

g++ -O3 -march=corei7-avx dotprod.cc -ffast-math -o dotprod

g++ -O3 -march=corei7-avx dotprod.cc -o dotprod

也就是说，如果不启用-ffast数学，它运行得很好。我很困惑，因为我相信快速数学不应该影响内存访问，所以我不知道SEGFULT是从哪里来的

我在跑：

CentOS Linux release 7.2.1511
gcc version 4.8.5 20150623 (Red Hat 4.8.5-4) (GCC)

有人能在他们的机器上确认这一行为并说明发生了什么吗？

我的随机猜测是数据对齐，因为它无法加载数据（失败的指令是….vmovaps（%rcx）），%ymm4…%rcx=0x603228，Bv位于0x603228，阅读该指令的文档显示需要16字节对齐）

进一步调查：

当Bv偏移8字节到B时，由于这一行（AVX需要16字节对齐），会出现问题：

asm中有什么有趣的地方吗？我不确定，我不一定是最擅长原始汇编的，但我会在gcc下运行它，并向我们展示它的insn错误。此外，还有

-march=sandybridge

。IDK它与

-march=corei7 avx

有何不同。顺便说一句，你不应该从自动矢量化中得到错误，除非你告诉gcc你的数据是对齐的，但结果却不是。（

vmovups

将在未对齐的地址上出错，这与其他指令的AVX内存操作数不同。

vmovups

具有相同的性能。）哦，我想是这样的，使用了typedef。

\u mm256\u loadu\u ps

vs.

\u mm256\u loadu\u ps

内部函数用于向编译器传递对齐信息。它需要对

ymm

表单进行32字节对齐，但是的，您可能是对的。只有对齐的加载/存储指令在AVX中具有对齐要求。AVX的主要功能之一是内存操作数不需要对齐，因此即使数据有时可能未对齐，

vaddps ymm0，ymm1，[mem]

也是安全的。跨越缓存线边界有一个perf惩罚，但是如果数据通常在运行时对齐，那么尽可能快地保持对齐的大小写并让硬件处理未对齐的大小写是有意义的，只需稍微减速。

vec *Bv = (vec*)(B + ii*shift);


./dotprod-fast 64 10 10
A=0x1125080
B=0x1125200
# size: 64  nproducts: 10  shift: 10
# iter  answer  cycles  seconds  samprate
Av=0x1125080
Bv=0x1125200
Av=0x1125080
Bv=0x1125228
Segmentation fault (core dumped)