C 学习使用内部函数——使用_mm256_sub_ps的segm故障_C_Vectorization_Intel_Intrinsics

C 学习使用内部函数（intrinsics）——使用 _mm256_sub_ps 时出现段错误（segmentation fault）

C 学习使用内部函数——使用_mm256_sub_ps的segm故障,c,vectorization,intel,intrinsics,C,Vectorization,Intel,Intrinsics,我正在努力学习如何使用内在论。所以，我的c代码是： void Vor( const int NbPoints, const int height, const int width, float * X, float * Y, int * V, int * const ouVor ) { float Xd , Yd; float Distance ,initDistance = FLT_MAX; int

我正在努力学习如何使用内部函数（intrinsics）。所以，我的 C 代码是：

/*
 * Vor - brute-force nearest-point labelling of a width x height pixel grid.
 *
 * For every pixel (x, y) the squared Euclidean distance to each point
 * (X[i], Y[i]) is evaluated, and the value V[i] of the closest point is
 * stored in ouVor[x + y * width]. When several points are equally close,
 * the one with the lowest index wins (strict '<' comparison).
 *
 * Parameters:
 *   NbPoints - number of entries read from X, Y and V
 *   height   - number of pixel rows
 *   width    - number of pixel columns
 *   X, Y     - point coordinates (read only)
 *   V        - value associated with each point (read only)
 *   ouVor    - output, indexed as x + y * width for x < width, y < height
 *
 * If NbPoints <= 0 there is nothing to assign and ouVor is left untouched.
 */
void Vor(
    const int NbPoints,
    const int height,
    const int width,
    float * X,
    float * Y,
    int   * V,
    int   * const ouVor )
{
    if ( NbPoints <= 0 )
    {
        return;
    }

    for ( int y = 0; y < height; y++ )
    {
        for ( int x = 0; x < width; x++ )
        {
            /*
             * The search must restart for every pixel; carrying the minimum
             * over from the previous pixel would make later pixels inherit
             * a stale label.
             */
            float minDistance = FLT_MAX;
            int   Threshold   = V[ 0 ];

            for ( int i = 0; i < NbPoints; i++ )
            {
                const float Xd = X[ i ] - x;
                const float Yd = Y[ i ] - y;
                const float Distance = Xd * Xd + Yd * Yd;

                /* if this point is closer, take its value */
                if ( Distance < minDistance )
                {
                    minDistance = Distance;
                    Threshold   = V[ i ];
                }
            } /* i */

            /* one store per pixel, after all points have been examined */
            ouVor[ x + y * width ] = Threshold;
        } /* x */
    } /* y */
}

（Yd、theMin、SIMDTempDistance 也出现同样的错误）

我怎样才能克服这些问题

此外，我删除了if语句并使用_m128_gmin_ps来查找最小值。我的实现正确吗

--------------更新---------------

在 Sourav Ghosh 评论之后，我查找了头文件。我在任何地方都找不到 128 位的版本，所以我改用 256 位的版本，并使用了

#include <immintrin.h>

将几行更正为：

__m256 Distance = _mm256_load_ps( &intiDistance );

__m256 theMin = _mm256_min_ps( SIMDTempDistance[ i ] , &Distance );

把所有函数调用都改成 _mm256 而不是 _m256 之后，我只得到以下错误：

error: argument of type "int" is incompatible with parameter of type "__m256"
Xd = _mm256_sub_ps( theX[ i ] , x );
Yd = _mm256_sub_ps( theY[ i ] , y );

x和y是整数，用于循环中。我不知道如何克服这一点

-----更新----------------------

我想我明白了！我做了类型转换（cast）。我用的是：

现在，我的代码是：

/*
 * Vor (AVX attempt) - tries to vectorise the nearest-point search with
 * 256-bit vectors: for each pixel (x, y) it walks the point arrays X/Y as
 * __m256 vectors, keeps a running minimum of the squared distances and then
 * writes a result to ouVor at offset x + y * width.
 *
 * Parameters:
 *   NbOfPoints - upper bound of the inner loop and size of the scratch buffer
 *   height     - number of pixel rows
 *   width      - number of pixel columns
 *   X, Y, V    - per-point coordinates and values, reinterpreted below as
 *                arrays of 256-bit vectors
 *   ouVor      - output, indexed as x + y * width
 *
 * NOTE(review): this listing is kept exactly as posted. It has several
 * defects, each marked inline with NOTE(review).
 */
void Vor(

        const  int  NbOfPoints,
        const  int  height,
        const  int  width,
        float * restrict X,
        float * restrict Y,
        int   * restrict V,
        int   * restrict ouVor )
    {



       __m256 Xd , Yd;

       __m256i Threshold;
        int x , y; // pixel coordinates


        /* Scratch buffer of NbOfPoints floats, 64-byte aligned.
           NOTE(review): the result of _mm_malloc is not checked for NULL. */
        float * TempDistance = (float*) _mm_malloc( NbOfPoints * sizeof(*TempDistance) ,64 );

        /* View the float/int arrays as arrays of 8-lane vectors.
           NOTE(review): dereferencing these pointers is typically compiled as
           an aligned 32-byte access; the alignment of X, Y, V and ouVor is
           decided by the caller and is not visible here - TODO confirm, or
           use _mm256_loadu_ps / _mm256_loadu_si256 instead.
           NOTE(review): theVor is never used below. */
        __m256 * SIMDTempDistance = (__m256*) TempDistance;
        __m256 * theX = (__m256*) X;
        __m256 * theY = (__m256*) Y;
        __m256i * theV = (__m256i*) V;
        __m256i * theVor = (__m256i*) ouVor;


    /* NOTE(review): TempDistance and SIMDTempDistance are listed as private,
       so each thread gets its own uninitialised copy of these pointers
       instead of the buffer allocated above. ouVor also appears twice in the
       shared clause. */
    #pragma omp parallel for default( none ) shared( X , Y , V , ouVor ,height , width ,NbOfPoints ,ouVor ,theX,theY,theV ) private ( x,y,Xd,Yd,TempDistance ,Threshold,SIMDTempDistance ) collapse(2)  

    for ( y = 0; y < height; y++ )
    { 
        for ( x = 0; x < width; x++ )
        {
                /* running minimum, restarted for every pixel */
                float initDistance = FLT_MAX;
                __m256 Distance = _mm256_set1_ps( initDistance );

                /* NOTE(review): i counts up to NbOfPoints, but theX[ i ] and
                   theY[ i ] each cover 8 floats, so the loop touches
                   8 * NbOfPoints floats. If X and Y hold NbOfPoints floats
                   (presumably - confirm against the caller) this reads far
                   past the end of the arrays; the same applies to
                   SIMDTempDistance[ i ] and theV[ i ]. */
                for ( int i = 0; i < NbOfPoints; i++ )
                {
                    /* NOTE(review): _mm256_castsi256_ps only reinterprets the
                       integer bit pattern as float; it does not convert the
                       value. A numeric conversion would be
                       _mm256_cvtepi32_ps( ... ) or _mm256_set1_ps( (float) x ). */
                    __m256i xxIdx = _mm256_set1_epi32( x );
                    __m256  xIdx  = _mm256_castsi256_ps( xxIdx );

                    __m256i yyIdx = _mm256_set1_epi32( y );
                    __m256  yIdx  = _mm256_castsi256_ps( yyIdx );

                    /* NOTE(review): the AVX intrinsics are spelled
                       _mm256_sub_ps / _mm256_add_ps; the _m256_ spellings
                       used here are not the documented names. Xd * Xd on
                       __m256 relies on a compiler vector extension;
                       _mm256_mul_ps is the portable form. */
                    Xd = _m256_sub_ps( theX[ i ] , xIdx );
                    Yd = _m256_sub_ps( theY[ i ] , yIdx );
                    SIMDTempDistance[ i ] = _m256_add_ps( Xd * Xd , Yd * Yd );

                    /* NOTE(review): the documented minimum intrinsic is
                       _mm256_min_ps, and the first argument here is the
                       pointer SIMDTempDistance, not the vector
                       SIMDTempDistance[ i ]. */
                    __m256 theMin = _m256_gmin_ps( SIMDTempDistance , Distance );

                    /* NOTE(review): Threshold is overwritten on every
                       iteration regardless of which lane produced the
                       minimum, so it ends up as the last vector of V rather
                       than the value of the closest point. */
                    Distance = theMin;
                    Threshold = theV[ i ];

                    } /* i */

                    //write result
                    /* NOTE(review): Threshold is a __m256i (8 ints) while
                       *( ouVor + ... ) is a single int; a horizontal
                       reduction to one value is missing. */
                    *( ouVor + x + y * width ) = Threshold;

                } /* x */

            } /* y */


        _mm_free( TempDistance );

    }

没关系

但是运行代码会导致段错误（segmentation fault），

出错的位置是以下几行：

Xd = _m256_sub_ps( theX[ i ] , xIdx );
Yd = _m256_sub_ps( theY[ i ] , yIdx );

我认为，您缺少了一些必需的头文件，其中包含

_m128_sub_ps()

函数的前向声明。我们可以假设实际上

_m128_sub_ps()

函数的返回类型为

__m128

，但如果没有前向声明，编译器将假定

_m128_sub_ps()

函数的默认返回类型为

int

。这就是为什么编译器会发出

函数“_m128_sub_ps”隐式声明

的警告。然后，将

int

返回值赋给类型为

__m128

的变量，从而产生问题

编辑：

根据更改后的代码

int x , y; // pixel coordinates

应该是

__m256 x , y; // pixel coordinates

因为该函数要求两个参数的类型都是

__m256

。

您对内部函数（intrinsics）的名称有点混淆了

对于 128 位 SSE，函数名只是，例如：

_mm_sub_ps

而不是：

_mm128_sub_ps

（混淆可能是因为 256 位 AVX 的对应函数名是

_mm256_sub_ps

）

我使用了：

__m256 LX = _mm256_load_ps( &X[ i ] );
__m256 LY = _mm256_load_ps( &Y[ i ] );

而不是：

Xd = _m256_sub_ps( theX[ i ] , xIdx );
Yd = _m256_sub_ps( theY[ i ] , yIdx );

这很好

@George

_m128_sub_ps()

的函数原型是什么？它在吗？：我已经尝试过了，但是它在循环所在的行中给出了很多错误。例如，错误：不能为（y=0；y

Xd = _mm256_sub_ps( theX[i], (__m256) x ) 的方法绕过警告等等。然而，我不确定这是否是你想要的。请确认。它给出：不允许转换为类型“__m256”
__m256 x , y; // pixel coordinates

_mm_sub_ps

_mm128_sub_ps

__m256 LX = _mm256_load_ps( &X[ i ] );
__m256 LY = _mm256_load_ps( &Y[ i ] );

Xd = _m256_sub_ps( theX[ i ] , xIdx );
Yd = _m256_sub_ps( theY[ i ] , yIdx );