C++ 在C+中转置矩阵的最快方法是什么+；？_C++_Algorithm_Matrix_Transpose

C++ 在C++中转置矩阵的最快方法是什么？

c++ algorithm matrix

C++ 在C+中转置矩阵的最快方法是什么+；？,c++,algorithm,matrix,transpose,C++,Algorithm,Matrix,Transpose,我有一个矩阵（相对较大），需要转置。例如，假设我的矩阵是 a b c d e f g h i j k l m n o p q r 我希望结果如下： a g m b h n c I o d j p e k q f l r 最快的方法是什么？这取决于您的应用程序，但通常转置矩阵的最快方法是在查找时反转坐标，然后，您不必实际移动任何数据。这取决于您的应用程序，但通常转置矩阵的最快方法是在查找时反转坐标，然后您不必实际移动任何数据。模板 template <class T> void

我有一个矩阵（相对较大），需要转置。例如，假设我的矩阵是

a b c d e f
g h i j k l
m n o p q r

我希望结果如下：

a g m
b h n
c i o
d j p
e k q
f l r

最快的方法是什么？

这取决于您的应用程序，但通常转置矩阵的最快方法是在查找时反转坐标，然后，您不必实际移动任何数据。

这取决于您的应用程序，但通常转置矩阵的最快方法是在查找时反转坐标，然后您不必实际移动任何数据。

模板
/// Copies the transpose of `a` into `b`.
///
/// For every i in [0, width) and j in [0, height), element a[i][j] is written
/// to b[j][i]. `b` is not resized here, so the caller must supply a `b` that
/// already has at least `height` rows of at least `width` elements each.
template <class T>
void transpose( const std::vector< std::vector<T> > & a,
std::vector< std::vector<T> > & b,
int width, int height)
{
    for (int src_row = 0; src_row < width; ++src_row)
    {
        const std::vector<T> & line = a[src_row];
        for (int src_col = 0; src_col < height; ++src_col)
        {
            // Swap the roles of the two indices when storing.
            b[src_col][src_row] = line[src_col];
        }
    }
}

无效转置（const std:：vector&a，
std:：vector&b，
整数宽度，整数高度）
{
对于（int i=0；i

模板
无效转置（const std:：vector&a，
std:：vector&b，
整数宽度，整数高度）
{
对于（int i=0；i

这是个好问题。有很多原因使您希望在内存中实际转置矩阵，而不仅仅是交换坐标，例如在矩阵乘法和高斯涂抹中
首先，让我列出一个用于转置的函数（编辑：请参阅我的答案末尾，在那里我找到了一个更快的解决方案）
这是英特尔公司的一篇论文，解释了这一点

最后，我在矩阵乘法（和高斯涂抹）中实际做的不是精确地取转置，而是在一定向量大小的宽度上取转置（例如，对于SSE/AVX为4或8）。这是我使用的函数
// Reorders A into B in strips that are vec_size elements wide.
//
// Output element `dst` is taken from A[M*row + strip + lane], where
//   lane  = dst % vec_size              (position inside the strip),
//   row   = (dst / vec_size) % N        (row inside the strip),
//   strip = vec_size * (dst/N/vec_size) (column offset of the strip).
// Exactly M*N elements of B are written; A is only read.
void reorder_matrix(const float* A, float* B, const int N, const int M, const int vec_size) {
    #pragma omp parallel for
    for (int dst = 0; dst < M*N; ++dst) {
        const int lane  = dst % vec_size;
        const int row   = (dst / vec_size) % N;
        const int strip = (dst / N / vec_size) * vec_size;
        B[dst] = A[M*row + strip + lane];
    }
}

值 lda 和 ldb 是矩阵的宽度。这些值需要是块大小的倍数。为了求出这两个值并为一个 3000x1001 的矩阵分配内存，我做了如下操作：
// Rounds x up to the next multiple of s. The mask `& -(s)` only yields a
// multiple of s when s is a power of two.
#define ROUND_UP(x, s) (((x)+((s)-1)) & -(s))
// Logical matrix dimensions used in this example.
const int n = 3000;
const int m = 1001;
// Padded widths: m and n rounded up to multiples of 16 (1008 and 3008).
int lda = ROUND_UP(m, 16);
int ldb = ROUND_UP(n, 16);

// Both buffers are requested with 64-byte alignment and hold lda*ldb floats.
// NOTE(review): memory from _mm_malloc is expected to be released with
// _mm_free; no release is shown in this snippet.
float *A = (float*)_mm_malloc(sizeof(float)*lda*ldb, 64);
float *B = (float*)_mm_malloc(sizeof(float)*lda*ldb, 64);

对于 3000x1001 的矩阵，这会得到 ldb=3008 和 lda=1008。

编辑：
我发现了一个使用SSE intrinsics的更快的解决方案：
// Transposes one 4x4 tile of floats from A into B using SSE registers.
//
// Four rows are read from A (consecutive rows are lda floats apart),
// transposed in registers, and written as four rows of B (ldb floats apart).
// Aligned load/store intrinsics are used, so every row address touched in A
// and B must be 16-byte aligned.
inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb) {
    __m128 r0 = _mm_load_ps(A);
    __m128 r1 = _mm_load_ps(A + lda);
    __m128 r2 = _mm_load_ps(A + 2*lda);
    __m128 r3 = _mm_load_ps(A + 3*lda);

    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);

    _mm_store_ps(B, r0);
    _mm_store_ps(B + ldb, r1);
    _mm_store_ps(B + 2*ldb, r2);
    _mm_store_ps(B + 3*ldb, r3);
}

// Transposes the n x m matrix A (row stride lda) into B (row stride ldb),
// walking it in block_size x block_size blocks and handing each 4x4 tile
// inside a block to transpose4x4_SSE. The outer block-row loop is an OpenMP
// parallel loop.
//
// Tiles always advance in steps of 4, so tiles at the right/bottom edge can
// extend past n or m; the buffers must be padded accordingly.
inline void transpose_block_SSE4x4(float *A, float *B, const int n, const int m, const int lda, const int ldb ,const int block_size) {
    #pragma omp parallel for
    for (int row0 = 0; row0 < n; row0 += block_size) {
        // Clamp the block's row range to the matrix.
        int row_end = row0 + block_size;
        if (row_end > n) row_end = n;

        for (int col0 = 0; col0 < m; col0 += block_size) {
            // Clamp the block's column range to the matrix.
            int col_end = col0 + block_size;
            if (col_end > m) col_end = m;

            for (int r = row0; r < row_end; r += 4) {
                for (int c = col0; c < col_end; c += 4) {
                    float *src_tile = &A[r*lda + c];
                    float *dst_tile = &B[c*ldb + r];
                    transpose4x4_SSE(src_tile, dst_tile, lda, ldb);
                }
            }
        }
    }
}

inline void transpose4x4_SSE（float*A，float*B，const int lda，const int ldb）{
__m128第1行=_mm_load_ps（&A[0*lda]）；
__m128第2行=_mm_load_ps（&A[1*lda]）；
__m128第3行=_mm_load_ps（&A[2*lda]）；
__m128第4行=_mm_load_ps（&A[3*lda]）；
_MM_TRANSPOSE4_PS（第1行、第2行、第3行、第4行）；
_mm_商店（第1行为&B[0*ldb]）；
_mm_商店（和B[1*ldb]，第2行）；
_mm_store_ps（&B[2*ldb]，第3行）；
_mm_store_ps（&B[3*ldb]，第4行）；
}
内联无效转置块（浮点*A、浮点*B、常数整数n、常数整数m、常数整数lda、常数整数ldb、常数整数块大小）{
#pragma-omp并行
对于（int i=0；i这是一个很好的问题。有很多原因可以让你在内存中转置矩阵，而不仅仅是交换坐标，例如在矩阵乘法和高斯涂抹中
首先，让我列出一个用于转置的函数（编辑：请参阅我的答案末尾，在那里我找到了一个更快的解决方案）
这是英特尔公司的一篇论文，解释了这一点

最后，我在矩阵乘法（和高斯涂抹）中实际做的不是精确地取转置，而是在一定向量大小的宽度上取转置（例如，对于SSE/AVX为4或8）
// Builds B from A by regrouping the data into column strips of vec_size
// elements: each output index `out` is decomposed into a strip offset, a row
// within the strip and a lane within the vector, and the matching element
// A[M*row + strip_offset + lane] is copied. M*N elements are produced.
void reorder_matrix(const float* A, float* B, const int N, const int M, const int vec_size) {
    #pragma omp parallel for
    for (int out = 0; out < M*N; ++out) {
        const int vec_index    = out / vec_size;          // which vector overall
        const int lane         = out % vec_size;          // slot inside the vector
        const int row          = vec_index % N;
        const int strip_offset = vec_size * (out / N / vec_size);
        B[out] = A[M*row + strip_offset + lane];
    }
}

值 lda 和 ldb 是矩阵的宽度。这些值需要是块大小的倍数。为了求出这两个值并为一个 3000x1001 的矩阵分配内存，我执行以下操作：
// Rounds x up to the next multiple of s. The mask `& -(s)` only yields a
// multiple of s when s is a power of two.
#define ROUND_UP(x, s) (((x)+((s)-1)) & -(s))
// Logical matrix dimensions used in this example.
const int n = 3000;
const int m = 1001;
// Padded widths: m and n rounded up to multiples of 16 (1008 and 3008).
int lda = ROUND_UP(m, 16);
int ldb = ROUND_UP(n, 16);

// Both buffers are requested with 64-byte alignment and hold lda*ldb floats.
// NOTE(review): memory from _mm_malloc is expected to be released with
// _mm_free; no release is shown in this snippet.
float *A = (float*)_mm_malloc(sizeof(float)*lda*ldb, 64);
float *B = (float*)_mm_malloc(sizeof(float)*lda*ldb, 64);

对于 3000x1001 的矩阵，这会得到 ldb=3008 和 lda=1008。

编辑：
我发现了一个使用SSE intrinsics的更快的解决方案：
// In-register transpose of a 4x4 float tile.
//
// Reads rows 0..3 of the tile at A (rows lda floats apart), transposes them
// with _MM_TRANSPOSE4_PS, and writes rows 0..3 of the tile at B (rows ldb
// floats apart). The aligned load/store forms require 16-byte aligned rows.
inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb) {
    __m128 tile[4];
    for (int r = 0; r < 4; ++r) {
        tile[r] = _mm_load_ps(&A[r*lda]);
    }

    _MM_TRANSPOSE4_PS(tile[0], tile[1], tile[2], tile[3]);

    for (int r = 0; r < 4; ++r) {
        _mm_store_ps(&B[r*ldb], tile[r]);
    }
}

// Blocked transpose of the n x m matrix A into B.
//
// The matrix is traversed in block_size x block_size blocks (block rows are
// distributed by the OpenMP parallel-for), and each block is transposed tile
// by tile via transpose4x4_SSE. lda and ldb are the row strides of A and B.
// Tiles step by 4 in both directions, so edge tiles may reach beyond n or m.
inline void transpose_block_SSE4x4(float *A, float *B, const int n, const int m, const int lda, const int ldb ,const int block_size) {
    #pragma omp parallel for
    for (int bi = 0; bi < n; bi += block_size) {
        for (int bj = 0; bj < m; bj += block_size) {
            // End of this block, clipped to the matrix bounds.
            int i_limit = n;
            if (bi + block_size < n) i_limit = bi + block_size;
            int j_limit = m;
            if (bj + block_size < m) j_limit = bj + block_size;

            for (int ti = bi; ti < i_limit; ti += 4) {
                for (int tj = bj; tj < j_limit; tj += 4) {
                    transpose4x4_SSE(&A[ti*lda + tj], &B[tj*ldb + ti], lda, ldb);
                }
            }
        }
    }
}

inline void transpose4x4_SSE（float*A，float*B，const int lda，const int ldb）{
__m128第1行=_mm_load_ps（&A[0*lda]）；
__m128第2行=_mm_load_ps（&A[1*lda]）；
__m128第3行=_mm_load_ps（&A[2*lda]）；
__m128第4行=_mm_load_ps（&A[3*lda]）；
_MM_TRANSPOSE4_PS（第1行、第2行、第3行、第4行）；
_mm_商店（第1行为&B[0*ldb]）；
_mm_商店（和B[1*ldb]，第2行）；
_mm_store_ps（&B[2*ldb]，第3行）；
_mm_store_ps（&B[3*ldb]，第4行）；
}
内联无效转置块（浮点*A、浮点*B、常数整数n、常数整数m、常数整数lda、常数整数ldb、常数整数块大小）{
#pragma-omp并行
对于（int i=0；i将每一行视为一列，每一列视为一行。）使用j，i代替i，j
演示：
#包括
使用名称空间std；
int main（）
{
字符A[3][3]=
{
{'a'，'b'，'c'}，
{'d'，'e'，'f'}，
{'g'，'h'，'i'}
};
把每一行看作一列，每一列看作一行。用j，i代替i，j
演示：
#包括
使用名称空间std；
int main（）
{
字符A[3][3]=
{
{'a'，'b'，'c'}，
{'d'，'e'，'f'}，
{'g'，'h'，'i'}
};
cout我认为最快的方法不应该取高于O（n^2）的值，这样你就可以只使用O（1）空间：

这样做的方法是成对交换，因为当你转置一个矩阵时，你要做的是：M[i][j]=M[j][i]，所以将M[i][j]存储在temp中，然后M[i][j]=M[j][i]，最后一步是：M[j][i]=temp。这可以通过一个过程完成，所以它应该需要O（n^2）
我认为最快的方法的时间复杂度不应超过 O（n^2），并且通过这种方式，您只需要使用 O（1） 的额外空间：

这样做的方法是成对交换，因为当你转置一个矩阵时，你要做的是：M[i][j]=M[j][i]，所以先将M[i][j]存储在temp中，然后令M[i][j]=M[j][i]，最后一步是：M[j][i]=temp。这可以通过一次遍历完成，所以需要O（n^2）的时间
我的答案是对3x3矩阵的转置
 #include <iostream>

#include <math.h>


// Reads a 3x3 integer matrix from standard input, echoes it, and then prints
// its transpose by swapping the row and column indices while printing (the
// stored matrix itself is not modified).
//
// Returns 0 on success and 1 if any entry cannot be read as an integer.
int main()
{
    int a[3][3] = {};
    std::cout << "You must give us an array 3x3 and then we will give you Transposed it " << std::endl;
    for (int i = 0; i < 3; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            std::cout << "Enter a[" << i << "][" << j << "]: ";

            // Stop on malformed input instead of printing unread values.
            if (!(std::cin >> a[i][j]))
            {
                std::cerr << "Invalid input: expected an integer." << std::endl;
                return 1;
            }
        }
    }
    std::cout << "Matrix you entered is :" << std::endl;

    for (int e = 0; e < 3; e++)
    {
        for (int f = 0; f < 3; f++)
            std::cout << a[e][f] << "\t";

        std::cout << std::endl;
    }

    std::cout << "\nTransposed of matrix you entered is :" << std::endl;
    for (int c = 0; c < 3; c++)
    {
        // Column c of the input becomes row c of the output.
        for (int d = 0; d < 3; d++)
            std::cout << a[d][c] << "\t";

        std::cout << std::endl;
    }

    return 0;
}

#包括
#包括
main（）
{
INTA[3][3]；
int b[3]；
// Transposes a single 4x4 block of floats with SSE.
//
// Source rows start at A, A+lda, A+2*lda and A+3*lda; destination rows start
// at B, B+ldb, B+2*ldb and B+3*ldb. All eight row addresses must be 16-byte
// aligned because aligned loads and stores are used.
inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb) {
    const float *src0 = A;
    const float *src1 = src0 + lda;
    const float *src2 = src1 + lda;
    const float *src3 = src2 + lda;

    __m128 v0 = _mm_load_ps(src0);
    __m128 v1 = _mm_load_ps(src1);
    __m128 v2 = _mm_load_ps(src2);
    __m128 v3 = _mm_load_ps(src3);

    _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

    float *dst = B;
    _mm_store_ps(dst, v0);
    dst += ldb;
    _mm_store_ps(dst, v1);
    dst += ldb;
    _mm_store_ps(dst, v2);
    dst += ldb;
    _mm_store_ps(dst, v3);
}

// Cache-blocked transpose: writes the transpose of the n x m matrix A (row
// stride lda) into B (row stride ldb). Work is split into square blocks of
// block_size, parallelised over block rows with OpenMP, and every block is
// processed as a grid of 4x4 tiles through transpose4x4_SSE. Because tiles
// advance by 4, tiles on the matrix edge may cover elements beyond n or m.
inline void transpose_block_SSE4x4(float *A, float *B, const int n, const int m, const int lda, const int ldb ,const int block_size) {
    #pragma omp parallel for
    for (int block_row = 0; block_row < n; block_row += block_size) {
        // Last row (exclusive) of this block, clipped to n.
        const int row_stop = (block_row + block_size < n) ? (block_row + block_size) : n;

        for (int block_col = 0; block_col < m; block_col += block_size) {
            // Last column (exclusive) of this block, clipped to m.
            const int col_stop = (block_col + block_size < m) ? (block_col + block_size) : m;

            for (int row = block_row; row < row_stop; row += 4) {
                for (int col = block_col; col < col_stop; col += 4) {
                    transpose4x4_SSE(&A[row*lda + col], &B[col*ldb + row], lda, ldb);
                }
            }
        }
    }
}

#include <iostream> 
using namespace std;

// Demonstrates "transposing" by index swap: prints a fixed 3x3 character
// matrix, then prints it again reading A[col][row] instead of A[row][col].
// The matrix itself is never modified.
int main ()
{
    const int kSize = 3;
    char A [kSize][kSize] =
    {
        { 'a', 'b', 'c' },
        { 'd', 'e', 'f' },
        { 'g', 'h', 'i' }
    };

    // Original orientation.
    cout << "A = " << endl << endl;
    for (int row = 0; row < kSize; ++row)
    {
        for (int col = 0; col < kSize; ++col)
        {
            cout << A[row][col];
        }
        cout << endl;
    }

    // Transposed orientation: the two indices trade places on lookup.
    cout << endl << "A transpose = " << endl << endl;
    for (int row = 0; row < kSize; ++row)
    {
        for (int col = 0; col < kSize; ++col)
        {
            cout << A[col][row];
        }
        cout << endl;
    }

    return 0;
}

 #include <iostream>

#include <math.h>


// Reads a 3x3 integer matrix from standard input, echoes it, and then prints
// its transpose by swapping the row and column indices while printing (the
// stored matrix itself is not modified).
//
// Returns 0 on success and 1 if any entry cannot be read as an integer.
int main()
{
    int a[3][3] = {};
    std::cout << "You must give us an array 3x3 and then we will give you Transposed it " << std::endl;
    for (int i = 0; i < 3; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            std::cout << "Enter a[" << i << "][" << j << "]: ";

            // Stop on malformed input instead of printing unread values.
            if (!(std::cin >> a[i][j]))
            {
                std::cerr << "Invalid input: expected an integer." << std::endl;
                return 1;
            }
        }
    }
    std::cout << "Matrix you entered is :" << std::endl;

    for (int e = 0; e < 3; e++)
    {
        for (int f = 0; f < 3; f++)
            std::cout << a[e][f] << "\t";

        std::cout << std::endl;
    }

    std::cout << "\nTransposed of matrix you entered is :" << std::endl;
    for (int c = 0; c < 3; c++)
    {
        // Column c of the input becomes row c of the output.
        for (int d = 0; d < 3; d++)
            std::cout << a[d][c] << "\t";

        std::cout << std::endl;
    }

    return 0;
}

// A matrix of doubles that can be "transposed" in O(1): transpose() does not
// move any data, it only switches which index formula get() uses and swaps
// the two dimensions.
//
// In normal mode element (i, j) lives at data[i*M + j]; in transposed mode
// get(i, j) reads data[j*N + i] using the (already swapped) N. The buffer
// holds M*N zero-initialised elements and is owned by the object.
class Matrix{
   double *data; // owned buffer of M*N elements, released in the destructor
   double _get1(int i, int j){return data[i*M+j];} //used to access normally
   double _get2(int i, int j){return data[j*N+i];} //used when transposed

   // Copying is disabled: a member-wise copy would make two objects delete
   // the same buffer. Declared but intentionally not defined.
   Matrix(const Matrix&);
   Matrix& operator=(const Matrix&);

   public:
   int M, N; //dimensions
   // Pointer to the accessor currently in effect. It has to be a
   // pointer-to-member, because _get1/_get2 are non-static member functions.
   double (Matrix::*get_p)(int, int);

   // Allocates m*n zero-initialised elements and starts in normal mode.
   // Both dimensions are expected to be non-negative.
   Matrix(int m,int n):data(new double[m*n]()), M(m), N(n), get_p(&Matrix::_get1){}

   ~Matrix(){ delete[] data; }

   // Returns element (i, j) in the current orientation. No bounds checking.
   double get(int i, int j){
     return (this->*get_p)(i,j);
    }

   // Stores value at element (i, j) in the current orientation, using the
   // same index formula as get(). No bounds checking.
   void set(int i, int j, double value){
     if(get_p==&Matrix::_get1) data[i*M+j]=value;
     else data[j*N+i]=value;
    }

   // Toggles between normal and transposed access and swaps the dimensions;
   // calling it twice restores the original view.
   void transpose(){
     if(get_p==&Matrix::_get1) get_p=&Matrix::_get2;
     else get_p=&Matrix::_get1;
     const int previous_M = M;
     M = N;
     N = previous_M;
     }
};

// Usage sketch for the Matrix class: read an element, flip the matrix with
// transpose(), then read with the same indices again. These statements are
// meant to be placed inside a function body.
Matrix M(100,200);
double x=M.get(17,45);
M.transpose();
x=M.get(17,45); // = original M(45,17)

// 4x4 float transpose of rows r0..r3 using unpack + shuffle.
// NOTE(review): r0..r3 and t0..t3 are presumably __m128 values declared by
// the surrounding code; their declarations are not part of this fragment.

// Step 1: interleave pairs of rows.
//   t0 = (r0[0], r1[0], r0[1], r1[1])   t1 = (r0[2], r1[2], r0[3], r1[3])
//   t2 = (r2[0], r3[0], r2[1], r3[1])   t3 = (r2[2], r3[2], r2[3], r3[3])
t0 = _mm_unpacklo_ps(r0, r1);
t1 = _mm_unpackhi_ps(r0, r1);
t2 = _mm_unpacklo_ps(r2, r3);
t3 = _mm_unpackhi_ps(r2, r3);

// Step 2: 0x44 joins the low halves of both operands and 0xEE joins the high
// halves, so r0..r3 end up holding columns 0..3 of the original rows.
r0 = _mm_shuffle_ps(t0,t2, 0x44);
r1 = _mm_shuffle_ps(t0,t2, 0xEE);
r2 = _mm_shuffle_ps(t1,t3, 0x44);
r3 = _mm_shuffle_ps(t1,t3, 0xEE);

// 4x4 float transpose of rows r0..r3 using unpack, two shuffles and four
// blends (_mm_blend_ps is an SSE4.1 intrinsic).
// NOTE(review): r0..r3, t0..t3 and v are presumably __m128 values declared by
// the surrounding code; their declarations are not part of this fragment.

// Step 1: interleave pairs of rows.
//   t0 = (r0[0], r1[0], r0[1], r1[1])   t1 = (r0[2], r1[2], r0[3], r1[3])
//   t2 = (r2[0], r3[0], r2[1], r3[1])   t3 = (r2[2], r3[2], r2[3], r3[3])
t0 = _mm_unpacklo_ps(r0, r1);
t1 = _mm_unpackhi_ps(r0, r1);
t2 = _mm_unpacklo_ps(r2, r3);
t3 = _mm_unpackhi_ps(r2, r3);

// Step 2: v = (high half of t0, low half of t2). Blending with mask 0xC takes
// the upper two lanes from v; mask 0x3 takes the lower two lanes from v.
// Result: r0 and r1 hold columns 0 and 1 of the original rows.
v  = _mm_shuffle_ps(t0,t2, 0x4E);
r0 = _mm_blend_ps(t0,v, 0xC);
r1 = _mm_blend_ps(t2,v, 0x3);
// Same pattern on t1/t3 yields columns 2 and 3.
v  = _mm_shuffle_ps(t1,t3, 0x4E);
r2 = _mm_blend_ps(t1,v, 0xC);
r3 = _mm_blend_ps(t3,v, 0x3);

#include <bits/stdc++.h>
using namespace std;

// Overlays a 2x3 and a 3x2 int array on the same six ints of storage.
// NOTE(review): both members share one row-major byte layout, so reading
// `brr` after writing `arr` re-groups the same sequence into rows of two; it
// does not transpose. Reading a union member other than the one last written
// is also undefined behaviour in C++.
union ua{
    int arr[2][3];
    int brr[3][2];
};

// Prints the transpose of a fixed 2x3 matrix, one row per line with a space
// after every element.
//
// The transpose is built element by element. Reinterpreting the 2x3 storage
// as 3x2 through a union only re-groups the same row-major sequence
// (1 2 / 3 4 / 5 6) and relies on reading an inactive union member, so it is
// not used here.
int main() {
    const int rows = 2;
    const int cols = 3;
    const int karr[rows][cols] = {{1,2,3},{4,5,6}};

    int transposed[cols][rows];
    for (int r = 0; r < rows; r++)
        for (int c = 0; c < cols; c++)
            transposed[c][r] = karr[r][c];

    for (int i = 0; i < cols; i++)
    {
        for (int j = 0; j < rows; j++)
            cout<<transposed[i][j]<<" ";
        cout<<'\n';
    }

    return 0;
}

#include <mkl.h>

// Transposes, in place, the row-major matrix `a` with n rows and m columns
// using Intel MKL's mkl_simatcopy (scale factor 1.0, i.e. a pure transpose).
//
// After the call `a` holds the m x n transpose, still in row-major order.
// A null pointer or a non-positive dimension is treated as a no-op.
void transpose( float* a, int n, int m ) {
    if (!a || n <= 0 || m <= 0) return;

    const char row_major = 'R';
    const char transpose = 'T';
    const float alpha = 1.0f;
    // For row-major storage the leading dimension is the row length: the
    // source rows hold m elements (lda = m), the transposed rows hold n
    // elements (ldb = n). Passing n for both is only right when n == m.
    mkl_simatcopy (row_major, transpose, n, m, alpha, a, m, n);
}

#include <armadillo>

// Transposes `matrix` by delegating to Armadillo's arma::inplace_trans; the
// argument is modified through the reference and nothing is returned.
void transpose( arma::mat &matrix ) {
    arma::inplace_trans(matrix);
}