C++ vs. Fortran parallel matrix multiplication speed difference with loop tiling

I cannot understand why my C++ matrix multiplication runs about three times slower than the corresponding Fortran code when run in parallel, while the serial versions take roughly the same time. This is the Fortran code:

program scheduling
    !$ use omp_lib
    implicit none
    integer :: i,j,k,y,x,z
    integer, parameter :: tile = 8, N = 1000
    ! data must be allocated on the heap, otherwise OpenMP would allocate it on the
    ! stack and a stack overflow would occur for larger matrices.
    double precision, dimension(:), allocatable :: a,b,c,d
    double precision :: E,S

    allocate(a(N*N))
    allocate(b(N*N))
    allocate(c(N*N))
    allocate(d(N*N))

    call random_seed()
    call random_number(b)
    call random_number(a)
    c = 0.0d0                ! the accumulator must start at zero

    ! transpose b
    do i = 1,N
        do j = 1,N
            d((i-1)*N+j) = b((j-1)*N+i)
        end do
    end do

    S = omp_get_wtime()
    !$OMP PARALLEL DO SHARED(a,d,c) PRIVATE(i,j,k,x,y,z) SCHEDULE(static)
    do i = 1,N,tile
        do j = 1,N,tile
            do k = 1,N,tile
                do x = i, min(i+tile-1,N)
                    do y = j, min(j+tile-1,N)
                        do z = k, min(k+tile-1,N)
                            c((x-1)*N+y) = c((x-1)*N+y) + a((x-1)*N+z) * d(z+(y-1)*N)
                        enddo
                    enddo
                enddo
            enddo
        enddo
    enddo
    !$OMP END PARALLEL DO
    E = omp_get_wtime()
    print*, (E-S)

    ! Deallocation of memory
    deallocate(a)
    deallocate(b)
    deallocate(c)
    deallocate(d)
end program scheduling

which I compile and run with:

$ gfortran -O3 -fopenmp scheduling.f08 -o scheduling
$ ./scheduling
0.901.... !for the parallel version and
1.3496... !for the serial version
(which, by the way, is slower than the indexed version using a(i,j))

And the C++ code:

#include <iostream>
#include <cmath>
#include <omp.h>
#include <cstdlib>

int main(int argc, char *argv[])
{
    int i,j,k,x,y,z;
    const int N = 1000;
    double* a = new double[N*N];
    double* b = new double[N*N];
    double* c = new double[N*N];
    double* d = new double[N*N];

    int tile = 8; 

    for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
            a[i*N+j] = rand()%1000;
            b[i*N+j] = rand()%1000;
            c[i*N+j] = 0.0;          // the accumulator must start at zero
        }
    }
    // transpose
    for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
            d[i*N+j] = b[i+j*N];
        }   
    }
    double start = omp_get_wtime();

//#pragma omp parallel for shared(a,c,d) private(i,j,k,x,y,z) schedule(static)
    for( i = 0; i < N; i+=tile){
        for( j = 0; j < N; j+=tile){
            for( k = 0; k < N; k+=tile){
                for( x = i; x < std::min(i+tile,N); x++){
                    for( y = j; y < std::min(j+tile,N); y++){
                        for( z = k; z < std::min(k+tile,N); z++){
                            c[x*N+y] = c[x*N+y] +  a[x*N+z] * d[z+y*N];
                        }   
                    }     
                }
            }   
        }   
    } 

    double end = omp_get_wtime();
    std::cout << (end-start) << std::endl;

    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;

    return 0;

}

$ g++ -O3 -fopenmp parallel.cpp -o parallel
$ ./parallel
2.347... //for the parallel version and 
1.47...  //for the serial one

You need to specify the number of threads explicitly. Here is what I changed:

... ceteris paribus ...
omp_set_num_threads(2);
#pragma omp parallel for shared(a,c,d) private(i,j,k,x,y,z)
... 

$ ./MM
0.98...
This is a bit faster than the serial version, and at least it is no longer much slower. Hope it helps.
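For completeness, here is a minimal, self-contained sketch (not the code from the question) of the three usual ways to control the OpenMP thread count: the num_threads clause on the directive, an omp_set_num_threads() call, and the OMP_NUM_THREADS environment variable. The tiny loop below is only a stand-in for the tiled multiplication above.

#include <iostream>
#include <omp.h>

int main()
{
    // 1) Library call: applies to parallel regions encountered afterwards.
    omp_set_num_threads(2);

    // 2) Clause on the directive: takes precedence over the call above,
    //    but only for this particular region.
    #pragma omp parallel for num_threads(2)
    for (int i = 0; i < 4; i++) {
        if (i == 0)   // only one iteration prints, so the output is not interleaved
            std::cout << "threads in team: " << omp_get_num_threads() << std::endl;
    }

    // 3) Environment variable, checked at program start-up:
    //    $ OMP_NUM_THREADS=2 ./parallel
    return 0;
}

Compiled with g++ -O3 -fopenmp like the code above, any one of the three requests a team of two threads; if they disagree, the num_threads clause takes precedence for that region.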


My guess is that the Fortran compiler is able to vectorize the code in the presence of the OpenMP directive, while the C++ compiler is not. That is easy to check: try running the OpenMP version with a single thread (via the OMP_NUM_THREADS environment variable).

Done. With one thread it takes 4.953 s. Any ideas about which directives to use?

What is your GCC version?

I played with your C++ code using GCC 4.8.2 on Ubuntu 14.04. The problem is reproducible: the code does get slower with omp parallel for. I used -fopt-info-vec-optimized and -fopt-info-vec-missed to get a vectorization report; if these options work correctly, the loops are not vectorized in either version. So some optimization is clearly being missed, but it is not clear which one.

Thanks a lot, you have been a great help; at least now I have somewhere to start looking. Interestingly, though, I run into something similar even without omp.

Why do you think that is necessary? Do you have OMP_NUM_THREADS set in your shell?

I tried it and it is faster. No, I do not have it set in my shell. Honestly, I do not know why; I can only guess.
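As a concrete illustration of the checks suggested in the comments above (a sketch only; the -fopt-info options assume a GCC version recent enough to support them, and the file and executable names are the ones used earlier):

$ g++ -O3 -fopenmp -fopt-info-vec-missed parallel.cpp -o parallel   # report loops that failed to vectorize
$ OMP_NUM_THREADS=1 ./parallel                                      # time the OpenMP build restricted to one thread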