
CUDA Fortran MPI_Sendrecv segmentation fault


I am trying to run the code from "CUDA Fortran for Scientists and Engineers", but I do not understand why I get a segmentation fault:

[mpi_rank_0][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_1][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_2][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_3][error_sighandler] Caught error: Segmentation fault (signal 11)
My system is 64-bit Linux and I have the PGI compiler; the CUDA driver is 4.0. Below is the code I took from the book. I can compile it, but MPI_Sendrecv does not seem to work. I have MVAPICH2 1.8 installed. This command compiles the code:

/usr/.../mvapich/bin/mpif90 filename.cuf
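
The segmentation faults above come from a four-rank run; with MVAPICH2 such a job is typically launched along the lines of the following (the launcher path, hostfile, and executable name here are assumptions):

/usr/.../mvapich/bin/mpirun_rsh -np 4 -hostfile hosts ./a.out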
Edit (in response to the comments):

When I build with the -C option, compilation fails:

pgfortran-Info-Switch -Mvect -fast forces -O2
PGF90-S-0155-Kernel region ignored; see -Minfo messages  (transposeMVA.cuf: 188)
transposempi:
    140, Loop not vectorized/parallelized: contains call
    146, Loop not vectorized/parallelized: contains call
    157, Loop not vectorized/parallelized: contains call
    190, Accelerator restriction: function/procedure calls are not supported
         Loop not vectorized/parallelized: contains call
    191, Accelerator restriction: function/procedure calls are not supported
    217, all reduction inlined
         Loop not vectorized/parallelized: contains call
  0 inform,   0 warnings,   1 severes, 0 fatal for transposempi
When I remove the -C option the compilation goes through, but the result is still the same:

/mpif90 -g -O0 -Minfo transposeMVA.cuf
pgfortran-Info-Switch -Mvect -fast forces -O2
transposempi:
    140, Generated vector sse code for the loop
    146, Loop not vectorized: may not be beneficial
         Unrolled inner loop 8 times
    157, Memory copy idiom, loop replaced by call to __c_mcopy4
    178, Loop not vectorized/parallelized: contains call
    190, CUDA kernel generated
        190, !$cuf kernel do <<< (*,*), (128,1) >>>
    217, all reduction inlined
I would appreciate any help.

module transpose_m

  implicit none
  integer, parameter :: cudaTileDim = 32
  integer, parameter :: blockRows = 8

contains

  attributes(global) &
       subroutine cudaTranspose(odata, ldo, idata, ldi)
    real, intent(out) :: odata(ldo,*)
    real, intent(in) :: idata(ldi,*)
    integer, value, intent(in) :: ldo, ldi
    real, shared :: tile(cudaTileDim+1, cudaTileDim)
    integer :: x, y, j

    x = (blockIdx%x-1) * cudaTileDim + threadIdx%x
    y = (blockIdx%y-1) * cudaTileDim + threadIdx%y

    do j = 0, cudaTileDim-1, blockRows
       tile(threadIdx%x, threadIdx%y+j) = idata(x,y+j)
    end do

    call syncthreads()

    x = (blockIdx%y-1) * cudaTileDim + threadIdx%x
    y = (blockIdx%x-1) * cudaTileDim + threadIdx%y

    do j = 0, cudaTileDim-1, blockRows
       odata(x,y+j) = tile(threadIdx%y+j, threadIdx%x)          
    end do
  end subroutine cudaTranspose

end module transpose_m

!
! Main code
!

program transposeMPI
  use cudafor
  use mpi
  use transpose_m 

  implicit none

  ! global array size
  integer, parameter :: nx = 2048, ny = 2048

  ! host arrays (global)
  real :: h_idata(nx,ny), h_tdata(ny,nx), gold(ny,nx)

  ! CUDA vars and device arrays
  integer :: deviceID
  type (dim3) :: dimGrid, dimBlock
  real, device, allocatable :: &
       d_idata(:,:), d_tdata(:,:), d_sTile(:,:), d_rTile(:,:)

  ! MPI stuff
  integer :: mpiTileDimX, mpiTileDimY
  integer :: myrank, nprocs, tag, ierr, localRank
  integer :: nstages, stage, sRank, rRank
  integer :: status(MPI_STATUS_SIZE)
  real(8) :: timeStart, timeStop
  character (len=10) :: localRankStr

  integer :: i, j, nyl, jl, jg, p
  integer :: xOffset, yOffset

  ! for MVAPICH set device before MPI initialization

  call get_environment_variable('MV2_COMM_WORLD_LOCAL_RANK', &
       localRankStr)
  read(localRankStr,'(i10)') localRank
  ierr = cudaSetDevice(localRank)

  ! MPI initialization

  call MPI_init(ierr)
  call MPI_comm_rank(MPI_COMM_WORLD, myrank, ierr)
  call MPI_comm_size(MPI_COMM_WORLD, nProcs, ierr)

  ! check parameters and calculate execution configuration

  if (mod(nx,nProcs) == 0 .and. mod(ny,nProcs) == 0) then
     mpiTileDimX = nx/nProcs
     mpiTileDimY = ny/nProcs
  else
     write(*,*) 'nx and ny must be integral multiples of nProcs'
     call MPI_Finalize(ierr)
     stop
  endif

  if (mod(mpiTileDimX, cudaTileDim) /= 0 .or. &
       mod(mpiTileDimY, cudaTileDim) /= 0) then
     write(*,*) 'mpiTileDimX and mpitileDimY must be an ', &
          'integral multiple of cudaTileDim'
     call MPI_Finalize(ierr)
     stop
  end if

  if (mod(cudaTileDim, blockRows) /= 0) then
     write(*,*) 'cudaTileDim must be a multiple of blockRows'
     call MPI_Finalize(ierr)
     stop
  end if

  dimGrid = dim3(mpiTileDimX/cudaTileDim, &
       mpiTileDimY/cudaTileDim, 1)
  dimBlock = dim3(cudaTileDim, blockRows, 1)

  ! write parameters

  if (myrank == 0) then
     write(*,*)
     write(*,"(/,'Array size: ', i0,'x',i0,/)") nx, ny

     write(*,"('CUDA block size: ', i0,'x',i0, &
          ',  CUDA tile size: ', i0,'x',i0)") &
          cudaTileDim, blockRows, cudaTileDim, cudaTileDim

     write(*,"('dimGrid: ', i0,'x',i0,'x',i0, &
          ',   dimBlock: ', i0,'x',i0,'x',i0,/)") &
          dimGrid%x, dimGrid%y, dimGrid%z, &
          dimBlock%x, dimBlock%y, dimBlock%z

     write(*,"('nprocs: ', i0, ',  Local input array size: ', &
          i0,'x',i0)") nprocs, nx, mpiTileDimY
     write(*,"('mpiTileDim: ', i0,'x',i0,/)") &
          mpiTileDimX, mpiTileDimY
  endif

  ! initialize data

  ! host - each process has entire array on host (for now)

  do p = 0, nProcs-1
     do jl = 1, mpiTileDimY
        jg = p*mpiTileDimY + jl
        do i = 1, nx
           h_idata(i,jg) = i+(jg-1)*nx 
        enddo
     enddo
  enddo

  gold = transpose(h_idata)

  ! device - each process has 
  ! nx*mpiTileDimY = ny*mpiTileDimX  elements

  allocate(d_idata(nx, mpiTileDimY), &
       d_tdata(ny, mpiTileDimX), &
       d_sTile(mpiTileDimX,mpiTileDimY), &
       d_rTile(mpiTileDimX, mpiTileDimY))

  yOffset = myrank*mpiTileDimY
  d_idata(1:nx,1:mpiTileDimY) = &
       h_idata(1:nx,yOffset+1:yOffset+mpiTileDimY)

  d_tdata = -1.0


  ! ---------
  ! transpose
  ! ---------

  call MPI_BARRIER(MPI_COMM_WORLD, ierr)
  timeStart = MPI_Wtime()

  ! 0th stage - local transpose

  call cudaTranspose<<<dimGrid, dimBlock>>> &
       (d_tdata(myrank*mpiTileDimY+1,1), ny, &
       d_idata(myrank*mpiTileDimX+1,1), nx)

  ! other stages that involve MPI transfers

  do stage = 1, nProcs-1
     ! sRank = the rank to which myrank sends data
     ! rRank = the rank from which myrank receives data
     sRank = modulo(myrank-stage, nProcs) 
     rRank = modulo(myrank+stage, nProcs) 

     call MPI_BARRIER(MPI_COMM_WORLD, ierr)

     ! pack tile so data to be sent is contiguous

     !$cuf kernel do(2) <<<*,*>>>
     do j = 1, mpiTileDimY
        do i = 1, mpiTileDimX
           d_sTile(i,j) = d_idata(sRank*mpiTileDimX+i,j)
        enddo
     enddo

     call MPI_SENDRECV(d_sTile, mpiTileDimX*mpiTileDimY, &
          MPI_REAL, sRank, myrank, &
          d_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, &
          rRank, rRank, MPI_COMM_WORLD, status, ierr)

     ! do transpose from receive tile into final array 
     ! (no need to unpack)

     call cudaTranspose<<<dimGrid, dimBlock>>> &
          (d_tdata(rRank*mpiTileDimY+1,1), ny, &
          d_rTile, mpiTileDimX)

  end do ! stage     

  call MPI_BARRIER(MPI_COMM_WORLD, ierr)
  timeStop = MPI_Wtime()

  ! check results

  h_tdata = d_tdata

  xOffset = myrank*mpiTileDimX
  if (all(h_tdata(1:ny,1:mpiTileDimX) == &
       gold(1:ny, xOffset+1:xOffset+mpiTileDimX))) then
     if (myrank == 0) then
        write(*,"('Bandwidth (GB/s): ', f7.2,/)") &
             2.*(nx*ny*4)/(1.0e+9*(timeStop-timeStart)) 
     endif
  else
     write(*,"('[',i0,'] *** Failed ***',/)") myrank
  endif

  ! cleanup

  deallocate(d_idata, d_tdata, d_sTile, d_rTile)

  call MPI_Finalize(ierr)

end program transposeMPI
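The MPI_SENDRECV in the stage loop above passes the device arrays d_sTile and d_rTile directly to MPI. That only works with a CUDA-aware MPI library; for MVAPICH2 1.8 this means a build configured with CUDA support and a run with MV2_USE_CUDA=1. Without that, the library treats the device pointers as ordinary host addresses and the run crashes with exactly the kind of segmentation fault shown above. A workaround that needs no CUDA-aware MPI is to stage the tiles through host memory. A minimal sketch, assuming two extra host buffers added next to the existing declarations and allocated once the tile sizes are known:

  ! assumed host staging buffers, mirroring the device tiles
  real, allocatable :: h_sTile(:,:), h_rTile(:,:)

  ! after mpiTileDimX and mpiTileDimY have been computed
  allocate(h_sTile(mpiTileDimX,mpiTileDimY), &
       h_rTile(mpiTileDimX,mpiTileDimY))

The send/receive inside the stage loop then becomes: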
     ! copy the packed send tile from device to host
     h_sTile = d_sTile

     call MPI_SENDRECV(h_sTile, mpiTileDimX*mpiTileDimY, &
          MPI_REAL, sRank, myrank, &
          h_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, &
          rRank, rRank, MPI_COMM_WORLD, status, ierr)

     ! copy the received data back into the device buffer
     d_rTile = h_rTile
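
Alternatively, with an MVAPICH2 build that has CUDA support enabled, the original call that passes d_sTile and d_rTile straight to MPI_SENDRECV should work once device transfers are switched on at launch, for example (launcher path and hostfile are again assumptions):

/usr/.../mvapich/bin/mpirun_rsh -np 4 -hostfile hosts MV2_USE_CUDA=1 ./a.out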