Cuda Fortran MPI_Sendrecv分段错误
我正在尝试运行《CUDA Fortran for Scientists and Engineers》一书中的代码,但遇到了分段错误(Segmentation fault),我不明白原因:
[mpi_rank_0][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_1][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_2][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_3][error_sighandler] Caught error: Segmentation fault (signal 11)
我的系统是64 linux,我有PGI编译器。cuda驱动程序是4.0以下是我从书中获取的代码。我可以编译,但似乎MPI_Sendrecv不起作用。
我已经安装了MVAPICH2.1.8。
此命令复制了代码
/usr/.../mvapich/bin/mpif90 filename.cuf
根据您的评论进行编辑
当我使用-C选项构建时,编译失败
pgfortran-Info-Switch -Mvect -fast forces -O2
PGF90-S-0155-Kernel region ignored; see -Minfo messages (transposeMVA.cuf: 188)
transposempi:
140, Loop not vectorized/parallelized: contains call
146, Loop not vectorized/parallelized: contains call
157, Loop not vectorized/parallelized: contains call
190, Accelerator restriction: function/procedure calls are not supported
Loop not vectorized/parallelized: contains call
191, Accelerator restriction: function/procedure calls are not supported
217, all reduction inlined
Loop not vectorized/parallelized: contains call
0 inform, 0 warnings, 1 severes, 0 fatal for transposempi
当我删除-C选项时,编译通过,但结果仍然相同
/mpif90 -g -O0 -Minfo transposeMVA.cuf pgfortran-Info-Switch -Mvect -fast forces -O2
transposempi:
140, Generated vector sse code for the loop
146, Loop not vectorized: may not be beneficial
Unrolled inner loop 8 times
157, Memory copy idiom, loop replaced by call to __c_mcopy4
178, Loop not vectorized/parallelized: contains call
190, CUDA kernel generated
190, !$cuf kernel do <<< (*,*), (128,1) >>>
217, all reduction inlined
/mpif90-g-O0-Minfo transposeMVA.cuf pgfortran信息开关-Mvect-fast forces-O2
transposempi:
140,为循环生成的矢量sse代码
146,循环未矢量化:可能没有好处
将内环展开8次
157,内存复制习惯用法,循环替换为对u c_mcopy4的调用
178,循环未矢量化/并行化:包含调用
190,CUDA内核生成
190, !$cuf kernel do <<< (*,*), (128,1) >>>
217,所有还原都是内联的
我将感谢任何帮助
! Device-side tiled matrix transpose.
! cudaTileDim x cudaTileDim tiles are staged through shared memory;
! each thread block is cudaTileDim x blockRows threads, so every
! thread handles cudaTileDim/blockRows elements of its tile column.
module transpose_m
implicit none
! Tile edge length; must match the launch configuration built by the caller.
integer, parameter :: cudaTileDim = 32
! Thread-block height; cudaTileDim must be a multiple of blockRows
! (the caller checks this).
integer, parameter :: blockRows = 8
contains
! Transpose a cudaTileDim-aligned region of idata into odata.
!   odata : output array, leading dimension ldo
!   idata : input array, leading dimension ldi
! ldo/ldi are passed by value so the kernel receives plain integers.
attributes(global) &
subroutine cudaTranspose(odata, ldo, idata, ldi)
real, intent(out) :: odata(ldo,*)
real, intent(in) :: idata(ldi,*)
integer, value, intent(in) :: ldo, ldi
! +1 pad on the leading dimension — presumably the usual shared-memory
! bank-conflict-avoidance trick for transposes (NOTE: review).
real, shared :: tile(cudaTileDim+1, cudaTileDim)
integer :: x, y, j
! Global coordinates of this thread's first input element.
x = (blockIdx%x-1) * cudaTileDim + threadIdx%x
y = (blockIdx%y-1) * cudaTileDim + threadIdx%y
! Load the tile: coalesced reads along x, stepping blockRows in y.
do j = 0, cudaTileDim-1, blockRows
tile(threadIdx%x, threadIdx%y+j) = idata(x,y+j)
end do
! All loads must finish before any thread reads transposed entries.
call syncthreads()
! Swap block indices: the output tile is the mirror of the input tile.
x = (blockIdx%y-1) * cudaTileDim + threadIdx%x
y = (blockIdx%x-1) * cudaTileDim + threadIdx%y
! Store transposed data: reads from shared memory with swapped thread
! indices, writes to global memory coalesced along x.
do j = 0, cudaTileDim-1, blockRows
odata(x,y+j) = tile(threadIdx%y+j, threadIdx%x)
end do
end subroutine cudaTranspose
end module transpose_m
!
! Main code
!
!
! Distributed matrix transpose: each rank holds nx x mpiTileDimY rows of
! the global nx x ny input and produces ny x mpiTileDimX columns of the
! transposed output.  Local transposes run on the GPU (cudaTranspose);
! off-diagonal tiles are exchanged with MPI_SENDRECV.
!
! Fixes vs. the book listing:
!  * MPI_SENDRECV is given HOST staging buffers.  Passing device arrays
!    directly segfaults unless the MPI library is CUDA-aware (e.g.
!    MVAPICH2 built with CUDA support and MV2_USE_CUDA=1) — this is the
!    cause of the reported "Segmentation fault (signal 11)".
!  * The failure-message format string was malformed ("*** Failed ***"
!    was outside the quoted literal) and would not compile.
!  * A missing MV2_COMM_WORLD_LOCAL_RANK no longer feeds a blank string
!    to the integer read; the rank falls back to device 0.
!
program transposeMPI
  use cudafor
  use mpi
  use transpose_m

  implicit none

  ! global array size
  integer, parameter :: nx = 2048, ny = 2048

  ! host arrays (global) - each process keeps the full input for
  ! initialization and verification only
  real :: h_idata(nx,ny), h_tdata(ny,nx), gold(ny,nx)

  ! host staging buffers for the MPI exchange (see header note)
  real, allocatable :: h_sTile(:,:), h_rTile(:,:)

  ! CUDA vars and device arrays
  type (dim3) :: dimGrid, dimBlock
  real, device, allocatable :: &
       d_idata(:,:), d_tdata(:,:), d_sTile(:,:), d_rTile(:,:)

  ! MPI stuff
  integer :: mpiTileDimX, mpiTileDimY
  integer :: myrank, nprocs, ierr, localRank
  integer :: stage, sRank, rRank
  integer :: status(MPI_STATUS_SIZE)
  real(8) :: timeStart, timeStop
  character (len=10) :: localRankStr
  integer :: i, j, jl, jg, p
  integer :: xOffset, yOffset

  ! For MVAPICH the device must be selected BEFORE MPI initialization.
  call get_environment_variable('MV2_COMM_WORLD_LOCAL_RANK', &
       localRankStr)
  if (len_trim(localRankStr) > 0) then
     read(localRankStr,'(i10)') localRank
  else
     ! Not launched under MVAPICH (or variable unset): use device 0.
     localRank = 0
  end if
  ierr = cudaSetDevice(localRank)

  ! MPI initialization
  call MPI_init(ierr)
  call MPI_comm_rank(MPI_COMM_WORLD, myrank, ierr)
  call MPI_comm_size(MPI_COMM_WORLD, nProcs, ierr)

  ! check parameters and calculate execution configuration
  if (mod(nx,nProcs) == 0 .and. mod(ny,nProcs) == 0) then
     mpiTileDimX = nx/nProcs
     mpiTileDimY = ny/nProcs
  else
     write(*,*) 'nx and ny must be integral multiples of nProcs'
     call MPI_Finalize(ierr)
     stop
  endif

  if (mod(mpiTileDimX, cudaTileDim) /= 0 .or. &
       mod(mpiTileDimY, cudaTileDim) /= 0) then
     write(*,*) 'mpiTileDimX and mpitileDimY must be an ', &
          'integral multiple of cudaTileDim'
     call MPI_Finalize(ierr)
     stop
  end if

  if (mod(cudaTileDim, blockRows) /= 0) then
     write(*,*) 'cudaTileDim must be a multiple of blockRows'
     call MPI_Finalize(ierr)
     stop
  end if

  dimGrid = dim3(mpiTileDimX/cudaTileDim, &
       mpiTileDimY/cudaTileDim, 1)
  dimBlock = dim3(cudaTileDim, blockRows, 1)

  ! write parameters
  if (myrank == 0) then
     write(*,*)
     write(*,"(/,'Array size: ', i0,'x',i0,/)") nx, ny
     write(*,"('CUDA block size: ', i0,'x',i0, &
          ', CUDA tile size: ', i0,'x',i0)") &
          cudaTileDim, blockRows, cudaTileDim, cudaTileDim
     write(*,"('dimGrid: ', i0,'x',i0,'x',i0, &
          ', dimBlock: ', i0,'x',i0,'x',i0,/)") &
          dimGrid%x, dimGrid%y, dimGrid%z, &
          dimBlock%x, dimBlock%y, dimBlock%z
     write(*,"('nprocs: ', i0, ', Local input array size: ', &
          i0,'x',i0)") nprocs, nx, mpiTileDimY
     write(*,"('mpiTileDim: ', i0,'x',i0,/)") &
          mpiTileDimX, mpiTileDimY
  endif

  ! initialize data
  ! host - each process has entire array on host (for now)
  do p = 0, nProcs-1
     do jl = 1, mpiTileDimY
        jg = p*mpiTileDimY + jl
        do i = 1, nx
           h_idata(i,jg) = i+(jg-1)*nx
        enddo
     enddo
  enddo
  gold = transpose(h_idata)

  ! device - each process has nx*mpiTileDimY = ny*mpiTileDimX elements
  allocate(d_idata(nx, mpiTileDimY), &
       d_tdata(ny, mpiTileDimX), &
       d_sTile(mpiTileDimX, mpiTileDimY), &
       d_rTile(mpiTileDimX, mpiTileDimY))
  ! host staging buffers, same shape as the device tiles
  allocate(h_sTile(mpiTileDimX, mpiTileDimY), &
       h_rTile(mpiTileDimX, mpiTileDimY))

  yOffset = myrank*mpiTileDimY
  d_idata(1:nx,1:mpiTileDimY) = &
       h_idata(1:nx,yOffset+1:yOffset+mpiTileDimY)
  d_tdata = -1.0

  ! ---------
  ! transpose
  ! ---------

  call MPI_BARRIER(MPI_COMM_WORLD, ierr)
  timeStart = MPI_Wtime()

  ! 0th stage - local transpose of this rank's diagonal block
  call cudaTranspose<<<dimGrid, dimBlock>>> &
       (d_tdata(myrank*mpiTileDimY+1,1), ny, &
       d_idata(myrank*mpiTileDimX+1,1), nx)

  ! other stages that involve MPI transfers
  do stage = 1, nProcs-1
     ! sRank = the rank to which myrank sends data
     ! rRank = the rank from which myrank receives data
     sRank = modulo(myrank-stage, nProcs)
     rRank = modulo(myrank+stage, nProcs)

     call MPI_BARRIER(MPI_COMM_WORLD, ierr)

     ! pack tile so data to be sent is contiguous
     !$cuf kernel do(2) <<<*,*>>>
     do j = 1, mpiTileDimY
        do i = 1, mpiTileDimX
           d_sTile(i,j) = d_idata(sRank*mpiTileDimX+i,j)
        enddo
     enddo

     ! Stage through the host: device buffers passed straight to
     ! MPI_SENDRECV segfault on a non-CUDA-aware MPI build.
     h_sTile = d_sTile
     call MPI_SENDRECV(h_sTile, mpiTileDimX*mpiTileDimY, &
          MPI_REAL, sRank, myrank, &
          h_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, &
          rRank, rRank, MPI_COMM_WORLD, status, ierr)
     ! copy received data back to the device
     d_rTile = h_rTile

     ! do transpose from receive tile into final array
     ! (no need to unpack)
     call cudaTranspose<<<dimGrid, dimBlock>>> &
          (d_tdata(rRank*mpiTileDimY+1,1), ny, &
          d_rTile, mpiTileDimX)
  end do ! stage

  call MPI_BARRIER(MPI_COMM_WORLD, ierr)
  timeStop = MPI_Wtime()

  ! check results
  h_tdata = d_tdata
  xOffset = myrank*mpiTileDimX
  if (all(h_tdata(1:ny,1:mpiTileDimX) == &
       gold(1:ny, xOffset+1:xOffset+mpiTileDimX))) then
     if (myrank == 0) then
        write(*,"('Bandwidth (GB/s): ', f7.2,/)") &
             2.*(nx*ny*4)/(1.0e+9*(timeStop-timeStart))
     endif
  else
     ! the failure text must live INSIDE the quoted edit descriptor
     write(*,"('[',i0,'] *** Failed ***',/)") myrank
  endif

  ! cleanup
  deallocate(d_idata, d_tdata, d_sTile, d_rTile)
  deallocate(h_sTile, h_rTile)

  call MPI_Finalize(ierr)
end program transposeMPI
模块转置
隐式无
整数,参数::cudaTileDim=32
整数,参数::blockRows=8
包含
属性(全局)&
子程序CUDATTranspose(odata、ldo、idata、ldi)
真实,意图(外)::odata(ldo,*)
真实,意图(in):idata(本地设计院,*)
整数、值、意图(in)::ldo、ldi
real,shared::tile(cudaTileDim+1,cudaTileDim)
整数::x,y,j
x=(块IDX%x-1)*cudaTileDim+threadIdx%x
y=(块IDX%y-1)*cudaTileDim+threadIdx%y
do j=0,cudaTileDim-1,块行
平铺(线程IDX%x,线程IDX%y+j)=idata(x,y+j)
结束
调用syncthreads()
x=(块IDX%y-1)*cudaTileDim+threadIdx%x
y=(块IDX%x-1)*cudaTileDim+threadIdx%y
do j=0,cudaTileDim-1,块行
odata(x,y+j)=平铺(线程IDx%y+j,线程IDx%x)
结束
结束子程序CUDATTranspose
端模块转置
!
! 主代码
!
程序转置符号
使用cudafor
使用mpi
使用转置
隐式无
! 全局数组大小
整数,参数::nx=2048,ny=2048
! 主机阵列(全局)
real :: h_idata(nx,ny), h_tdata(ny,nx), gold(ny,nx)
! CUDA VAR和设备阵列
整数::设备ID
类型(dim3)::dimGrid、dimBlock
实型,设备,可分配::&
d_idata(:,:),d_tdata(:,:),d_sTile(:,:),d_rTile(:,:)
! MPI材料
整数::mpiTileDimX,mpiTileDimY
整数::myrank、nprocs、标记、ierr、localRank
整数::nstages、stage、sRank、rRank
整数::状态(MPI\U状态\U大小)
real(8)::timeStart、timeStop
字符(len=10)::localRankStr
整数::i,j,nyl,jl,jg,p
整数::xOffset,yOffset
! 对于MPI初始化之前的MVAPICH设置设备
调用get_环境_变量('MV2_COMM_WORLD_LOCAL_RANK'&
localRankStr)
读取(localRankStr),(i10)’localRank
ierr=cudaSetDevice(localRank)
! MPI初始化
调用MPI_init(ierr)
调用MPI_comm_rank(MPI_comm_WORLD、myrank、ierr)
呼叫MPI_通信大小(MPI_通信世界、NPROC、ierr)
! 检查参数并计算执行配置
如果(mod(nx,nProcs)==0.和.mod(ny,nProcs)==0),则
mpiTileDimX=nx/nProcs
mpiTileDimY=ny/nProcs
其他的
写入(*,*)“ny必须是NPROC的整数倍”
调用MPI_Finalize(ierr)
停止
恩迪夫
如果(mod(mpiTileDimX,cudaTileDim)/=0.或&
mod(mpiTileDimY,cudaTileDim)/=0)然后
写入(*,*)“mpiTileDimX和mpitileDimY必须是”&
“cudaTileDim的整数倍”
调用MPI_Finalize(ierr)
停止
如果结束
如果(mod(cudaTileDim,blockRows)/=0),则
写入(*,*)“cudaTileDim必须是块行的倍数”
调用MPI_Finalize(ierr)
停止
如果结束
dimGrid=dim3(mpiTileDimX/cudaTileDim&
mpiTileDimY/cudaTileDim,1)
dimBlock=dim3(cudaTileDim,块行,1)
! 写入参数
如果(myrank==0),则
写入(*,*)
写入(*,“(/,“数组大小:”,i0,'x',i0,/)”)nx,ny
写入(*,“('CUDA块大小:',i0,'x',i0&
“,CUDA磁贴大小:”,i0,'x',i0)”&
cudaTileDim,块行,cudaTileDim,cudaTileDim
写入(*,“('dimGrid:',i0,'x',i0,'x',i0&
“,dimBlock:”,i0,'x',i0,'x',i0,/))&
dimGrid%x,dimGrid%y,dimGrid%z&
dimBlock%x,dimBlock%y,dimBlock%z
写入(*,“('nprocs:',i0',本地输入数组大小:'&
i0,'x',i0)“)NPROC,nx,mpiTileDimY
写入(*,“('mpiTileDim:',i0,'x',i0,/)”)&
mpiTileDimX,mpiTileDimY
恩迪夫
! 初始化数据
! 主机-每个进程在主机上都有整个阵列(目前)
do p=0,nProcs-1
do jl=1,mpiTileDimY
jg=p*mpiTileDimY+jl
i=1,nx吗
h_idata(i,jg)=i+(jg-1)*nx
结束循环
结束循环
结束循环
gold=转置(h_idata)
! 设备-每个进程都有
! nx*mpiTileDimY=ny*mpiTileDimX元素
分配(d_idata(nx,mpiTileDimY)&
d_tdata(纽约,mpiTileDimX)&
d_sTile(mpiTileDimX,mpiTileDimY)&
d_rTile(mpiTileDimX,mpiTileDimY))
yOffset=myrank*mpiTileDimY
d_idata(1:nx,1:mpiTileDimY)=&
h_idata(1:nx,yOffset+1:yOffset+mpiTileDimY)
d_tdata=-1.0
! ---------
! 转置
! ---------
呼叫MPI_屏障(MPI_通信世界,ierr)
timeStart=MPI_Wtime()
! 第0阶段-局部转置
叫cudaTranspose&
(d_tdata(myrank*mpiTileDimY+1,1),纽约州&
d_idata(myrank*mpiTileDimX+1,1),nx)
! 涉及MPI传输的其他阶段
do阶段=1,nProcs-1
! sRank=myrank向其发送数据的列组
! rRank=myrank从中接收数据的列组
sRank=模(myrank阶段,NPROC)
rRank=模(myrank+阶段,NPROC)
呼叫MPI_屏障(MPI_通信世界,ierr)
! 打包磁贴,以便发送的数据是连续的
!$cuf kernel do(2) <<<*,*>>>
do j = 1, mpiTileDimY
  do i = 1, mpiTileDimX
    d_sTile(i,j) = d_idata(sRank*mpiTileDimX+i,j)
  enddo
enddo
! 将发送数据从设备复制到主机暂存缓冲区
h_sTile = d_sTile
call MPI_SENDRECV(h_sTile, mpiTileDimX*mpiTileDimY, &
MPI_REAL, sRank, myrank, &
h_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, &
rRank, rRank, MPI_COMM_WORLD, status, ierr)
!data to device device buffer
d_rTile = h_rTile