! a cross b.f
!
! Fixed-Format Fortran Source File
! Generated by PGI Visual Fortran(R)
! 2010-12-12 21:58:04
!
!Parallel matrix multiplication: main program
program cross
implicit double precision (a-h, o-z)
include 'mpif.h'
parameter (nbuffer=128*1024*1024/8)
dimension buf(nbuffer),buf2(nbuffer)
double precision time_start, time_end
external init, check, matmul
call MPI_Init(ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ierr)
call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)
if (myrank.eq.0) then
print *, 'Enter M, N, L: '
call flush(6)
read(*,*) M, N, L
endif
call MPI_Bcast(M, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
call MPI_Bcast(N, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
call MPI_Bcast(L, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
if ( mod(m,nprocs).ne.0 .or. mod(l,nprocs).ne.0 ) then
if (myrank.eq.0) print *, 'M and L must be divisible by nprocs!'
call MPI_Finalize(ierr)
stop
endif
ia = 1
! offsets of a, b, c, work inside buf/buf2
ib = ia + (m/nprocs)*n
ic = ib + n*(l/nprocs)
iwk = ic + (m/nprocs)*l
iend = iwk + n*(l/nprocs)
if ( iend .gt. nbuffer+1 ) then
if (myrank.eq.0) print *, 'Insufficient buffer size!'
call MPI_Finalize(ierr)
stop
endif
call init( m, n, l, myrank, nprocs, buf(ia), buf(ib), buf(ic)
& , buf2(ia),buf2(ib),buf2(ic) )
time_start = MPI_Wtime()
call matmul( m, n, l, myrank, nprocs, buf2(ia), buf2(ib), buf2(ic)
&, buf2(iwk) )
time_end = MPI_Wtime()
call check( m, n, l, myrank, nprocs, buf2(ia), buf2(ib), buf2(ic))
if ( myrank .eq. 0 ) then
print *, 'time = ', time_end-time_start
! flop count = m*l*(2n-1): n multiplies and n-1 adds per result element
print *, 'mflops = ', m*(n+n-1.0)*l/(time_end-time_start)*1d-6
endif
print*,'ok'
call MPI_Finalize(ierr)
stop
end
!------------------------------------------------------------------
subroutine init(m, n, l, myrank, nprocs, a, b, c, a2, b2,c2)
implicit double precision (a-h, o-z)
include 'mpif.h'
dimension a(m/nprocs, n), b(n, l/nprocs), c(m/nprocs, l)
dimension a2(n, m/nprocs), b2(l/nprocs, n), c2(l,m/nprocs)
mloc = m/nprocs
lloc = l/nprocs
! Init. a, b
do j=1, n
do i=1, mloc
a(i,j) = i+myrank*mloc
enddo
enddo
do j=1, lloc
do i=1, n
b(i,j) = j+myrank*lloc
enddo
enddo
! Transpose a, b -> a2, b2
do j=1, mloc
do i=1,n
a2(i,j) = a(j,i)
enddo
enddo
do j=1, n
do i=1,lloc
b2(i,j) = b(j,i)
enddo
enddo
return
end
!------------------------------------------------------------------
subroutine check(m, n, l, myrank, nprocs, a, b, c)
implicit double precision (a-h, o-z)
include 'mpif.h'
! a, b, c arrive here already transposed (a2, b2, c2 from init)
dimension a(n, m/nprocs), b(l/nprocs, n), c(l, m/nprocs)
integer local_code, code
mloc = m/nprocs
lloc = l/nprocs
!Check the results
local_code = 0
do i=1, l
do j=1, mloc
if ( abs(c(i,j) - n*dble(j+myrank*lloc)*i) .gt. 1d-10 ) then
local_code = 1
print*,'local_code=',local_code
goto 10
endif
enddo
enddo
10 call MPI_Reduce( local_code, code, 1, MPI_INTEGER, MPI_SUM, 0,
&MPI_COMM_WORLD, ierr)
!
if ( myrank .eq. 0 ) then
print *, 'code = ', code
endif
!
return
end
!------------------------------------------------------------------
! Parallel multiplication of matrices using MPI_Isend/MPI_Irecv
subroutine matmul(m, n, l, myrank, nprocs, a, b, c, work)
implicit double precision (a-h, o-z)
include 'mpif.h'
dimension a(n,m/nprocs), b(l/nprocs,n), c(l/nprocs,m),
& work(n,m/nprocs)
integer src, dest, tag
integer status(MPI_STATUS_SIZE, 2), request(2)
*
mloc = m/nprocs
lloc = l/nprocs
*
dest = mod( myrank-1+nprocs, nprocs )
src = mod( myrank+1,nprocs )
*
jpos=myrank*mloc
print*,'myrank=',myrank
c print*,'dest=',dest,'src=',src
c print*,'jpos=',jpos,'tag=',tag
*
do ip=1, nprocs - 1
tag = 10000 + ip
*
call MPI_Isend( a, n*mloc, MPI_DOUBLE_PRECISION, dest, tag,
& MPI_COMM_WORLD, request(1), ierr )
call MPI_Irecv( work, n*mloc, MPI_DOUBLE_PRECISION, src, tag,
& MPI_COMM_WORLD, request(2), ierr )
*
do i=1, lloc
do j=1, mloc
sum=0.d0
do k=1, n
sum = sum + b(i,k) * a(k,j)
enddo
c(i, j+jpos) = sum
enddo
enddo
*
call MPI_Waitall(2, request, status, ierr)
*
*     copy work -> a (this copy could be avoided by alternating the
*     roles of a and work between the compute and communication steps)
do i=1, n
do j=1, mloc
a(i,j) = work(i,j)
enddo
enddo
*
jpos = jpos + mloc
if ( jpos .ge. m ) jpos = 0
*
enddo
*
do i=1, lloc
do j=1, mloc
sum=0.d0
do k=1, n
sum = sum + b(i,k) * a(k,j)
enddo
c(i, j+jpos) = sum
enddo
enddo
*
print*,'c(1,mloc)=',c(1,mloc)
print*,'c(1,2)=', c(1,2)
print*,'c(2,1)=', c(2,1)
print*,'c(lloc,1)=',c(lloc,1)
return
end
Parallel programming patterns
Peer pattern: all parts of the program have equal status and run essentially the same code; they differ only in the data or objects they process. Master-slave pattern: a master/dependent relationship among the communicating processes, with one process directing the others (a minimal sketch follows below).
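The matrix program above is peer-style: every rank runs the same matmul code on its own block. For contrast, here is a minimal master-slave sketch in MPI C; the task payload, tags, and squaring "work" are illustrative, not from the original program:

#include <mpi.h>
#include <stdio.h>

/* Master-slave sketch: rank 0 hands one integer "task" to each other
 * rank, the slaves square it and send the result back. */
int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (rank == 0) {                      /* master: distribute, collect */
        for (int p = 1; p < size; p++) {
            int task = p * 10;            /* hypothetical task payload */
            MPI_Send(&task, 1, MPI_INT, p, 0, MPI_COMM_WORLD);
        }
        for (int p = 1; p < size; p++) {
            int res;
            MPI_Recv(&res, 1, MPI_INT, p, 1, MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
            printf("result from rank %d: %d\n", p, res);
        }
    } else {                              /* slave: receive, compute, reply */
        int task, res;
        MPI_Recv(&task, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
        res = task * task;
        MPI_Send(&res, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
    }
    MPI_Finalize();
    return 0;
}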
Point-to-point communication modes
Blocking: when a blocking send completes, the data has been copied out of the send buffer, so the send buffer may be reused; when a blocking receive completes, the data has been copied into the receive buffer, so the receiver may use it. Non-blocking: with suitable hardware support, computation can overlap communication, as the matmul routine above does (see the sketch below). There are four send modes: standard, buffered, synchronous, and ready.
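A stripped-down sketch of the non-blocking overlap pattern used in matmul: post MPI_Isend/MPI_Irecv, compute while the transfers are in flight, then MPI_Waitall. The block size N, tag, and ring data are illustrative:

#include <mpi.h>
#include <stdio.h>

#define N 4  /* illustrative block size */

/* Each rank passes its block one step around a ring (as matmul does
 * with the A blocks) and computes while the data is in flight. */
int main(int argc, char *argv[])
{
    double a[N], work[N], local = 0.0;
    MPI_Request req[2];
    MPI_Status  st[2];
    int rank, size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    for (int i = 0; i < N; i++) a[i] = rank + i;

    int dest = (rank - 1 + size) % size;   /* same ring as matmul */
    int src  = (rank + 1) % size;

    /* post both transfers, then compute while they are in flight */
    MPI_Isend(a,    N, MPI_DOUBLE, dest, 99, MPI_COMM_WORLD, &req[0]);
    MPI_Irecv(work, N, MPI_DOUBLE, src,  99, MPI_COMM_WORLD, &req[1]);

    for (int i = 0; i < N; i++) local += a[i];  /* overlapped compute */

    MPI_Waitall(2, req, st);           /* block until both complete */
    printf("rank %d: local sum %.1f, first received %.1f\n",
           rank, local, work[0]);
    MPI_Finalize();
    return 0;
}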
Collective communication
All processes in a given group take part in a global data-processing or communication operation.
Functions: communication (moving data within the group); synchronization (all processes reach an agreed point); computation (performing a given operation on the data).
Types: 1) data movement: broadcast (MPI_Bcast), gather (MPI_Gather), scatter (MPI_Scatter), allgather (MPI_Allgather), all-to-all (MPI_Alltoall); 2) reduction: reduce (MPI_Reduce) combines the data in the input buffers of all processes in the group under a given operation OP and returns the result to the receive buffer of the root process, while scan (MPI_Scan) gives each process the reduction over itself and all processes ranked before it, stored in its own output buffer; 3) synchronization: barrier (MPI_Barrier) synchronizes all processes in the communicator; each waits until every process has made its MPI_Barrier call. A small MPI_Scan example follows.
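MPI_Scan is the least familiar call in that list, so here is a minimal sketch: with MPI_SUM, rank i receives the sum of the values contributed by ranks 0 through i (an inclusive prefix sum). The contributed value rank+1 is just for illustration:

#include <mpi.h>
#include <stdio.h>

/* Inclusive prefix sum: rank i contributes i+1 and receives
 * 1 + 2 + ... + (i+1). */
int main(int argc, char *argv[])
{
    int rank, mine, prefix;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    mine = rank + 1;
    MPI_Scan(&mine, &prefix, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    printf("rank %d: prefix sum = %d\n", rank, prefix);  /* 1, 3, 6, ... */
    MPI_Finalize();
    return 0;
}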
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define SIZE 10

int main(int argc, char *argv[])
{
    int myid, numprocs;
    int data[SIZE], i, x, low, high, myresult, result;
    char fn[255];
    FILE *fp;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    if (myid == 0) { /* Open input file and initialize data */
        strcpy(fn, getenv("HOME"));
        strcat(fn, "/data");
        if ((fp = fopen(fn, "r")) == NULL) {
            printf("Can't open the input file: %s\n\n", fn);
            exit(1);
        }
        for (i = 0; i < SIZE; i++) fscanf(fp, "%d", &data[i]);
        fclose(fp);
    }
    /* Broadcast data */
    MPI_Bcast(data, SIZE, MPI_INT, 0, MPI_COMM_WORLD);
    /* Add my portion of data */
    x = SIZE / numprocs;
    low = myid * x;
    high = low + x;
    if (myid == numprocs - 1) high = SIZE;
    myresult = 0;
    for (i = low; i < high; i++)
        myresult += data[i];
    /* Compute global sum */
    MPI_Reduce(&myresult, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    if (myid == 0) printf("The sum is %d.\n", result);
    MPI_Finalize();
    return 0;
}