Java 使用OpenMPI广播发生死锁的数据
我用Java编写了一个程序,在单线程中调用OpenMPI进行通信Isend/recv用于防止死锁。调用方调用send方法,然后将所有发送请求放入队列。网络线程从队列获取要发送的请求Java 使用OpenMPI广播发生死锁的数据,java,multithreading,mpi,distributed-computing,openmpi,Java,Multithreading,Mpi,Distributed Computing,Openmpi,我用Java编写了一个程序,在单线程中调用OpenMPI进行通信Isend/recv用于防止死锁。调用方调用send方法,然后将所有发送请求放入队列。网络线程从队列获取要发送的请求 class NetworkThread extends Thread { private final ConcurrentLinkedQueue<SendRequest> sendQueue = new ConcurrentLinkedQueue<>(); private fi
class NetworkThread extends Thread {
private final ConcurrentLinkedQueue<SendRequest> sendQueue = new ConcurrentLinkedQueue<>();
private final List<Request> activeSends = new LinkedList<>();
private final List<RecvRequest> recvList = new LinkedList<>();
private volatile boolean shutdown;
@Override
public void run() {
System.out.println("network thread started");
try {
loop();
} catch (MPIException e) {
e.printStackTrace();
}
}
void loop() throws MPIException {
while (!shutdown) {
Status status = MPI.COMM_WORLD.iProbe(MPI.ANY_SOURCE, MPI.ANY_TAG);
if (status != null) {
int source = status.getSource();
int tag = status.getTag();
int sizeInBytes = status.getCount(MPI.BYTE);
ByteBuffer buffer = MPI.newByteBuffer(sizeInBytes);
MPI.COMM_WORLD.recv(buffer, sizeInBytes, MPI.BYTE, source, tag);
byte[] data = new byte[sizeInBytes];
buffer.get(data);
RecvRequest recvRequest = new RecvRequest(data, source, tag);
synchronized (recvList) {
recvList.add(recvRequest);
}
}
SendRequest sendRequest;
while ((sendRequest = sendQueue.poll()) != null) {
byte[] data = sendRequest.getData();
ByteBuffer buffer = MPI.newByteBuffer(data.length);
buffer.put(data);
Request request = MPI.COMM_WORLD.iSend(buffer, data.length, MPI.BYTE, sendRequest.getDest(), sendRequest.getTag());
synchronized (activeSends) {
activeSends.add(request);
}
}
//delete sent record
synchronized (activeSends) {
Iterator<Request> iterator = activeSends.iterator();
while (iterator.hasNext()) {
Request request = iterator.next();
if (request.test())
iterator.remove();
}
}
}
}
public void send(byte[] data, int dest, int tag) {
SendRequest sendRequest = new SendRequest(data, dest, tag);
sendQueue.add(sendRequest);
}
public byte[] read(int source, int tag) {
byte[] data;
while ((data = tryRead(source, tag)) == null) {
try {
Thread.sleep(10);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
return data;
}
public byte[] tryRead(int source, int tag) {
byte[] data = null;
synchronized (recvList) {
Iterator<RecvRequest> iterator = recvList.iterator();
while (iterator.hasNext()) {
RecvRequest recvRequest = iterator.next();
if (recvRequest.getSource() == source && recvRequest.getTag() == tag) {
iterator.remove();
data = recvRequest.getData();
break;//just get one
}
}
}
return data;
}
public void shutdown() {
shutdown = true;
//waiting for all sent
synchronized (activeSends) {
while (activeSends.size() > 0)
try {
Thread.sleep(1);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
class SendRequest {
private byte[] data;
private int dest;
private int tag;
SendRequest(byte[] data, int dest, int tag) {
this.data = data;
this.dest = dest;
this.tag = tag;
}
public int getTag() {
return tag;
}
public int getDest() {
return dest;
}
public byte[] getData() {
return data;
}
}
class RecvRequest {
private byte[] data;
private int source;
private int tag;
RecvRequest(byte[] data, int source, int tag) {
this.data = data;
this.source = source;
this.tag = tag;
}
public int getTag() {
return tag;
}
public int getSource() {
return source;
}
public byte[] getData() {
return data;
}
}
问题:此程序在较低的进程(例如插槽=4)下运行良好。但当增加要发送的插槽或数据大小时,程序偶尔会进入死锁状态。我试图更改OpenMPI版本(3.0、2.1.2、1.7.5),但似乎不起作用。乍一看,关机过程中似乎存在争用情况。您应该在退出之前完成所有队列(例如发送和接收所有内容)。是的,语句“shutdown=true”应该在“while code block”下面执行。修复后,问题仍然发生。我添加了一些输出语句,以确定哪一行被卡住了。sendTh.start();recvTh.start();sendTh.join();recvTh.join();System.out.println(String.format(“%s-%d”在退出之前),host,rank);networkThread.shutdown();System.out.println(String.format(“%s-%d”在退出后)、host、rank));networkThread.join();结果显示,所有进程都输出“xxxx-xx后退出”,但有些进程卡在“networkThread.shutdown();”中。这很奇怪,因为所有进程都接收到数据,所以activeSends列表应该是空的。因此,任何进程都不应该停留在“while code block”中@GillesGouaillardeti在
NetworkThread.run()
的末尾添加了一个printf()
类。尽管所有非主列组都到达了那里,但并不是所有列组都可以join()
它们的NetworkThread
。我对java MPI绑定知之甚少,而且我发现的文档非常糟糕。您确定传递给MPI.COMM\u WORLD.iSend
的缓冲区在请求期间始终保持活动状态吗?另外,您应该以某种方式请求并确认您的线程级别至少是MPI\u thread\u SERIALIZED
。您应该在退出之前完成所有队列(例如发送和接收所有内容)。是的,语句“shutdown=true”应该在“while code block”下面执行。修复后,问题仍然发生。我添加了一些输出语句,以确定哪一行被卡住了。sendTh.start();recvTh.start();sendTh.join();recvTh.join();System.out.println(String.format(“%s-%d”在退出之前),host,rank);networkThread.shutdown();System.out.println(String.format(“%s-%d”在退出后)、host、rank));networkThread.join();结果显示,所有进程都输出“xxxx-xx后退出”,但有些进程卡在“networkThread.shutdown();”中。这很奇怪,因为所有进程都接收到数据,所以activeSends列表应该是空的。因此,任何进程都不应该停留在“while code block”中@GillesGouaillardeti在NetworkThread.run()
的末尾添加了一个printf()
类。尽管所有非主列组都到达了那里,但并不是所有列组都可以join()
它们的NetworkThread
。我对java MPI绑定知之甚少,而且我发现的文档非常糟糕。您确定传递给MPI.COMM\u WORLD.iSend
的缓冲区在请求期间始终保持活动状态吗?此外,您应该以某种方式请求并确认您的线程级别至少是MPI\u thread\u序列化的
。
public class BroadcastTest {
private static final int TAG_MPI = 123;
static int rank;
static String host;
static {
try {
host = InetAddress.getLocalHost().getHostName();
} catch (UnknownHostException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws MPIException, InterruptedException, UnknownHostException {
MPI.Init(args);
int rank = MPI.COMM_WORLD.getRank();
int size = MPI.COMM_WORLD.getSize();
BroadcastTest.rank = rank;
if (rank == 0) {
System.out.println(String.format("total %d machines", size));
System.out.println("master started");
} else {
NetworkThread networkThread = new NetworkThread();
networkThread.start();
Thread sendTh = new Thread(() -> {
for (int i = 0; i < 5; i++) {
try {
Thread.sleep((long) (Math.random() * 1000)); //send data five times in random interval
} catch (InterruptedException e) {
e.printStackTrace();
}
for (int machineId = 1/* skip master */; machineId < size; machineId++) {
if (machineId == rank) continue;//skip myself
networkThread.send(new byte[4096], machineId, TAG_MPI); //send 4K bytes data
}
}
});
//receive data
Thread recvTh = new Thread(() -> {
for (int i = 0; i < 5; i++) {
for (int machineId = 1; machineId < size; machineId++) {
if (machineId == rank) continue;
byte[] bytes = networkThread.read(machineId, TAG_MPI);
}
}
});
sendTh.start();
recvTh.start();
sendTh.join();
recvTh.join();
networkThread.shutdown();
networkThread.join();
}
System.out.println(String.format("%s exit", host));
MPI.Finalize();
}
}
/home/gongsf/openmpi-2.1.2/bin/mpirun --prefix /home/gongsf/openmpi-2.1.2 -bycore -nooversubscribe -machinefile /home/gongsf/JavaMPI/myhosts /home/gongsf/jdk1.8.0_144/bin/java -classpath /home/gongsf/JavaMPI/lib/*:/home/gongsf/JavaMPI/out/production/JavaMPI BroadcastTest