// caffe2/mpi/mpi_common.cc

#include "caffe2/mpi/mpi_common.h"
#include <thread>
#include <c10/util/typeid.h>
#include "caffe2/utils/proto_utils.h"

namespace caffe2 {

CAFFE_KNOWN_TYPE(MPICommonWorldWrapper);

static std::mutex gCaffe2MPIMutex;

std::mutex& MPIMutex() {
  return gCaffe2MPIMutex;
}
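
// MPIMutex() lets callers serialize MPI calls issued from multiple threads,
// which matters when the MPI library only grants MPI_THREAD_SERIALIZED (see
// the MPI_Init_thread check in MPISetupPeers below). A minimal usage sketch;
// the call site is illustrative, not code from this file:
//
//   std::lock_guard<std::mutex> guard(MPIMutex());
//   MPI_CHECK(MPI_Barrier(GlobalMPIComm()));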

static MPI_Comm gCaffe2MPIComm = MPI_COMM_WORLD;

MPI_Comm GlobalMPIComm() {
  return gCaffe2MPIComm;
}

void SetGlobalMPIComm(MPI_Comm new_comm) {
  if (gCaffe2MPIComm != MPI_COMM_WORLD) {
    MPI_Comm_free(&gCaffe2MPIComm);
  }
  gCaffe2MPIComm = new_comm;
}
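
// Note: SetGlobalMPIComm takes ownership of new_comm. The previously
// installed communicator is freed (unless it is MPI_COMM_WORLD), so callers
// should not free the communicator they pass in themselves.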

int MPICommSize(MPI_Comm comm) {
  int comm_size;
  MPI_CHECK(MPI_Comm_size(comm, &comm_size));
  return comm_size;
}

int MPICommRank(MPI_Comm comm) {
  int comm_rank;
  MPI_CHECK(MPI_Comm_rank(comm, &comm_rank));
  return comm_rank;
}
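
// Illustrative use of the two helpers above (a sketch, not code from this
// file):
//
//   const int size = MPICommSize(GlobalMPIComm());
//   const int rank = MPICommRank(GlobalMPIComm());
//   VLOG(1) << "rank " << rank << " of " << size;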

/**
 * Helper function that merges an existing intra-communicator with an
 * inter-communicator to a newly attached rank, and returns the combined
 * intra-communicator.
 */
static MPI_Comm AssimilateComm(MPI_Comm intra, MPI_Comm inter) {
  MPI_Comm peer = MPI_COMM_NULL;
  MPI_Comm newInterComm = MPI_COMM_NULL;
  MPI_Comm newIntraComm = MPI_COMM_NULL;
  // The spawned rank will be the "high" rank in the new intra-comm
  int high = (MPI_COMM_NULL == intra) ? 1 : 0;
  // If this is one of the (two) ranks in the inter-comm,
  // create a new intra-comm from the inter-comm
  if (MPI_COMM_NULL != inter) {
    MPI_CHECK(MPI_Intercomm_merge(inter, high, &peer));
  } else {
    peer = MPI_COMM_NULL;
  }
  // Create a new inter-comm between the pre-existing intra-comm
  // (all of it, not only rank zero), and the remote (spawned) rank,
  // using the just-created intra-comm as the peer communicator.
  int tag = 12345;
  if (MPI_COMM_NULL != intra) {
    // This task is a member of the pre-existing intra-comm
    MPI_CHECK(MPI_Intercomm_create(intra, 0, peer, 1, tag, &newInterComm));
  } else {
    // This is the remote (spawned) task
    MPI_CHECK(
        MPI_Intercomm_create(MPI_COMM_SELF, 0, peer, 0, tag, &newInterComm));
  }
  // Now convert this inter-comm into an intra-comm
  MPI_CHECK(MPI_Intercomm_merge(newInterComm, high, &newIntraComm));
  // Clean up the intermediaries
  if (MPI_COMM_NULL != peer) {
    MPI_CHECK(MPI_Comm_free(&peer));
  }
  MPI_CHECK(MPI_Comm_free(&newInterComm));
  // Delete the original intra-comm
  if (MPI_COMM_NULL != intra && MPI_COMM_WORLD != intra &&
      GlobalMPIComm() != intra) {
    MPI_CHECK(MPI_Comm_free(&intra));
  }
  // Return the new intra-comm
  return newIntraComm;
}
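
// Schematically, with pre-existing ranks P0..Pn and a newly attached rank S
// (a comment-only summary of AssimilateComm above, not additional behavior):
//
//   1. The two ranks on the inter-comm (the accepting rank and S) merge it
//      into "peer", with S as the high rank.
//   2. All of P0..Pn (leader P0) and S create a new inter-comm, using "peer"
//      as the bridge communicator.
//   3. Both sides merge that inter-comm into a single intra-comm
//      {P0..Pn, S}, which is returned.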

void MPISetupPeers(
    const int replicas,
    const string& role,
    const string& job_path) {
  int flag;
  MPI_Initialized(&flag);
  if (!flag) {
    int mpi_ret;
    MPI_CHECK(MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &mpi_ret));
    if (mpi_ret != MPI_THREAD_MULTIPLE && mpi_ret != MPI_THREAD_SERIALIZED) {
      LOG(FATAL) << "MPISetupPeers requires the underlying MPI to support "
                 << "the MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE mode.";
      return;
    }
  }
  if (MPICommSize(MPI_COMM_WORLD) != 1) {
    LOG(ERROR) << "MPI_COMM_WORLD size is not 1: did you already run "
                  "MPISetupPeers? Note that if you execute your program with "
                  "mpirun to launch multiple local processes, you should not "
                  "call MPISetupPeers.";
    return;
  }
if (role == "server") {
// Open a port to accept connections.
char port_name[MPI_MAX_PORT_NAME] = {'\0'};
MPI_CHECK(MPI_Open_port(MPI_INFO_NULL, port_name));
VLOG(1) << "MPI server: port: " << port_name;
// Writes the port name to the file.
CHECK(WriteStringToFile(std::string(port_name), job_path.c_str()));
VLOG(1) << "MPI server: wrote to file: " << job_path;
int comm_size = MPICommSize(GlobalMPIComm());
while (comm_size < replicas) {
MPI_Comm icomm;
VLOG(1) << "MPI server: waiting for client "
<< "(" << comm_size << "/" << replicas << " have connected)";
MPI_CHECK(
MPI_Comm_accept(port_name, MPI_INFO_NULL, 0, MPI_COMM_SELF, &icomm));
VLOG(1) << "MPI server: accepted client";
MPI_Comm new_intra_comm = AssimilateComm(GlobalMPIComm(), icomm);
SetGlobalMPIComm(new_intra_comm);
comm_size = MPICommSize(new_intra_comm);
}
  } else {
    // Opens the job path file to obtain the server address.
    std::string port_name;
    while (!ReadStringFromFile(job_path.c_str(), &port_name) ||
           port_name.length() == 0) {
      /* sleep override */
      std::this_thread::sleep_for(std::chrono::seconds(1));
    }
    // Connect to the server.
    MPI_Comm icomm;
    VLOG(1) << "MPI client: connecting to port: " << port_name;
    MPI_CHECK(MPI_Comm_connect(
        const_cast<char*>(port_name.c_str()),
        MPI_INFO_NULL,
        0,
        GlobalMPIComm(),
        &icomm));
    VLOG(1) << "MPI client: connected";
    // Join the server's reference intracommunicator.
    MPI_Comm new_intra_comm = AssimilateComm(MPI_COMM_NULL, icomm);
    SetGlobalMPIComm(new_intra_comm);
    // Let other clients join the intracommunicator we're now a part of.
    while (MPICommSize(GlobalMPIComm()) < replicas) {
      MPI_Comm comm = AssimilateComm(GlobalMPIComm(), MPI_COMM_NULL);
      SetGlobalMPIComm(comm);
    }
  }
  // After all peers have assimilated, do a barrier.
  MPI_CHECK(MPI_Barrier(GlobalMPIComm()));
  VLOG(1) << "MPI using a communicator of size: "
          << MPICommSize(GlobalMPIComm());
}
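
// A minimal usage sketch for MPISetupPeers (the rendezvous file path and
// replica count below are hypothetical, and each process is launched
// separately rather than via mpirun):
//
//   // On the designated server process:
//   MPISetupPeers(4, "server", "/tmp/mpi_rendezvous");
//   // On each of the three client processes:
//   MPISetupPeers(4, "client", "/tmp/mpi_rendezvous");
//
// Once MPISetupPeers returns on every peer, GlobalMPIComm() refers to a
// single intra-communicator of size `replicas` on all of them.
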
} // namespace caffe2