ArrayPreconditioning.h
#pragma once

// ttk common includes
#include <Debug.h>
#include <Triangulation.h>
#include <psort.h>

#include <numeric> // std::accumulate
#include <vector>

namespace ttk {

#ifdef TTK_ENABLE_MPI

#ifdef TTK_ENABLE_MPI_RANK_ID_INT
  using RankId = int;
#else
  using RankId = char;
#endif

  namespace globalOrder {

    template <typename datatype>
    struct vertexToSort {
      ttk::SimplexId globalId;
      datatype value;
      ttk::RankId rank;
    };

    struct sortedVertex {
      ttk::SimplexId globalId;
      ttk::SimplexId order;
    };

    template <typename datatype>
    bool comp(const vertexToSort<datatype> a, const vertexToSort<datatype> b) {
      return (b.value > a.value)
             || (a.value == b.value && a.globalId < b.globalId);
    }

    template <typename datatype>
    bool oppositeComp(const vertexToSort<datatype> a,
                      const vertexToSort<datatype> b) {
      return (a.value > b.value)
             || (a.value == b.value && a.globalId > b.globalId);
    }
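
    // For illustration: comp() yields an ascending order on (value, globalId)
    // and oppositeComp() the reverse. With equal values, the global id breaks
    // the tie, e.g. {value = 1.0, globalId = 3} sorts before
    // {value = 1.0, globalId = 7}, which itself sorts before
    // {value = 2.0, globalId = 2}.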
  } // namespace globalOrder

#endif // TTK_ENABLE_MPI

  class ArrayPreconditioning : virtual public Debug {

  public:
    ArrayPreconditioning();

    int preconditionTriangulation(AbstractTriangulation *triangulation) {
      // Pre-condition functions.
      if(triangulation) {
#ifdef TTK_ENABLE_MPI
        triangulation->preconditionExchangeGhostVertices();
#endif // TTK_ENABLE_MPI
      }
      return 0;
    }

    void setGlobalOrder(bool order) {
      this->GlobalOrder = order;
    }

#ifdef TTK_ENABLE_MPI
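    // Distributes a chunk [begin, end) of the globally sorted vector: orders
    // of vertices owned by this process are written directly into orderArray,
    // while the others are packed per destination rank and exchanged through
    // non-blocking point-to-point communication.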
    template <typename DT, typename triangulationType>
    int postprocessing(const triangulationType *triangulation,
                       ttk::SimplexId *orderArray,
                       const ttk::SimplexId begin,
                       const ttk::SimplexId end,
                       const ttk::SimplexId orderOffset,
                       std::vector<globalOrder::vertexToSort<DT>> &verticesToSort,
                       MPI_Datatype &MPI_sortedVertexType,
                       MPI_Datatype &MPI_SimplexId,
                       const ttk::SimplexId itNumber,
                       const ttk::SimplexId localNbChunk) const {
      ttk::RankId rank{0};
      std::vector<std::vector<globalOrder::sortedVertex>> verticesSorted(
        ttk::MPIsize_, std::vector<globalOrder::sortedVertex>());
      if(itNumber < localNbChunk + 1) {
        // Compute the order and prepare the vector to send
#ifdef TTK_ENABLE_OPENMP
        std::vector<std::vector<std::vector<globalOrder::sortedVertex>>>
          verticesSortedThread(
            this->threadNumber_,
            std::vector<std::vector<globalOrder::sortedVertex>>(
              ttk::MPIsize_, std::vector<globalOrder::sortedVertex>()));
#pragma omp parallel firstprivate(rank) num_threads(threadNumber_)
        {
          int threadNumber = omp_get_thread_num();
#pragma omp for schedule(static)
          for(ttk::SimplexId i = begin; i < end; i++) {
            rank = verticesToSort.at(i).rank;
            if(rank == ttk::MPIrank_) {
              orderArray[triangulation->getVertexLocalId(
                verticesToSort.at(i).globalId)]
                = orderOffset + i;
            } else {
              verticesSortedThread.at(threadNumber)
                .at(rank)
                .push_back(globalOrder::sortedVertex{
                  verticesToSort.at(i).globalId, orderOffset + i});
            }
          }
        }
        // For a smaller memory footprint
        verticesToSort.resize(begin + 1);
        verticesToSort.shrink_to_fit();
        // Concatenate the vector produced by each thread
#pragma omp parallel for schedule(static, 1)
        for(int j = 0; j < ttk::MPIsize_; j++) {
          for(int i = 0; i < this->threadNumber_; i++) {
            if(j != ttk::MPIrank_) {
              verticesSorted.at(j).insert(
                verticesSorted.at(j).end(),
                verticesSortedThread.at(i).at(j).begin(),
                verticesSortedThread.at(i).at(j).end());
            }
          }
        }
        verticesSortedThread.clear();
        verticesSortedThread.shrink_to_fit();
#else
        for(ttk::SimplexId i = begin; i < end; i++) {
          rank = verticesToSort.at(i).rank;
          if(rank == ttk::MPIrank_) {
            orderArray[triangulation->getVertexLocalId(
              verticesToSort.at(i).globalId)]
              = orderOffset + i;
          } else {
            verticesSorted.at(rank).push_back(globalOrder::sortedVertex{
              verticesToSort.at(i).globalId, orderOffset + i});
          }
        }
#endif // TTK_ENABLE_OPENMP
      }
      // Send and receive the global orders. Using MPI_Waitsome with MPI_Isend
      // and MPI_Irecv lets the computation overlap with communication.
      std::vector<MPI_Request> sendRequests(ttk::MPIsize_ - 1);
      std::vector<MPI_Request> recvRequests(ttk::MPIsize_ - 1);
      std::vector<MPI_Status> sendStatus(ttk::MPIsize_ - 1);
      std::vector<MPI_Status> recvStatus(ttk::MPIsize_ - 1);
      std::vector<ttk::SimplexId> sendMessageSize(ttk::MPIsize_, 0);
      std::vector<ttk::SimplexId> recvMessageSize(ttk::MPIsize_, 0);
      std::vector<int> recvCompleted(ttk::MPIsize_ - 1, 0);
      std::vector<int> sendCompleted(ttk::MPIsize_ - 1, 0);
      int sendPerformedCount = 0;
      int recvPerformedCount = 0;
      int sendPerformedCountTotal = 0;
      int recvPerformedCountTotal = 0;
      int count = 0;
      for(int i = 0; i < ttk::MPIsize_; i++) {
        // Exchange the size of verticesSorted.at(i) with process i.
        if(i != ttk::MPIrank_) {
          sendMessageSize[i] = verticesSorted.at(i).size();
          MPI_Isend(&sendMessageSize[i], 1, MPI_SimplexId, i, 0, ttk::MPIcomm_,
                    &sendRequests[count]);
          MPI_Irecv(&recvMessageSize[i], 1, MPI_SimplexId, i, 0, ttk::MPIcomm_,
                    &recvRequests[count]);
          count++;
        }
      }
      std::vector<std::vector<globalOrder::sortedVertex>> recvVerticesSorted(
        ttk::MPIsize_, std::vector<globalOrder::sortedVertex>());
      std::vector<MPI_Request> sendRequestsData(ttk::MPIsize_ - 1);
      std::vector<MPI_Request> recvRequestsData(ttk::MPIsize_ - 1);
      std::vector<MPI_Status> recvStatusData(ttk::MPIsize_ - 1);
      int recvCount = 0;
      int sendCount = 0;
      int r;
      while(sendPerformedCountTotal < ttk::MPIsize_ - 1
            || recvPerformedCountTotal < ttk::MPIsize_ - 1) {
        if(sendPerformedCountTotal < ttk::MPIsize_ - 1) {
          MPI_Waitsome(ttk::MPIsize_ - 1, sendRequests.data(),
                       &sendPerformedCount, sendCompleted.data(),
                       sendStatus.data());
          if(sendPerformedCount > 0) {
            for(int i = 0; i < sendPerformedCount; i++) {
              // Map the completed-request index back to the destination rank
              // (the current rank is skipped in the request arrays).
              r = sendCompleted[i];
              if(ttk::MPIrank_ <= sendCompleted[i]) {
                r++;
              }
              if(sendMessageSize[r] > 0) {
                MPI_Isend(verticesSorted.at(r).data(), sendMessageSize[r],
                          MPI_sortedVertexType, r, 1, ttk::MPIcomm_,
                          &sendRequestsData[sendCount]);
                sendCount++;
              }
            }
            sendPerformedCountTotal += sendPerformedCount;
          }
        }
        if(recvPerformedCountTotal < ttk::MPIsize_ - 1) {
          MPI_Waitsome(ttk::MPIsize_ - 1, recvRequests.data(),
                       &recvPerformedCount, recvCompleted.data(),
                       recvStatus.data());
          if(recvPerformedCount > 0) {
            for(int i = 0; i < recvPerformedCount; i++) {
              r = recvStatus[i].MPI_SOURCE;
              if(recvMessageSize[r] > 0) {
                recvVerticesSorted.at(r).resize(recvMessageSize[r]);
                MPI_Irecv(recvVerticesSorted.at(r).data(), recvMessageSize[r],
                          MPI_sortedVertexType, r, 1, ttk::MPIcomm_,
                          &recvRequestsData[recvCount]);
                recvCount++;
              }
            }
            recvPerformedCountTotal += recvPerformedCount;
          }
        }
      }
      recvPerformedCountTotal = 0;
      while(recvPerformedCountTotal < recvCount) {
        MPI_Waitsome(recvCount, recvRequestsData.data(), &recvPerformedCount,
                     recvCompleted.data(), recvStatusData.data());
        if(recvPerformedCount > 0) {
          for(int i = 0; i < recvPerformedCount; i++) {
            r = recvStatusData[i].MPI_SOURCE;
#pragma omp parallel for schedule(static)
            for(int j = 0; j < recvMessageSize[r]; j++) {
              orderArray[triangulation->getVertexLocalId(
                recvVerticesSorted.at(r).at(j).globalId)]
                = recvVerticesSorted.at(r).at(j).order;
            }
          }
          recvPerformedCountTotal += recvPerformedCount;
        }
      }
      MPI_Waitall(sendCount, sendRequestsData.data(), MPI_STATUSES_IGNORE);
      return 0;
    }

#endif // TTK_ENABLE_MPI

    template <typename DT, typename triangulationType>
    int processScalarArray(const triangulationType *triangulation,
                           ttk::SimplexId *orderArray,
                           const DT *scalarArray,
                           const size_t nVerts) const {
      // start global timer
      ttk::Timer globalTimer;

      // print horizontal separator
      this->printMsg(ttk::debug::Separator::L1); // L1 is the '=' separator
      // print input parameters in table format
      this->printMsg({
        {"#Threads", std::to_string(this->threadNumber_)},
        {"#Vertices", std::to_string(nVerts)},
      });

      // ---------------------------------------------------------------------
      // Computing the order array
      // ---------------------------------------------------------------------
#ifdef TTK_ENABLE_MPI
      if(ttk::isRunningWithMPI()) {
        ttk::SimplexId id = 0;
        MPI_Datatype MPI_SimplexId = getMPIType(id);
        // Prepare the vector to sort.
        std::vector<globalOrder::vertexToSort<DT>> verticesToSort;
        verticesToSort.reserve(nVerts);
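        // When OpenMP is enabled, the user-defined "merge" reduction below
        // concatenates the vectors filled by each thread into verticesToSort
        // at the end of the parallel loop.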
#ifdef TTK_ENABLE_OPENMP
#pragma omp declare reduction (merge : std::vector<globalOrder::vertexToSort<DT>> : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end()))
#pragma omp parallel for reduction(merge : verticesToSort) schedule(static)
#endif
        for(size_t i = 0; i < nVerts; i++) {
          if(triangulation->getVertexRank(i) == ttk::MPIrank_) {
            verticesToSort.emplace_back(globalOrder::vertexToSort<DT>{
              triangulation->getVertexGlobalId(i), scalarArray[i],
              (ttk::RankId)ttk::MPIrank_});
          }
        }

        MPI_Datatype MPI_vertexToSortType;

        /*
         * WARNING: the struct is sent as an array of char, as experiments show
         * that using MPI's built-in struct management yields poor performance
         * when used with a templated struct.
         */
        MPI_Type_contiguous(sizeof(globalOrder::vertexToSort<DT>), MPI_CHAR,
                            &MPI_vertexToSortType);
        MPI_Type_commit(&MPI_vertexToSortType);
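        // Note: sending raw bytes assumes that every rank uses the same
        // memory layout and endianness for vertexToSort<DT>.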

        std::vector<ttk::SimplexId> vertexDistribution(ttk::MPIsize_);
        ttk::SimplexId localVertexNumber = verticesToSort.size();
        MPI_Allgather(&localVertexNumber, 1, MPI_SimplexId,
                      vertexDistribution.data(), 1, MPI_SimplexId,
                      ttk::MPIcomm_);
        // The distributed sort orders the vertices by scalar value first and
        // uses the global id to break ties.
        p_sort::parallel_sort<globalOrder::vertexToSort<DT>>(
          verticesToSort, globalOrder::comp<DT>, globalOrder::oppositeComp<DT>,
          vertexDistribution, MPI_vertexToSortType, MPI_SimplexId,
          threadNumber_);

        // Now, the computation of the global order starts, using the sorted
        // vector.

        MPI_Datatype types[] = {MPI_SimplexId, MPI_SimplexId};
        int lengths[] = {1, 1};
        MPI_Datatype MPI_sortedVertexType;
        const MPI_Aint mpi_offsets[]
          = {offsetof(globalOrder::sortedVertex, globalId),
             offsetof(globalOrder::sortedVertex, order)};
        MPI_Type_create_struct(
          2, lengths, mpi_offsets, types, &MPI_sortedVertexType);
        MPI_Type_commit(&MPI_sortedVertexType);
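        // Unlike vertexToSort, sortedVertex is exchanged with a regular MPI
        // struct type made of two MPI_SimplexId fields.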

        ttk::SimplexId verticesToSortSize = verticesToSort.size();
        // Compute the order of the first element of the current process
        ttk::SimplexId orderOffset
          = std::accumulate(vertexDistribution.begin(),
                            vertexDistribution.begin() + ttk::MPIrank_,
                            static_cast<ttk::SimplexId>(0));
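        // For instance, with vertexDistribution = {4, 3, 5} on rank 1,
        // orderOffset is 4: the first entry of the locally held sorted vector
        // receives the global order 4.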
        // nbChunk, rest and nbChunkTotal are used to compute the
        // post-processing chunk by chunk.
        const ttk::SimplexId nbChunk = verticesToSortSize / ChunkSize;
        const ttk::SimplexId rest = verticesToSortSize % ChunkSize;
        ttk::SimplexId nbChunkTotal{nbChunk};
        if(rest > 0) {
          nbChunkTotal++;
        }
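        // Example: with verticesToSortSize = 2'500'000'000 and the default
        // ChunkSize of 1'000'000'000, nbChunk = 2, rest = 500'000'000 and
        // nbChunkTotal = 3.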
        MPI_Allreduce(MPI_IN_PLACE, &nbChunkTotal, 1, MPI_SimplexId, MPI_MAX,
                      ttk::MPIcomm_);
        ttk::SimplexId beginningChunk = nbChunk;

        // The end of the vector is processed first, so that elements of
        // verticesToSort can be resized away as soon as they are no longer
        // needed by the rest of the algorithm.
        if(rest > 0) {
          beginningChunk++;
          postprocessing<DT, triangulationType>(
            triangulation, orderArray, nbChunk * ChunkSize,
            nbChunk * ChunkSize + rest, orderOffset, verticesToSort,
            MPI_sortedVertexType, MPI_SimplexId, nbChunk, nbChunk);
        }
        for(int chunk = nbChunk - 1; chunk >= 0; chunk--) {
          postprocessing<DT, triangulationType>(
            triangulation, orderArray, chunk * ChunkSize,
            (chunk + 1) * ChunkSize, orderOffset, verticesToSort,
            MPI_sortedVertexType, MPI_SimplexId, chunk, nbChunk);
        }

        // In case some processes still have messages to send after the
        // current process has sent all of its own.
        for(int chunk = beginningChunk; chunk < nbChunkTotal; chunk++) {
          postprocessing<DT, triangulationType>(
            triangulation, orderArray, nbChunk * ChunkSize + rest,
            nbChunk * ChunkSize + rest, orderOffset, verticesToSort,
            MPI_sortedVertexType, MPI_SimplexId, chunk, nbChunk);
        }

        // Exchange of the order array computed for the ghost vertices.
        ttk::exchangeGhostVertices<ttk::SimplexId, triangulationType>(
          orderArray, triangulation, ttk::MPIcomm_, 1);
      }
#else
      this->printMsg("MPI not enabled!");
      TTK_FORCE_USE(orderArray);
      TTK_FORCE_USE(scalarArray);
      TTK_FORCE_USE(triangulation);
      return 0;
#endif // TTK_ENABLE_MPI

      // ---------------------------------------------------------------------
      // print global performance
      // ---------------------------------------------------------------------
      {
        this->printMsg(ttk::debug::Separator::L2); // horizontal '-' separator
        this->printMsg(
          "Complete", 1, globalTimer.getElapsedTime() // global progress, time
        );
        this->printMsg(ttk::debug::Separator::L1); // horizontal '=' separator
      }

      return 1; // return success
    }

  protected:
    bool GlobalOrder{false};
    // This value was chosen for systems with 128 GB of memory per compute
    // node. On systems with much less memory, it may be too large and a
    // smaller value may be required.
    int ChunkSize{1000000000};
  }; // ArrayPreconditioning class

} // namespace ttk
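
// A minimal usage sketch (not part of the TTK sources): the triangulation,
// scalar array and vertex count below are placeholder names. The order array
// is allocated by the caller and filled by processScalarArray().
//
//   ttk::ArrayPreconditioning preconditioner;
//   preconditioner.preconditionTriangulation(&triangulation);
//
//   std::vector<ttk::SimplexId> orderArray(nVerts);
//   preconditioner.processScalarArray(
//     &triangulation, orderArray.data(), scalarArray, nVerts);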