ArrayPreconditioning.h
#pragma once

// ttk common includes
#include <Debug.h>
#include <Triangulation.h>
#include <psort.h>

// std includes
#include <cmath> // std::floor
#include <cstddef> // offsetof
#include <numeric> // std::accumulate
#include <vector>

namespace ttk {

#ifdef TTK_ENABLE_MPI

#ifdef TTK_ENABLE_MPI_RANK_ID_INT
  using RankId = int;
#else
  using RankId = char;
#endif

  namespace globalOrder {

    template <typename datatype>
    struct vertexToSort {
      ttk::SimplexId globalId;
      datatype value;
      ttk::RankId rank;
    };

    struct sortedVertex {
      ttk::SimplexId globalId;
      ttk::SimplexId order;
    };

    template <typename datatype>
    bool comp(const vertexToSort<datatype> a,
              const vertexToSort<datatype> b) {
      return (b.value > a.value)
             || (a.value == b.value && a.globalId < b.globalId);
    }

    template <typename datatype>
    bool oppositeComp(const vertexToSort<datatype> a,
                      const vertexToSort<datatype> b) {
      return (a.value > b.value)
             || (a.value == b.value && a.globalId > b.globalId);
    }
  } // namespace globalOrder

#endif // TTK_ENABLE_MPI
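
  /* Illustration (a sketch for exposition, not compiled as part of this
   * header): the comparator pair above defines a strict total order on
   * vertices, by scalar value first and by global id to break ties;
   * oppositeComp is the reversed order, passed alongside comp to
   * p_sort::parallel_sort below. With a hypothetical float field:
   *
   *   #include <algorithm>
   *   std::vector<ttk::globalOrder::vertexToSort<float>> v{
   *     {12, 1.0f, 0}, {7, 1.0f, 0}, {3, 0.5f, 0}};
   *   std::sort(v.begin(), v.end(), ttk::globalOrder::comp<float>);
   *   // resulting globalId sequence: 3 (value 0.5), then 7 and 12
   *   // (both 1.0, ordered by globalId)
   */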
  class ArrayPreconditioning : virtual public Debug {

  public:
    ArrayPreconditioning();

    int preconditionTriangulation(AbstractTriangulation *triangulation) {
      // Pre-condition functions.
      if(triangulation) {
#ifdef TTK_ENABLE_MPI
        triangulation->preconditionExchangeGhostVertices();
#endif // TTK_ENABLE_MPI
      }
      return 0;
    }

    void setGlobalOrder(bool order) {
      this->GlobalOrder = order;
    }

#ifdef TTK_ENABLE_MPI
    template <typename DT, typename triangulationType>
    int postprocessing(const triangulationType *triangulation,
                       ttk::SimplexId *orderArray,
                       const ttk::SimplexId begin,
                       const ttk::SimplexId end,
                       const ttk::SimplexId orderOffset,
                       std::vector<globalOrder::vertexToSort<DT>>
                         &verticesToSort,
                       MPI_Datatype &MPI_sortedVertexType,
                       MPI_Datatype &MPI_SimplexId,
                       const ttk::SimplexId itNumber,
                       const ttk::SimplexId localNbChunk) const {
      ttk::RankId rank{0};
      std::vector<std::vector<globalOrder::sortedVertex>> verticesSorted(
        ttk::MPIsize_, std::vector<globalOrder::sortedVertex>());
      if(itNumber < localNbChunk + 1) {
        // Compute the order and prepare the vector to send
#ifdef TTK_ENABLE_OPENMP
        std::vector<std::vector<std::vector<globalOrder::sortedVertex>>>
          verticesSortedThread(
            this->threadNumber_,
            std::vector<std::vector<globalOrder::sortedVertex>>(
              ttk::MPIsize_, std::vector<globalOrder::sortedVertex>()));
#pragma omp parallel firstprivate(rank) num_threads(threadNumber_)
        {
          int threadNumber = omp_get_thread_num();
#pragma omp for schedule(static)
          for(ttk::SimplexId i = begin; i < end; i++) {
            rank = verticesToSort.at(i).rank;
            if(rank == ttk::MPIrank_) {
              // Vertex owned by this process: write its order directly
              orderArray[triangulation->getVertexLocalId(
                verticesToSort.at(i).globalId)]
                = orderOffset + i;
            } else {
              // Vertex owned by another process: buffer it for sending
              verticesSortedThread.at(threadNumber)
                .at(rank)
                .push_back(globalOrder::sortedVertex{
                  verticesToSort.at(i).globalId, orderOffset + i});
            }
          }
        }
        // For a smaller memory footprint
        verticesToSort.resize(begin + 1);
        verticesToSort.shrink_to_fit();
        // Concatenate the vectors produced by each thread
#pragma omp parallel for schedule(static, 1)
        for(int j = 0; j < ttk::MPIsize_; j++) {
          for(int i = 0; i < this->threadNumber_; i++) {
            if(j != ttk::MPIrank_) {
              verticesSorted.at(j).insert(
                verticesSorted.at(j).end(),
                verticesSortedThread.at(i).at(j).begin(),
                verticesSortedThread.at(i).at(j).end());
            }
          }
        }
        verticesSortedThread.clear();
        verticesSortedThread.shrink_to_fit();
#else
        for(ttk::SimplexId i = begin; i < end; i++) {
          rank = verticesToSort.at(i).rank;
          if(rank == ttk::MPIrank_) {
            orderArray[triangulation->getVertexLocalId(
              verticesToSort.at(i).globalId)]
              = orderOffset + i;
          } else {
            verticesSorted.at(rank).push_back(globalOrder::sortedVertex{
              verticesToSort.at(i).globalId, orderOffset + i});
          }
        }
#endif // TTK_ENABLE_OPENMP
      }
      // Send and receive the global orders. Using MPI_Isend, MPI_Irecv and
      // MPI_Waitsome lets the order computation overlap with communication
      // (the pattern is summarized in the sketch after this function).
      std::vector<MPI_Request> sendRequests(ttk::MPIsize_ - 1);
      std::vector<MPI_Request> recvRequests(ttk::MPIsize_ - 1);
      std::vector<MPI_Status> sendStatus(ttk::MPIsize_ - 1);
      std::vector<MPI_Status> recvStatus(ttk::MPIsize_ - 1);
      std::vector<ttk::SimplexId> sendMessageSize(ttk::MPIsize_, 0);
      std::vector<ttk::SimplexId> recvMessageSize(ttk::MPIsize_, 0);
      std::vector<int> recvCompleted(ttk::MPIsize_ - 1, 0);
      std::vector<int> sendCompleted(ttk::MPIsize_ - 1, 0);
      int sendPerformedCount = 0;
      int recvPerformedCount = 0;
      int sendPerformedCountTotal = 0;
      int recvPerformedCountTotal = 0;
      int count = 0;
      for(int i = 0; i < ttk::MPIsize_; i++) {
        // Exchange the size of verticesSorted.at(i) with rank i
        if(i != ttk::MPIrank_) {
          sendMessageSize[i] = verticesSorted.at(i).size();
          MPI_Isend(&sendMessageSize[i], 1, MPI_SimplexId, i, 0, ttk::MPIcomm_,
                    &sendRequests[count]);
          MPI_Irecv(&recvMessageSize[i], 1, MPI_SimplexId, i, 0, ttk::MPIcomm_,
                    &recvRequests[count]);
          count++;
        }
      }
      std::vector<std::vector<globalOrder::sortedVertex>> recvVerticesSorted(
        ttk::MPIsize_, std::vector<globalOrder::sortedVertex>());
      std::vector<MPI_Request> sendRequestsData(ttk::MPIsize_ - 1);
      std::vector<MPI_Request> recvRequestsData(ttk::MPIsize_ - 1);
      std::vector<MPI_Status> recvStatusData(ttk::MPIsize_ - 1);
      int recvCount = 0;
      int sendCount = 0;
      int r;
      while(sendPerformedCountTotal < ttk::MPIsize_ - 1
            || recvPerformedCountTotal < ttk::MPIsize_ - 1) {
        if(sendPerformedCountTotal < ttk::MPIsize_ - 1) {
          MPI_Waitsome(ttk::MPIsize_ - 1, sendRequests.data(),
                       &sendPerformedCount, sendCompleted.data(),
                       sendStatus.data());
          if(sendPerformedCount > 0) {
            for(int i = 0; i < sendPerformedCount; i++) {
              // The request arrays skip the local rank: a completed index i
              // maps to rank i, or i + 1 when i >= ttk::MPIrank_.
              r = sendCompleted[i];
              if(ttk::MPIrank_ <= sendCompleted[i]) {
                r++;
              }
              if(sendMessageSize[r] > 0) {
                MPI_Isend(verticesSorted.at(r).data(), sendMessageSize[r],
                          MPI_sortedVertexType, r, 1, ttk::MPIcomm_,
                          &sendRequestsData[sendCount]);
                sendCount++;
              }
            }
            sendPerformedCountTotal += sendPerformedCount;
          }
        }
        if(recvPerformedCountTotal < ttk::MPIsize_ - 1) {
          MPI_Waitsome(ttk::MPIsize_ - 1, recvRequests.data(),
                       &recvPerformedCount, recvCompleted.data(),
                       recvStatus.data());
          if(recvPerformedCount > 0) {
            for(int i = 0; i < recvPerformedCount; i++) {
              r = recvStatus[i].MPI_SOURCE;
              if(recvMessageSize[r] > 0) {
                recvVerticesSorted.at(r).resize(recvMessageSize[r]);
                MPI_Irecv(recvVerticesSorted.at(r).data(), recvMessageSize[r],
                          MPI_sortedVertexType, r, 1, ttk::MPIcomm_,
                          &recvRequestsData[recvCount]);
                recvCount++;
              }
            }
            recvPerformedCountTotal += recvPerformedCount;
          }
        }
      }
      recvPerformedCountTotal = 0;
      while(recvPerformedCountTotal < recvCount) {
        MPI_Waitsome(recvCount, recvRequestsData.data(), &recvPerformedCount,
                     recvCompleted.data(), recvStatusData.data());
        if(recvPerformedCount > 0) {
          for(int i = 0; i < recvPerformedCount; i++) {
            r = recvStatusData[i].MPI_SOURCE;
#pragma omp parallel for schedule(static)
            for(int j = 0; j < recvMessageSize[r]; j++) {
              orderArray[triangulation->getVertexLocalId(
                recvVerticesSorted.at(r).at(j).globalId)]
                = recvVerticesSorted.at(r).at(j).order;
            }
          }
          recvPerformedCountTotal += recvPerformedCount;
        }
      }
      MPI_Waitall(sendCount, sendRequestsData.data(), MPI_STATUSES_IGNORE);
      return 0;
    }
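
    /* Communication pattern used by postprocessing() above, in isolation (a
     * sketch for exposition, not compiled): message sizes are exchanged with
     * non-blocking calls (tag 0), and MPI_Waitsome reports whichever of these
     * exchanges have completed so that the corresponding payloads (tag 1) can
     * be posted immediately, without waiting for the slowest peer:
     *
     *   int done{0};
     *   MPI_Waitsome(nRemote, sizeRequests.data(), &done,
     *                completedIdx.data(), statuses.data());
     *   for(int k = 0; k < done; k++) {
     *     // completedIdx[k] indexes the request array (which skips the
     *     // local rank); post MPI_Isend / MPI_Irecv of the payload for
     *     // that peer here.
     *   }
     *
     * nRemote, sizeRequests, completedIdx and statuses are hypothetical
     * stand-ins for ttk::MPIsize_ - 1 and the request/status vectors above.
     */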

#endif // TTK_ENABLE_MPI

    template <typename DT, typename triangulationType>
    int processScalarArray(const triangulationType *triangulation,
                           ttk::SimplexId *orderArray,
                           const DT *scalarArray,
                           const size_t nVerts) const {
      // start global timer
      ttk::Timer globalTimer;

      // print horizontal separator
      this->printMsg(ttk::debug::Separator::L1); // L1 is the '=' separator
      // print input parameters in table format
      this->printMsg({
        {"#Threads", std::to_string(this->threadNumber_)},
        {"#Vertices", std::to_string(nVerts)},
      });

      // ---------------------------------------------------------------------
      // Compute the order array
      // ---------------------------------------------------------------------
#ifdef TTK_ENABLE_MPI
      if(ttk::isRunningWithMPI()) {
        ttk::SimplexId id = 0;
        MPI_Datatype MPI_SimplexId = getMPIType(id);
        // Prepare the vector to sort.
        std::vector<globalOrder::vertexToSort<DT>> verticesToSort;
        verticesToSort.reserve(nVerts);
#ifdef TTK_ENABLE_OPENMP
#pragma omp declare reduction (merge : std::vector<globalOrder::vertexToSort<DT>> : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end()))
#pragma omp parallel for reduction(merge : verticesToSort) schedule(static)
#endif
        for(size_t i = 0; i < nVerts; i++) {
          if(triangulation->getVertexRank(i) == ttk::MPIrank_) {
            verticesToSort.emplace_back(globalOrder::vertexToSort<DT>{
              triangulation->getVertexGlobalId(i), scalarArray[i],
              (ttk::RankId)ttk::MPIrank_});
          }
        }

        MPI_Datatype MPI_vertexToSortType;

        /*
         * WARNING: the struct is sent as an array of char, as experiments
         * show that using MPI's built-in struct management yields poor
         * performance when used with a templated struct.
         */
        MPI_Type_contiguous(sizeof(globalOrder::vertexToSort<DT>), MPI_CHAR,
                            &MPI_vertexToSortType);
        MPI_Type_commit(&MPI_vertexToSortType);

        std::vector<ttk::SimplexId> vertexDistribution(ttk::MPIsize_);
        ttk::SimplexId localVertexNumber = verticesToSort.size();
        MPI_Allgather(&localVertexNumber, 1, MPI_SimplexId,
                      vertexDistribution.data(), 1, MPI_SimplexId,
                      ttk::MPIcomm_);
        // The distributed sort orders by the scalar field first and uses the
        // global id for disambiguation.
        p_sort::parallel_sort<globalOrder::vertexToSort<DT>>(
          verticesToSort, globalOrder::comp<DT>, globalOrder::oppositeComp<DT>,
          vertexDistribution, MPI_vertexToSortType, MPI_SimplexId,
          threadNumber_);

        // Now, the computation of the global order starts, using the sorted
        // vector.

        MPI_Datatype types[] = {MPI_SimplexId, MPI_SimplexId};
        int lengths[] = {1, 1};
        MPI_Datatype MPI_sortedVertexType;
        const MPI_Aint mpi_offsets[]
          = {offsetof(globalOrder::sortedVertex, globalId),
             offsetof(globalOrder::sortedVertex, order)};
        MPI_Type_create_struct(
          2, lengths, mpi_offsets, types, &MPI_sortedVertexType);
        MPI_Type_commit(&MPI_sortedVertexType);

        ttk::SimplexId verticesToSortSize = verticesToSort.size();
        // Compute the order of the first element of the current process
        ttk::SimplexId orderOffset
          = std::accumulate(vertexDistribution.begin(),
                            vertexDistribution.begin() + ttk::MPIrank_,
                            static_cast<ttk::SimplexId>(0));
        // nbChunk, rest and nbChunkTotal are used to perform the
        // post-processing chunk by chunk.
        ttk::SimplexId nbChunk = std::floor(verticesToSortSize / ChunkSize);
        ttk::SimplexId rest = verticesToSortSize % ChunkSize;
        ttk::SimplexId nbChunkTotal{nbChunk};
        if(rest > 0) {
          nbChunkTotal++;
        }
        MPI_Allreduce(MPI_IN_PLACE, &nbChunkTotal, 1, MPI_SimplexId, MPI_MAX,
                      ttk::MPIcomm_);
        ttk::SimplexId beginningChunk = nbChunk;

        // The end of the vector is processed first, so that the corresponding
        // elements of verticesToSort can be resized away as soon as they are
        // no longer needed by the rest of the algorithm.
        if(rest > 0) {
          beginningChunk++;
          postprocessing<DT, triangulationType>(
            triangulation, orderArray, nbChunk * ChunkSize,
            nbChunk * ChunkSize + rest, orderOffset, verticesToSort,
            MPI_sortedVertexType, MPI_SimplexId, nbChunk, nbChunk);
        }
        for(int chunk = nbChunk - 1; chunk >= 0; chunk--) {
          postprocessing<DT, triangulationType>(
            triangulation, orderArray, chunk * ChunkSize,
            (chunk + 1) * ChunkSize, orderOffset, verticesToSort,
            MPI_sortedVertexType, MPI_SimplexId, chunk, nbChunk);
        }

        // In case some processes still have messages to send after the
        // current process has sent all of its own.
        for(int chunk = beginningChunk; chunk < nbChunkTotal; chunk++) {
          postprocessing<DT, triangulationType>(
            triangulation, orderArray, nbChunk * ChunkSize + rest,
            nbChunk * ChunkSize + rest, orderOffset, verticesToSort,
            MPI_sortedVertexType, MPI_SimplexId, chunk, nbChunk);
        }

        // Exchange the order values computed for the ghost vertices.
        ttk::exchangeGhostVertices<ttk::SimplexId, triangulationType>(
          orderArray, triangulation, ttk::MPIcomm_, 1);
      }
#else
      this->printMsg("MPI not enabled!");
      TTK_FORCE_USE(orderArray);
      TTK_FORCE_USE(scalarArray);
      TTK_FORCE_USE(triangulation);
      return 0;
#endif // TTK_ENABLE_MPI

      // ---------------------------------------------------------------------
      // print global performance
      // ---------------------------------------------------------------------
      {
        this->printMsg(ttk::debug::Separator::L2); // horizontal '-' separator
        this->printMsg(
          "Complete", 1, globalTimer.getElapsedTime() // global progress, time
        );
        this->printMsg(ttk::debug::Separator::L1); // horizontal '=' separator
      }

      return 1; // return success
    }

  protected:
    bool GlobalOrder{false};
    // This value has been chosen for systems with 128 GB of memory per
    // compute node. For systems with much less memory, it may be too large
    // and require a smaller value.
    int ChunkSize{1000000000};
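    /* Rough sizing note (an estimate, assuming 64-bit ttk::SimplexId): one
     * globalOrder::sortedVertex entry takes 16 bytes, so a single
     * post-processing round with the default ChunkSize of 10^9 may buffer on
     * the order of 16 GB of outgoing entries per process, plus comparable
     * receive buffers, hence the 128 GB per node assumption above. */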
  }; // ArrayPreconditioning class

} // namespace ttk
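
/* Usage sketch (for exposition only, not part of this header): assuming an
 * already preconditioned triangulation `triangulation` that provides the
 * getVertexRank() / getVertexGlobalId() / getVertexLocalId() queries used
 * above, and a float array `scalars` with one value per local vertex:
 *
 *   ttk::ArrayPreconditioning ap;
 *   ap.preconditionTriangulation(&triangulation);
 *   std::vector<ttk::SimplexId> order(nVerts);
 *   ap.processScalarArray(&triangulation, order.data(), scalars.data(),
 *                         nVerts);
 *
 * In builds without TTK_ENABLE_MPI, processScalarArray() only prints a
 * message and returns, as the #else branch above shows. */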