Rheolef 7.2: an efficient C++ finite element environment
geo_mpi_dual.cc
// This file is part of Rheolef.
#include "rheolef/config.h"
#ifdef _RHEOLEF_HAVE_MPI
// mesh2dual
#include <algorithm> // fill, copy

namespace rheolef {
using namespace std;

static void ikeysort(int total_elems, KeyValueType *pbase);

// -------------------------------------------------------------------------
// This function converts a mesh into a dual graph
// -------------------------------------------------------------------------
void geo_dual (
  my_idxtype *elmdist,
  my_idxtype *eptr,
  vector<my_idxtype>& eind,
  int *ncommonnodes,
  vector<my_idxtype>& xadj,
  vector<my_idxtype>& adjncy,
  const mpi::communicator& comm)
{
  int i, j, jj, k, kk, m;
  int pe, count, mask, pass;
  int lnns, my_nns, node;
  int firstelm, firstnode, lnode, nrecv, nsend;
  my_idxtype maxcount;

  int npes = comm.size();
  int mype = comm.rank();
  srand (mype);

  int nelms = elmdist[mype+1]-elmdist[mype];
  check_macro(nelms >= 0, "unexpected nelms < 0");

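  // 11-bit hash mask: htable is (at least) 2048 entries wide and element
  // ids are hashed as (id & mask) when counting shared nodes in step 9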
  mask = (1<<11)-1;

  // ----------------------------
  // 1) determine number of nodes
  // ----------------------------
  int gminnode = mpi::all_reduce (comm, eind[idxamin(eptr[nelms], eind)], mpi::minimum<int>());
  for (i=0; i<eptr[nelms]; i++)
    eind[i] -= gminnode;

  int gmaxnode = mpi::all_reduce (comm, eind[idxamax(eptr[nelms], eind)], mpi::maximum<int>());
  // ----------------------------------------
  // 2) construct node distribution array
  // ----------------------------------------
  vector<my_idxtype> nodedist (npes+1, 0);
  for (nodedist[0]=0, i=0, j=gmaxnode+1; i<npes; i++) {
    k = j/(npes-i);
    nodedist[i+1] = nodedist[i]+k;
    j -= k;
  }
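  // nodedist[] splits the gmaxnode+1 global nodes as evenly as possible:
  // e.g. 10 nodes on 3 processes gives block sizes 3, 3 and 4,
  // i.e. nodedist = {0, 3, 6, 10}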
  my_nns = nodedist[mype+1]-nodedist[mype];
  firstnode = nodedist[mype];

  vector<KeyValueType> nodelist (eptr[nelms]);
  vector<my_idxtype> auxarray (eptr[nelms]);
  vector<my_idxtype> htable (amax(my_nns, mask+1), -1);
  vector<int> scounts (4*npes+2);
  int *rcounts = scounts.data() + npes;
  int *sdispl = scounts.data() + 2*npes;
  int *rdispl = scounts.data() + 3*npes+1;
  // -----------------------------------------------
  // 3) first find a local numbering of the nodes
  // -----------------------------------------------
  for (i=0; i<nelms; i++) {
    for (j=eptr[i]; j<eptr[i+1]; j++) {
      nodelist[j].key = eind[j];
      nodelist[j].val = j;
      auxarray[j] = i; /* remember the local element ID that uses this node */
    }
  }
  ikeysort(eptr[nelms], nodelist.data());

  for (count=1, i=1; i<eptr[nelms]; i++) {
    if (nodelist[i].key > nodelist[i-1].key)
      count++;
  }

  lnns = count;
  vector<my_idxtype> nmap (lnns);
  // -----------------------------------------------
  // 4) renumber the nodes of the elements array
  // -----------------------------------------------
  count = 1;
  nmap[0] = nodelist[0].key;
  eind[nodelist[0].val] = 0;
  nodelist[0].val = auxarray[nodelist[0].val]; /* store the local element ID */
  for (i=1; i<eptr[nelms]; i++) {
    if (nodelist[i].key > nodelist[i-1].key) {
      nmap[count] = nodelist[i].key;
      count++;
    }
    eind[nodelist[i].val] = count-1;
    nodelist[i].val = auxarray[nodelist[i].val]; /* store the local element ID */
  }
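  // at this point eind[] holds contiguous local node ids in [0, lnns) and
  // nmap[local id] = shifted global id; step 10 uses nmap to undo this
  // renumbering before returning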
  comm.barrier();
  // --------------------------------------------------------
  // 5) perform comms necessary to construct node-element list
  // --------------------------------------------------------
  fill (scounts.begin(), scounts.begin() + npes, 0);
  for (pe=i=0; i<eptr[nelms]; i++) {
    while (nodelist[i].key >= nodedist[pe+1])
      pe++;
    scounts[pe] += 2;
  }
  check_macro(pe < npes, "unexpected pe");

  mpi::all_to_all (comm, scounts.data(), rcounts);

  icopy(npes, scounts.data(), sdispl);
  init_csr_ptr(npes, sdispl);

  icopy(npes, rcounts, rdispl);
  init_csr_ptr(npes, rdispl);
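  // init_csr_ptr turns a length-npes count array into npes+1 CSR offsets
  // (an exclusive prefix sum), e.g. counts {2,4,0} become {0,2,6,6};
  // sdispl/rdispl then index directly into the all-to-all buffers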

  check_macro(sdispl[npes] == eptr[nelms]*2, "unexpected sdispl[]");

  nrecv = rdispl[npes]/2;
  vector<KeyValueType> recvbuffer (amax(1, nrecv));

  // no boost::mpi equivalent:
  MPI_Alltoallv(nodelist.data(), scounts.data(), sdispl, IDX_DATATYPE,
                recvbuffer.data(), rcounts, rdispl, IDX_DATATYPE, comm);
  // --------------------------------------------------------
  // 6) construct global node-element list
  // --------------------------------------------------------
  vector<my_idxtype> gnptr (my_nns+1, 0);
  for (i=0; i<npes; i++) {
    for (j=rdispl[i]/2; j<rdispl[i+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      check_macro(lnode >= 0 && lnode < my_nns, "unexpected lnode range");

      gnptr[lnode]++;
    }
  }
  init_csr_ptr (my_nns, gnptr.begin());

  vector<my_idxtype> gnind (amax(1, gnptr[my_nns]));
  for (pe=0; pe<npes; pe++) {
    firstelm = elmdist[pe];
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      gnind[gnptr[lnode]++] = recvbuffer[j].val+firstelm;
    }
  }
  SHIFTCSR(i, my_nns, gnptr);
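  // classical CSR construction pattern: gnptr[] first holds per-node counts,
  // init_csr_ptr converts them to offsets, the fill loop above uses
  // gnptr[lnode]++ as an insertion cursor (which shifts every offset by one
  // sublist), and SHIFTCSR slides the offsets back into place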
  // --------------------------------------------------------
  // 7) send the node-element info to the relevant processors
  // --------------------------------------------------------
  fill (scounts.begin(), scounts.begin() + npes, 0);

  /* use a hash table to ensure that each node is sent to a proc only once */
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      if (htable[lnode] == -1) {
        scounts[pe] += gnptr[lnode+1]-gnptr[lnode];
        htable[lnode] = 1;
      }
    }

    /* now reset the hash table */
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      htable[lnode] = -1;
    }
  }
  mpi::all_to_all (comm, scounts.data(), rcounts);
  icopy(npes, scounts.data(), sdispl);
  init_csr_ptr(npes, sdispl);

  // --------------------------------------------------------
  // 8) create the send buffer
  // --------------------------------------------------------
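  // each node's element sublist is packed contiguously, with the first
  // element id encoded as -(id+1); the receiver can thus recover sublist
  // boundaries from the sign bit without sending any lengths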
  nsend = sdispl[npes];
  nodelist.clear();
  vector<my_idxtype> sbuffer (amax(1, nsend));
  count = 0;
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      if (htable[lnode] == -1) {
        for (k=gnptr[lnode]; k<gnptr[lnode+1]; k++) {
          if (k == gnptr[lnode])
            sbuffer[count++] = -1*(gnind[k]+1);
          else
            sbuffer[count++] = gnind[k];
        }
        htable[lnode] = 1;
      }
    }
    check_macro(count == sdispl[pe+1], "unexpected count");

    /* now reset the hash table */
    for (j=rdispl[pe]/2; j<rdispl[pe+1]/2; j++) {
      lnode = recvbuffer[j].key-firstnode;
      htable[lnode] = -1;
    }
  }

  icopy(npes, rcounts, rdispl);
  init_csr_ptr(npes, rdispl);

  nrecv = rdispl[npes];
  recvbuffer.clear();
  vector<my_idxtype> rbuffer (amax(1, nrecv));

  // no boost::mpi equivalent:
  MPI_Alltoallv(sbuffer.data(), scounts.data(), sdispl, IDX_DATATYPE,
                rbuffer.data(), rcounts, rdispl, IDX_DATATYPE, comm);

  k = -1;
  vector<my_idxtype> nptr (lnns+1, 0);
  // decode in place: nind aliases rbuffer's storage, reusing the received memory
  my_idxtype *nind = rbuffer.data();
  for (pe=0; pe<npes; pe++) {
    for (j=rdispl[pe]; j<rdispl[pe+1]; j++) {
      if (nind[j] < 0) {
        k++;
        nind[j] = (-1*nind[j])-1;
      }
      nptr[k]++;
    }
  }
  init_csr_ptr(lnns, nptr.begin());
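  // nptr/nind now form the local node-to-element CSR list: for each local
  // node n, nind[nptr[n]..nptr[n+1]) are the global ids of the elements
  // touching it (markers decoded back to plain ids above)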

  check_macro(k+1 == lnns, "unexpected k+1");
  check_macro(nptr[lnns] == nrecv, "unexpected nptr[]");

  xadj.resize(nelms+1);
  fill (xadj.begin(), xadj.end(), 0);
  my_idxtype *myxadj = xadj.data(); // TODO: avoid raw pointers
  fill (htable.begin(), htable.begin() + mask+1, -1);
  firstelm = elmdist[mype];

  // ---------------------------------------------------------------------------
  // 9) Two passes -- in first pass, simply find out the memory requirements
  // ---------------------------------------------------------------------------
  maxcount = 200;
  vector<my_idxtype> ind (maxcount);
  vector<my_idxtype> wgt (maxcount);
  my_idxtype *myadjncy = NULL;
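  // for each element i, wgt[] counts how many nodes i shares with every
  // candidate neighbour kk found through the node-to-element lists; kk is
  // hashed into htable via (kk & mask), with a linear scan of ind[] as the
  // collision fallback; an edge i--kk is emitted when the count reaches
  // *ncommonnodes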

  for (pass=0; pass<2; pass++) {
    for (i=0; i<nelms; i++) {
      for (count=0, j=eptr[i]; j<eptr[i+1]; j++) {
        node = eind[j];

        for (k=nptr[node]; k<nptr[node+1]; k++) {
          if ((kk=nind[k]) == firstelm+i) continue;
          m = htable[(kk&mask)];
          if (m == -1) {
            ind[count] = kk;
            wgt[count] = 1;
            htable[(kk&mask)] = count++;
          } else {
            if (ind[m] == kk) {
              wgt[m]++;
            } else {
              for (jj=0; jj<count; jj++) {
                if (ind[jj] == kk) {
                  wgt[jj]++;
                  break;
                }
              }
              if (jj == count) {
                ind[count] = kk;
                wgt[count++] = 1;
              }
            }
          }
          // adjust the memory
          if (count == maxcount-1) {
            ind.resize (2*maxcount);
            wgt.resize (2*maxcount);
            maxcount *= 2;
          }
        }
      }
      for (j=0; j<count; j++) {
        htable[(ind[j]&mask)] = -1;
        if (wgt[j] >= *ncommonnodes) {
          if (pass == 0)
            myxadj[i]++;
          else
            myadjncy[myxadj[i]++] = ind[j];
        }
      }
    }

    if (pass == 0) {
      init_csr_ptr(nelms, myxadj);
      adjncy.resize (myxadj[nelms]);
      myadjncy = adjncy.data(); // TODO: avoid raw pointers
    }
    else {
      SHIFTCSR(i, nelms, myxadj);
    }
  }
  // ---------------------------------------------------------------------------
  // 10) correctly renumber the elements array
  // ---------------------------------------------------------------------------
  for (i=0; i<eptr[nelms]; i++)
    eind[i] = nmap[eind[i]] + gminnode;
}
// -------------------------------------------------------------------------
// quick sort algorithm
// -------------------------------------------------------------------------
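// Note: this in-place sort follows the classical glibc qsort structure
// (explicit stack of pending partitions, median-of-three pivot, and an
// insertion-sort finish below MAX_THRESH), specialized to KeyValueType
// records compared by their integer key.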

/* Discontinue quicksort algorithm when partition gets below this size.
   This particular magic number was chosen to work best on a Sun 4/260. */
#define MAX_THRESH 20

/* Swap two KeyValueType records via the temporary stmp. */
#define QSSWAP(a, b, stmp) do { stmp = (a); (a) = (b); (b) = stmp; } while (0)

/* Stack node declarations used to store unfulfilled partition obligations. */
typedef struct {
  KeyValueType *lo;
  KeyValueType *hi;
} stack_node;

/* The next 4 #defines implement a very fast in-line stack abstraction. */
#define STACK_SIZE (8 * sizeof(unsigned long int))
#define PUSH(low, high) ((void) ((top->lo = (low)), (top->hi = (high)), ++top))
#define POP(low, high) ((void) (--top, (low = top->lo), (high = top->hi)))
#define STACK_NOT_EMPTY (stack < top)
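/* Since only the larger of the two partitions is pushed and the smaller is
   sorted immediately, the stack depth is bounded by log2(total_elems), so
   8 * sizeof(unsigned long) entries suffice for any addressable array. */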

static
void
ikeysort(int total_elems, KeyValueType *pbase) {
  KeyValueType pivot, stmp;
  if (total_elems == 0)
    /* Avoid lossage with unsigned arithmetic below. */
    return;

  if (total_elems > MAX_THRESH) {
    KeyValueType *lo = pbase;
    KeyValueType *hi = &lo[total_elems - 1];
    stack_node stack[STACK_SIZE]; /* largest size needed for 32-bit int */
    stack_node *top = stack + 1;

    while (STACK_NOT_EMPTY) {
      KeyValueType *left_ptr;
      KeyValueType *right_ptr;
      KeyValueType *mid = lo + ((hi - lo) >> 1);

      /* Median-of-three pivot selection: order *lo, *mid, *hi in place. */
      if (mid->key < lo->key)
        QSSWAP(*mid, *lo, stmp);
      if (hi->key < mid->key)
        QSSWAP(*mid, *hi, stmp);
      else
        goto jump_over;
      if (mid->key < lo->key)
        QSSWAP(*mid, *lo, stmp);

jump_over:;
      pivot = *mid;
      left_ptr = lo + 1;
      right_ptr = hi - 1;

      /* Here's the famous "collapse the walls" section of quicksort.
         Gotta like those tight inner loops! They are the main reason
         that this algorithm runs much faster than others. */
      do {
        while (left_ptr->key < pivot.key)
          left_ptr++;

        while (pivot.key < right_ptr->key)
          right_ptr--;

        if (left_ptr < right_ptr) {
          QSSWAP (*left_ptr, *right_ptr, stmp);
          left_ptr++;
          right_ptr--;
        }
        else if (left_ptr == right_ptr) {
          left_ptr++;
          right_ptr--;
          break;
        }
      } while (left_ptr <= right_ptr);

      /* Set up pointers for next iteration. First determine whether
         left and right partitions are below the threshold size. If so,
         ignore one or both. Otherwise, push the larger partition's
         bounds on the stack and continue sorting the smaller one. */

      if ((size_t) (right_ptr - lo) <= MAX_THRESH) {
        if ((size_t) (hi - left_ptr) <= MAX_THRESH)
          /* Ignore both small partitions. */
          POP (lo, hi);
        else
          /* Ignore small left partition. */
          lo = left_ptr;
      }
      else if ((size_t) (hi - left_ptr) <= MAX_THRESH)
        /* Ignore small right partition. */
        hi = right_ptr;
      else if ((right_ptr - lo) > (hi - left_ptr)) {
        /* Push larger left partition indices. */
        PUSH (lo, right_ptr);
        lo = left_ptr;
      }
      else {
        /* Push larger right partition indices. */
        PUSH (left_ptr, hi);
        hi = right_ptr;
      }
    }
  }
  /* Once the pbase array is partially sorted by quicksort the rest
     is completely sorted using insertion sort, since this is efficient
     for partitions below MAX_THRESH size. pbase points to the beginning
     of the array to sort, and end_ptr points at the very last element in
     the array (*not* one beyond it!). */
  {
    KeyValueType *end_ptr = &pbase[total_elems - 1];
    KeyValueType *tmp_ptr = pbase;
    KeyValueType *thresh = (end_ptr < pbase + MAX_THRESH ? end_ptr : pbase + MAX_THRESH);
    KeyValueType *run_ptr;

    /* Find smallest element in first threshold and place it at the
       array's beginning. This is the smallest array element,
       and the operation speeds up insertion sort's inner loop. */

    for (run_ptr = tmp_ptr + 1; run_ptr <= thresh; run_ptr++)
      if (run_ptr->key < tmp_ptr->key)
        tmp_ptr = run_ptr;

    if (tmp_ptr != pbase)
      QSSWAP(*tmp_ptr, *pbase, stmp);

    /* Insertion sort, running from left-hand-side up to right-hand-side. */
    run_ptr = pbase + 1;
    while (++run_ptr <= end_ptr) {
      tmp_ptr = run_ptr - 1;
      while (run_ptr->key < tmp_ptr->key)
        tmp_ptr--;

      tmp_ptr++;
      if (tmp_ptr != run_ptr) {
        KeyValueType elmnt = *run_ptr;
        KeyValueType *mptr;

        for (mptr=run_ptr; mptr>tmp_ptr; mptr--)
          *mptr = *(mptr-1);
        *mptr = elmnt;
      }
    }
  }
}

} // namespace rheolef
#endif // _RHEOLEF_HAVE_MPI