// The libMesh Finite Element Library.
// Copyright (C) 2002-2025 Benjamin S. Kirk, John W. Peterson, Roy H. Stogner

// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.

// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

#ifndef LIBMESH_THREADS_PTHREAD_H
#define LIBMESH_THREADS_PTHREAD_H

// Do not try to #include this header directly; it is designed to be
// #included directly by threads.h
#ifndef LIBMESH_SQUASH_HEADER_WARNING
# warning "This file is designed to be included through libmesh/threads.h"
#else

#ifdef LIBMESH_HAVE_PTHREAD

// C++ includes
#ifdef LIBMESH_HAVE_CXX11_THREAD
# include <thread>
#endif

#include <pthread.h>
#include <algorithm>
#include <vector>
#include <memory> // std::unique_ptr, std::make_unique

#ifdef __APPLE__
# ifdef __MAC_10_12
#  include <os/lock.h>
# else
#  include <libkern/OSAtomic.h>
# endif
#endif

// Thread-Local-Storage macros
#ifdef LIBMESH_HAVE_CXX11_THREAD
# define LIBMESH_TLS_TYPE(type) thread_local type
# define LIBMESH_TLS_REF(value) (value)
#else // Maybe support gcc __thread eventually?
# define LIBMESH_TLS_TYPE(type) type
# define LIBMESH_TLS_REF(value) (value)
#endif
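
// Illustrative sketch (not part of the upstream header): with C++11
// <thread> support, LIBMESH_TLS_TYPE declares one instance of the
// variable per thread; without it the variable is a single shared
// object.  LIBMESH_TLS_REF is used to access it either way.  The
// variable name below is hypothetical.
//
//   static LIBMESH_TLS_TYPE(unsigned int) n_local_evaluations;
//   ...
//   LIBMESH_TLS_REF(n_local_evaluations)++;  // increments this thread's copy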

namespace libMesh
{

namespace Threads
{


#ifdef LIBMESH_HAVE_CXX11_THREAD
/**
 * Use std::thread when available.
 */
typedef std::thread Thread;

#else

/**
 * Use the non-concurrent placeholder.
 */
typedef NonConcurrentThread Thread;

#endif // LIBMESH_HAVE_CXX11_THREAD
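
// Illustrative sketch (assuming the C++11 branch above, where Thread is
// simply std::thread): a Thread can be launched with any callable and
// must be joined before it is destroyed.
//
//   Threads::Thread worker([]() { /* do some work */ });
//   worker.join();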


/**
 * Spin mutex. Implements mutual exclusion by busy-waiting in user
 * space for the lock to be acquired.
 */
#ifdef __APPLE__
#ifdef __MAC_10_12
class spin_mutex
{
public:
  spin_mutex() { ulock = OS_UNFAIR_LOCK_INIT; }
  ~spin_mutex() = default;

  void lock () { os_unfair_lock_lock(&ulock); }
  void unlock () { os_unfair_lock_unlock(&ulock); }

  class scoped_lock
  {
  public:
    scoped_lock () : smutex(nullptr) {}
    explicit scoped_lock ( spin_mutex & in_smutex ) : smutex(&in_smutex) { smutex->lock(); }

    ~scoped_lock () { release(); }

    void acquire ( spin_mutex & in_smutex ) { smutex = &in_smutex; smutex->lock(); }
    void release () { if (smutex) smutex->unlock(); smutex = nullptr; }

  private:
    spin_mutex * smutex;
  };

private:
  os_unfair_lock ulock;
};
#else
class spin_mutex
{
public:
  spin_mutex() : slock(0) {} // The convention is that the lock being zero is _unlocked_
  ~spin_mutex() = default;

  void lock () { OSSpinLockLock(&slock); }
  void unlock () { OSSpinLockUnlock(&slock); }

  class scoped_lock
  {
  public:
    scoped_lock () : smutex(nullptr) {}
    explicit scoped_lock ( spin_mutex & in_smutex ) : smutex(&in_smutex) { smutex->lock(); }

    ~scoped_lock () { release(); }

    void acquire ( spin_mutex & in_smutex ) { smutex = &in_smutex; smutex->lock(); }
    void release () { if (smutex) smutex->unlock(); smutex = nullptr; }

  private:
    spin_mutex * smutex;
  };

private:
  OSSpinLock slock;
};
#endif
#else
class spin_mutex
{
public:
  // Might want to use PTHREAD_MUTEX_ADAPTIVE_NP on Linux, but it's not available on OSX.
  spin_mutex() { pthread_spin_init(&slock, PTHREAD_PROCESS_PRIVATE); }
  ~spin_mutex() { pthread_spin_destroy(&slock); }

  void lock () { pthread_spin_lock(&slock); }
  void unlock () { pthread_spin_unlock(&slock); }

  class scoped_lock
  {
  public:
    scoped_lock () : smutex(nullptr) {}
    explicit scoped_lock ( spin_mutex & in_smutex ) : smutex(&in_smutex) { smutex->lock(); }

    ~scoped_lock () { release(); }

    void acquire ( spin_mutex & in_smutex ) { smutex = &in_smutex; smutex->lock(); }
    void release () { if (smutex) smutex->unlock(); smutex = nullptr; }

  private:
    spin_mutex * smutex;
  };

private:
  pthread_spinlock_t slock;
};
#endif // __APPLE__
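
// Illustrative sketch of the spin_mutex / scoped_lock pattern defined
// above: the scoped_lock acquires the mutex in its constructor and
// releases it in its destructor, so the lock is dropped even if the
// guarded code returns early or throws.  The mutex name is
// hypothetical.
//
//   static Threads::spin_mutex example_mtx;
//   {
//     Threads::spin_mutex::scoped_lock lock(example_mtx);
//     // ... update shared data ...
//   } // lock released here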



/**
 * Recursive mutex.  A thread that already holds the lock may acquire
 * it again; the lock is only released once the matching number of
 * unlock() calls has been made.
 */
class recursive_mutex
{
public:
  // Might want to use PTHREAD_MUTEX_ADAPTIVE_NP on Linux, but it's not available on OSX.
  recursive_mutex()
  {
    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
    pthread_mutex_init(&mutex, &attr);
  }
  ~recursive_mutex() { pthread_mutex_destroy(&mutex); }

  void lock () { pthread_mutex_lock(&mutex); }
  void unlock () { pthread_mutex_unlock(&mutex); }

  class scoped_lock
  {
  public:
    scoped_lock () : rmutex(nullptr) {}
    explicit scoped_lock ( recursive_mutex & in_rmutex ) : rmutex(&in_rmutex) { rmutex->lock(); }

    ~scoped_lock () { release(); }

    void acquire ( recursive_mutex & in_rmutex ) { rmutex = &in_rmutex; rmutex->lock(); }
    void release () { if (rmutex) rmutex->unlock(); rmutex = nullptr; }

  private:
    recursive_mutex * rmutex;
  };

private:
  pthread_mutex_t mutex;
  pthread_mutexattr_t attr;
};

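// Illustrative sketch: unlike spin_mutex, the recursive_mutex defined
// above may be re-acquired by the thread that already holds it, which
// is useful when a locked routine calls another routine that takes the
// same lock.  The names below are hypothetical.
//
//   static Threads::recursive_mutex example_rmtx;
//
//   void inner()
//   {
//     Threads::recursive_mutex::scoped_lock lock(example_rmtx);
//     // ...
//   }
//
//   void outer()
//   {
//     Threads::recursive_mutex::scoped_lock lock(example_rmtx);
//     inner();  // re-locking from the same thread does not deadlock
//   }
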
/**
 * Returns the number of pthreads to use for the given range: the
 * smaller of libMesh::n_threads() and the range size, but at least 1.
 */
template <typename Range>
unsigned int num_pthreads(Range & range)
{
  std::size_t min = std::min((std::size_t)libMesh::n_threads(), range.size());
  return min > 0 ? cast_int<unsigned int>(min) : 1;
}

/**
 * Bundles a range with the body that should process it, so that both
 * can be passed to a pthread through a single void * argument.
 */
template <typename Range, typename Body>
class RangeBody
{
public:
  Range * range;
  Body * body;
};

/**
 * Thread entry point with the void * (*)(void *) signature required by
 * pthread_create(); unpacks the RangeBody argument and runs the body on
 * its range.
 */
template <typename Range, typename Body>
void * run_body(void * args)
{
  RangeBody<Range, Body> * range_body = (RangeBody<Range, Body> *)args;

  Body & body = *range_body->body;
  Range & range = *range_body->range;

  body(range);

  return nullptr;
}

/**
 * Scheduler to manage threads.
 */
class task_scheduler_init
{
public:
  static const int automatic = -1;
  explicit task_scheduler_init (int = automatic) {}
  void initialize (int = automatic) {}
  void terminate () {}
};

//-------------------------------------------------------------------
/**
 * Dummy "splitting object" used to distinguish splitting constructors
 * from copy constructors.
 */
class split {};




//-------------------------------------------------------------------
/**
 * Execute the provided function object in parallel on the specified
 * range.
 */
template <typename Range, typename Body>
inline
void parallel_for (const Range & range, const Body & body)
{
  Threads::BoolAcquire b(Threads::in_threads);

  // If we're running in serial - just run!
  if (libMesh::n_threads() == 1)
    {
      body(range);
      return;
    }

  DisablePerfLogInScope disable_perf;

  unsigned int n_threads = num_pthreads(range);

  std::vector<std::unique_ptr<Range>> ranges(n_threads);
  std::vector<RangeBody<const Range, const Body>> range_bodies(n_threads);
  std::vector<pthread_t> threads(n_threads);

  // Create the ranges for each thread
  std::size_t range_size = range.size() / n_threads;

  typename Range::const_iterator current_beginning = range.begin();

  for (unsigned int i=0; i<n_threads; i++)
    {
      std::size_t this_range_size = range_size;

      if (i+1 == n_threads)
        this_range_size += range.size() % n_threads; // Give the last one the remaining work to do

      ranges[i] = std::make_unique<Range>(range, current_beginning, current_beginning + this_range_size);

      current_beginning = current_beginning + this_range_size;
    }

  // Create the RangeBody arguments
  for (unsigned int i=0; i<n_threads; i++)
    {
      range_bodies[i].range = ranges[i].get();
      range_bodies[i].body = &body;
    }

  // Create the threads.  It may seem redundant to wrap a pragma in
  // #ifdefs... but GCC warns about an "unknown pragma" if it
  // encounters this line of code when -fopenmp is not passed to the
  // compiler.
#ifdef LIBMESH_HAVE_OPENMP
#pragma omp parallel for schedule (static)
#endif
  for (int i=0; i<static_cast<int>(n_threads); i++)
    {
#if !LIBMESH_HAVE_OPENMP
      pthread_create(&threads[i], nullptr, &run_body<Range, Body>, (void *)&range_bodies[i]);
#else
      run_body<Range, Body>((void *)&range_bodies[i]);
#endif
    }

#if !LIBMESH_HAVE_OPENMP
  // Wait for them to finish.
  //
  // The use of 'int' instead of unsigned for the iteration variable in
  // the loop above is deliberate: it is an OpenMP loop when OpenMP is
  // enabled, and some older compilers warn when you don't use int for
  // the loop index.  The reason has to do with signed vs. unsigned
  // integer overflow behavior and optimization.
  // http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html
  for (int i=0; i<static_cast<int>(n_threads); i++)
    pthread_join(threads[i], nullptr);
#endif
}

/**
 * Execute the provided function object in parallel on the specified
 * range with the specified partitioner.
 */
template <typename Range, typename Body, typename Partitioner>
inline
void parallel_for (const Range & range, const Body & body, const Partitioner &)
{
  parallel_for (range, body);
}
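
// Illustrative usage sketch for parallel_for (the ExampleRange and
// ExampleBody types are hypothetical stand-ins for range/body classes
// such as those built on libMesh's StoredRange): the body only needs a
// const operator() taking the range, since each thread works on its own
// subrange independently.
//
//   struct ExampleBody
//   {
//     void operator()(const ExampleRange & r) const
//     {
//       for (const auto & item : r)
//         { /* process item */ }
//     }
//   };
//
//   Threads::parallel_for(ExampleRange(/*...*/), ExampleBody());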

/**
 * Execute the provided reduction operation in parallel on the specified
 * range.
 */
template <typename Range, typename Body>
inline
void parallel_reduce (const Range & range, Body & body)
{
  Threads::BoolAcquire b(Threads::in_threads);

  // If we're running in serial - just run!
  if (libMesh::n_threads() == 1)
    {
      body(range);
      return;
    }

  DisablePerfLogInScope disable_perf;

  unsigned int n_threads = num_pthreads(range);

  std::vector<std::unique_ptr<Range>> ranges(n_threads);
  std::vector<std::unique_ptr<Body>> managed_bodies(n_threads); // bodies we are responsible for
  std::vector<Body *> bodies(n_threads); // dumb pointers to managed_bodies
  std::vector<RangeBody<Range, Body>> range_bodies(n_threads);

  // Create n_threads-1 copies of "body".  We manage the lifetime of
  // these copies with std::unique_ptrs.
  for (unsigned int i=1; i<n_threads; i++)
    managed_bodies[i] = std::make_unique<Body>(body, Threads::split());

  // Set up the "bodies" vector.  Use the passed in body for the first
  // one, point to managed_bodies entries for the others.
  bodies[0] = &body;
  for (unsigned int i=1; i<n_threads; i++)
    bodies[i] = managed_bodies[i].get();

  // Create the ranges for each thread
  std::size_t range_size = range.size() / n_threads;

  typename Range::const_iterator current_beginning = range.begin();

  for (unsigned int i=0; i<n_threads; i++)
    {
      std::size_t this_range_size = range_size;

      if (i+1 == n_threads)
        this_range_size += range.size() % n_threads; // Give the last one the remaining work to do

      ranges[i] = std::make_unique<Range>(range, current_beginning, current_beginning + this_range_size);

      current_beginning = current_beginning + this_range_size;
    }

  // Create the RangeBody arguments
  for (unsigned int i=0; i<n_threads; i++)
    {
      range_bodies[i].range = ranges[i].get();
      range_bodies[i].body = bodies[i];
    }

  // Create the threads
  std::vector<pthread_t> threads(n_threads);

  // It may seem redundant to wrap a pragma in #ifdefs... but GCC
  // warns about an "unknown pragma" if it encounters this line of
  // code when -fopenmp is not passed to the compiler.
#ifdef LIBMESH_HAVE_OPENMP
#pragma omp parallel for schedule (static)
#endif
  // The use of 'int' instead of unsigned for the iteration variable
  // is deliberate here.  This is an OpenMP loop, and some older
  // compilers warn when you don't use int for the loop index.  The
  // reason has to do with signed vs. unsigned integer overflow
  // behavior and optimization.
  // http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html
  for (int i=0; i<static_cast<int>(n_threads); i++)
    {
#if !LIBMESH_HAVE_OPENMP
      pthread_create(&threads[i], nullptr, &run_body<Range, Body>, (void *)&range_bodies[i]);
#else
      run_body<Range, Body>((void *)&range_bodies[i]);
#endif
    }

#if !LIBMESH_HAVE_OPENMP
  // Wait for them to finish
  for (unsigned int i=0; i<n_threads; i++)
    pthread_join(threads[i], nullptr);
#endif

  // Join them all down to the original Body
  for (unsigned int i=n_threads-1; i != 0; i--)
    bodies[i-1]->join(*bodies[i]);
}

/**
 * Execute the provided reduction operation in parallel on the specified
 * range with the specified partitioner.
 */
template <typename Range, typename Body, typename Partitioner>
inline
void parallel_reduce (const Range & range, Body & body, const Partitioner &)
{
  parallel_reduce(range, body);
}
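
// Illustrative usage sketch for parallel_reduce (the ExampleRange and
// ExampleSum types are hypothetical): in addition to operator(), a
// reduction body needs a "splitting" constructor, used above to create
// one copy per extra thread, and a join() member that folds a copy's
// partial result back into the original body.
//
//   struct ExampleSum
//   {
//     double sum = 0;
//
//     ExampleSum() = default;
//     ExampleSum(ExampleSum &, Threads::split) {}  // start a fresh partial sum
//
//     void operator()(const ExampleRange & r)
//     {
//       for (const auto & x : r)
//         sum += x;
//     }
//
//     void join(const ExampleSum & other) { sum += other.sum; }
//   };
//
//   ExampleSum body;
//   Threads::parallel_reduce(ExampleRange(/*...*/), body);
//   // body.sum now holds the combined result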


/**
 * Wrapper class for atomic operations on a value of type T.  Each
 * operation is serialized through an internal spin_mutex, so only one
 * thread at a time can update the value.
 */
template <typename T>
class atomic
{
public:
  atomic () : val(0) {}
  operator T () { return val; }

  T operator=( T value )
  {
    spin_mutex::scoped_lock lock(smutex);
    val = value;
    return val;
  }

  atomic<T> & operator=( const atomic<T> & value )
  {
    spin_mutex::scoped_lock lock(smutex);
    val = value;
    return *this;
  }


  T operator+=(T value)
  {
    spin_mutex::scoped_lock lock(smutex);
    val += value;
    return val;
  }

  T operator-=(T value)
  {
    spin_mutex::scoped_lock lock(smutex);
    val -= value;
    return val;
  }

  T operator++()
  {
    spin_mutex::scoped_lock lock(smutex);
    val++;
    return val;
  }

  T operator++(int)
  {
    spin_mutex::scoped_lock lock(smutex);
    val++;
    return val;
  }

  T operator--()
  {
    spin_mutex::scoped_lock lock(smutex);
    val--;
    return val;
  }

  T operator--(int)
  {
    spin_mutex::scoped_lock lock(smutex);
    val--;
    return val;
  }

private:
  T val;
  spin_mutex smutex;
};
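
// Illustrative sketch: Threads::atomic<T> serializes each update through
// its internal spin_mutex, so concurrent increments from different
// threads do not race.  The counter name is hypothetical.
//
//   static Threads::atomic<unsigned int> n_items_processed;
//   ...
//   ++n_items_processed;  // safe to call from multiple threads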

} // namespace Threads

} // namespace libMesh

#endif // #ifdef LIBMESH_HAVE_PTHREAD

#endif // LIBMESH_SQUASH_HEADER_WARNING

#endif // LIBMESH_THREADS_PTHREAD_H
|