1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
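// Note on the ngo_* helpers above: on KMP_MIC with USE_NGO_STORES, a full
// cache line of ICVs is loaded into a 512-bit register (ngo_load) and written
// out with non-globally-ordered streaming stores (_mm512_storenrngo_pd), with
// ngo_sync() serving as the fence that makes those stores visible; on every
// other target the macros degrade to plain copy_icvs()/KMP_MEMCPY() and a
// no-op sync.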
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
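// This file implements the gather (arrive) and release phases for the barrier
// patterns the runtime can select per barrier type: linear, tree, hypercube
// ("hyper"), and hierarchical. Each pattern provides a *_gather routine, in
// which arrival is signalled through b_arrived flags and the optional
// reduction callback is applied on the way up, and a *_release routine, in
// which workers are woken through their b_go flags and ICVs are propagated
// when requested.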
43 
44 // Linear Barrier
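// In the linear pattern every worker posts its arrival directly to the
// master: a worker bumps its own b_arrived flag (waking the master if it is
// sleeping on it), while the master loops over all other threads, waits on
// each b_arrived in turn, and folds in reduce_data whenever a reduction
// callback is supplied. The gather is O(nproc) serial work on the master, so
// this pattern is mainly attractive for small teams.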
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50  kmp_team_t *team = this_thr->th.th_team;
51  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52  kmp_info_t **other_threads = team->t.t_threads;
53 
54  KA_TRACE(
55  20,
56  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57  gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64  __itt_get_timestamp();
65  }
66 #endif
67  // We now perform a linear reduction to signal that all of the threads have
68  // arrived.
69  if (!KMP_MASTER_TID(tid)) {
70  KA_TRACE(20,
71  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72  "arrived(%p): %llu => %llu\n",
73  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76  // Mark arrival to master thread
77  /* After performing this write, a worker thread may not assume that the team
78  is valid any more - it could be deallocated by the master thread at any
79  time. */
80  ANNOTATE_BARRIER_BEGIN(this_thr);
81  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
82  flag.release();
83  } else {
84  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85  int nproc = this_thr->th.th_team_nproc;
86  int i;
87  // Don't have to worry about the sleep bit or atomics here, since this is a team-level setting
88  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89 
90  // Collect all the worker team member threads.
91  for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93  // Prefetch next thread's arrived count
94  if (i + 1 < nproc)
95  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98  "arrived(%p) == %llu\n",
99  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100  team->t.t_id, i,
101  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103  // Wait for worker thread to arrive
104  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
105  new_state);
106  if (cancellable) {
107  bool cancelled = flag.wait_cancellable_nosleep(
108  this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109  if (cancelled)
110  return true;
111  } else {
112  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113  }
114  ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116  // Barrier imbalance - write min of the thread time and the other thread
117  // time to the thread.
118  if (__kmp_forkjoin_frames_mode == 2) {
119  this_thr->th.th_bar_min_time = KMP_MIN(
120  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121  }
122 #endif
123  if (reduce) {
124  KA_TRACE(100,
125  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127  team->t.t_id, i));
128  ANNOTATE_REDUCE_AFTER(reduce);
129  OMPT_REDUCTION_DECL(this_thr, gtid);
130  OMPT_REDUCTION_BEGIN;
131  (*reduce)(this_thr->th.th_local.reduce_data,
132  other_threads[i]->th.th_local.reduce_data);
133  OMPT_REDUCTION_END;
134  ANNOTATE_REDUCE_BEFORE(reduce);
135  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136  }
137  }
138  // Don't have to worry about the sleep bit or atomics here, since this is a team-level setting
139  team_bar->b_arrived = new_state;
140  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141  "arrived(%p) = %llu\n",
142  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143  new_state));
144  }
145  KA_TRACE(
146  20,
147  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148  gtid, team->t.t_id, tid, bt));
149  return false;
150 }
151 
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158  kmp_team_t *team;
159 
160  if (KMP_MASTER_TID(tid)) {
161  unsigned int i;
162  kmp_uint32 nproc = this_thr->th.th_team_nproc;
163  kmp_info_t **other_threads;
164 
165  team = __kmp_threads[gtid]->th.th_team;
166  KMP_DEBUG_ASSERT(team != NULL);
167  other_threads = team->t.t_threads;
168 
169  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170  "barrier type %d\n",
171  gtid, team->t.t_id, tid, bt));
172 
173  if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175  {
176  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177  if (propagate_icvs) {
178  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179  for (i = 1; i < nproc; ++i) {
180  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181  team, i, FALSE);
182  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183  &team->t.t_implicit_task_taskdata[0].td_icvs);
184  }
185  ngo_sync();
186  }
187  }
188 #endif // KMP_BARRIER_ICV_PUSH
189 
190  // Now, release all of the worker threads
191  for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193  // Prefetch next thread's go flag
194  if (i + 1 < nproc)
195  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197  KA_TRACE(
198  20,
199  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200  "go(%p): %u => %u\n",
201  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203  other_threads[i]->th.th_bar[bt].bb.b_go,
204  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207  other_threads[i]);
208  flag.release();
209  }
210  }
211  } else { // Wait for the MASTER thread to release us
212  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215  if (cancellable) {
216  bool cancelled = flag.wait_cancellable_nosleep(
217  this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
218  if (cancelled) {
219  return true;
220  }
221  } else {
222  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223  }
224  ANNOTATE_BARRIER_END(this_thr);
225 #if USE_ITT_BUILD && USE_ITT_NOTIFY
226  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
228  // disabled)
229  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230  // Cancel wait on previous parallel region...
231  __kmp_itt_task_starting(itt_sync_obj);
232 
233  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234  return false;
235 
236  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
237  if (itt_sync_obj != NULL)
238  // Call prepare as early as possible for "new" barrier
239  __kmp_itt_task_finished(itt_sync_obj);
240  } else
241 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
242  // Early exit for reaping threads releasing forkjoin barrier
243  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
244  return false;
245 // The worker thread may now assume that the team is valid.
246 #ifdef KMP_DEBUG
247  tid = __kmp_tid_from_gtid(gtid);
248  team = __kmp_threads[gtid]->th.th_team;
249 #endif
250  KMP_DEBUG_ASSERT(team != NULL);
251  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252  KA_TRACE(20,
253  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
254  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
255  KMP_MB(); // Flush all pending memory write invalidates.
256  }
257  KA_TRACE(
258  20,
259  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
260  gtid, team->t.t_id, tid, bt));
261  return false;
262 }
263 
264 static void __kmp_linear_barrier_gather(
265  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267  __kmp_linear_barrier_gather_template<false>(
268  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269 }
270 
271 static bool __kmp_linear_barrier_gather_cancellable(
272  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274  return __kmp_linear_barrier_gather_template<true>(
275  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276 }
277 
278 static void __kmp_linear_barrier_release(
279  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281  __kmp_linear_barrier_release_template<false>(
282  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283 }
284 
285 static bool __kmp_linear_barrier_release_cancellable(
286  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288  return __kmp_linear_barrier_release_template<true>(
289  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290 }
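// The linear gather/release bodies are templated on 'cancellable' so that
// the cancellation checks compile away in the common, non-cancellable path;
// the four wrappers above simply pin the template argument to false or true
// to give callers plain, non-template entry points.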
291 
292 // Tree barrier
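// In the tree pattern the team forms an implicit tree with branching factor
// 1 << branch_bits: thread tid gathers from children (tid << branch_bits) + 1,
// + 2, ... (up to branch_factor of them), then reports to its parent
// (tid - 1) >> branch_bits. The release phase walks the same tree top-down,
// each parent freeing its children through their b_go flags and pushing ICVs
// when propagate_icvs is set.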
293 static void
294 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
295  int tid, void (*reduce)(void *, void *)
296  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298  kmp_team_t *team = this_thr->th.th_team;
299  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300  kmp_info_t **other_threads = team->t.t_threads;
301  kmp_uint32 nproc = this_thr->th.th_team_nproc;
302  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303  kmp_uint32 branch_factor = 1 << branch_bits;
304  kmp_uint32 child;
305  kmp_uint32 child_tid;
306  kmp_uint64 new_state;
307 
308  KA_TRACE(
309  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310  gtid, team->t.t_id, tid, bt));
311  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312 
313 #if USE_ITT_BUILD && USE_ITT_NOTIFY
314  // Barrier imbalance - save arrive time to the thread
315  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317  __itt_get_timestamp();
318  }
319 #endif
320  // Perform tree gather to wait until all threads have arrived; reduce any
321  // required data as we go
322  child_tid = (tid << branch_bits) + 1;
323  if (child_tid < nproc) {
324  // Parent threads wait for all their children to arrive
325  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326  child = 1;
327  do {
328  kmp_info_t *child_thr = other_threads[child_tid];
329  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
330 #if KMP_CACHE_MANAGE
331  // Prefetch next thread's arrived count
332  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333  KMP_CACHE_PREFETCH(
334  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335 #endif /* KMP_CACHE_MANAGE */
336  KA_TRACE(20,
337  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338  "arrived(%p) == %llu\n",
339  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
341  // Wait for child to arrive
342  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344  ANNOTATE_BARRIER_END(child_thr);
345 #if USE_ITT_BUILD && USE_ITT_NOTIFY
346  // Barrier imbalance - write min of the thread time and a child time to
347  // the thread.
348  if (__kmp_forkjoin_frames_mode == 2) {
349  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350  child_thr->th.th_bar_min_time);
351  }
352 #endif
353  if (reduce) {
354  KA_TRACE(100,
355  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357  team->t.t_id, child_tid));
358  ANNOTATE_REDUCE_AFTER(reduce);
359  OMPT_REDUCTION_DECL(this_thr, gtid);
360  OMPT_REDUCTION_BEGIN;
361  (*reduce)(this_thr->th.th_local.reduce_data,
362  child_thr->th.th_local.reduce_data);
363  OMPT_REDUCTION_END;
364  ANNOTATE_REDUCE_BEFORE(reduce);
365  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
366  }
367  child++;
368  child_tid++;
369  } while (child <= branch_factor && child_tid < nproc);
370  }
371 
372  if (!KMP_MASTER_TID(tid)) { // Worker threads
373  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374 
375  KA_TRACE(20,
376  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377  "arrived(%p): %llu => %llu\n",
378  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
381 
382  // Mark arrival to parent thread
383  /* After performing this write, a worker thread may not assume that the team
384  is valid any more - it could be deallocated by the master thread at any
385  time. */
386  ANNOTATE_BARRIER_BEGIN(this_thr);
387  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
388  flag.release();
389  } else {
390  // Need to update the team arrived pointer if we are the master thread
391  if (nproc > 1) // New value was already computed above
392  team->t.t_bar[bt].b_arrived = new_state;
393  else
394  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396  "arrived(%p) = %llu\n",
397  gtid, team->t.t_id, tid, team->t.t_id,
398  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399  }
400  KA_TRACE(20,
401  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402  gtid, team->t.t_id, tid, bt));
403 }
404 
405 static void __kmp_tree_barrier_release(
406  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
407  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
408  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
409  kmp_team_t *team;
410  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
411  kmp_uint32 nproc;
412  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
413  kmp_uint32 branch_factor = 1 << branch_bits;
414  kmp_uint32 child;
415  kmp_uint32 child_tid;
416 
417  // Perform a tree release for all of the threads that have been gathered
418  if (!KMP_MASTER_TID(
419  tid)) { // Handle fork barrier workers who aren't part of a team yet
420  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
421  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
422  // Wait for parent thread to release us
423  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
424  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
425  ANNOTATE_BARRIER_END(this_thr);
426 #if USE_ITT_BUILD && USE_ITT_NOTIFY
427  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428  // In fork barrier where we could not get the object reliably (or
429  // ITTNOTIFY is disabled)
430  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
431  // Cancel wait on previous parallel region...
432  __kmp_itt_task_starting(itt_sync_obj);
433 
434  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435  return;
436 
437  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438  if (itt_sync_obj != NULL)
439  // Call prepare as early as possible for "new" barrier
440  __kmp_itt_task_finished(itt_sync_obj);
441  } else
442 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
443  // Early exit for reaping threads releasing forkjoin barrier
444  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445  return;
446 
447  // The worker thread may now assume that the team is valid.
448  team = __kmp_threads[gtid]->th.th_team;
449  KMP_DEBUG_ASSERT(team != NULL);
450  tid = __kmp_tid_from_gtid(gtid);
451 
452  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
453  KA_TRACE(20,
454  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
456  KMP_MB(); // Flush all pending memory write invalidates.
457  } else {
458  team = __kmp_threads[gtid]->th.th_team;
459  KMP_DEBUG_ASSERT(team != NULL);
460  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
461  "barrier type %d\n",
462  gtid, team->t.t_id, tid, bt));
463  }
464  nproc = this_thr->th.th_team_nproc;
465  child_tid = (tid << branch_bits) + 1;
466 
467  if (child_tid < nproc) {
468  kmp_info_t **other_threads = team->t.t_threads;
469  child = 1;
470  // Parent threads release all their children
471  do {
472  kmp_info_t *child_thr = other_threads[child_tid];
473  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474 #if KMP_CACHE_MANAGE
475  // Prefetch next thread's go count
476  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
477  KMP_CACHE_PREFETCH(
478  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
479 #endif /* KMP_CACHE_MANAGE */
480 
481 #if KMP_BARRIER_ICV_PUSH
482  {
483  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
484  if (propagate_icvs) {
485  __kmp_init_implicit_task(team->t.t_ident,
486  team->t.t_threads[child_tid], team,
487  child_tid, FALSE);
488  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489  &team->t.t_implicit_task_taskdata[0].td_icvs);
490  }
491  }
492 #endif // KMP_BARRIER_ICV_PUSH
493  KA_TRACE(20,
494  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495  "go(%p): %u => %u\n",
496  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
499  // Release child from barrier
500  ANNOTATE_BARRIER_BEGIN(child_thr);
501  kmp_flag_64 flag(&child_bar->b_go, child_thr);
502  flag.release();
503  child++;
504  child_tid++;
505  } while (child <= branch_factor && child_tid < nproc);
506  }
507  KA_TRACE(
508  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509  gtid, team->t.t_id, tid, bt));
510 }
511 
512 // Hyper Barrier
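// The hyper pattern embeds the gather tree in a hypercube: at level L a
// thread whose bits (tid >> L) & (branch_factor - 1) are non-zero signals the
// parent obtained by clearing its low level + branch_bits bits and stops,
// while the remaining threads gather up to branch_factor - 1 children spaced
// 1 << L apart before moving up a level.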
513 static void
514 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
515  int tid, void (*reduce)(void *, void *)
516  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518  kmp_team_t *team = this_thr->th.th_team;
519  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520  kmp_info_t **other_threads = team->t.t_threads;
521  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524  kmp_uint32 branch_factor = 1 << branch_bits;
525  kmp_uint32 offset;
526  kmp_uint32 level;
527 
528  KA_TRACE(
529  20,
530  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531  gtid, team->t.t_id, tid, bt));
532  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
533 
534 #if USE_ITT_BUILD && USE_ITT_NOTIFY
535  // Barrier imbalance - save arrive time to the thread
536  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538  __itt_get_timestamp();
539  }
540 #endif
541  /* Perform a hypercube-embedded tree gather to wait until all of the threads
542  have arrived, and reduce any required data as we go. */
543  kmp_flag_64 p_flag(&thr_bar->b_arrived);
544  for (level = 0, offset = 1; offset < num_threads;
545  level += branch_bits, offset <<= branch_bits) {
546  kmp_uint32 child;
547  kmp_uint32 child_tid;
548 
549  if (((tid >> level) & (branch_factor - 1)) != 0) {
550  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
551 
552  KMP_MB(); // Synchronize parent and child threads.
553  KA_TRACE(20,
554  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
555  "arrived(%p): %llu => %llu\n",
556  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
557  team->t.t_id, parent_tid, &thr_bar->b_arrived,
558  thr_bar->b_arrived,
559  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
560  // Mark arrival to parent thread
561  /* After performing this write (in the last iteration of the enclosing for
562  loop), a worker thread may not assume that the team is valid any more
563  - it could be deallocated by the master thread at any time. */
564  ANNOTATE_BARRIER_BEGIN(this_thr);
565  p_flag.set_waiter(other_threads[parent_tid]);
566  p_flag.release();
567  break;
568  }
569 
570  // Parent threads wait for children to arrive
571  if (new_state == KMP_BARRIER_UNUSED_STATE)
572  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
573  for (child = 1, child_tid = tid + (1 << level);
574  child < branch_factor && child_tid < num_threads;
575  child++, child_tid += (1 << level)) {
576  kmp_info_t *child_thr = other_threads[child_tid];
577  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
578 #if KMP_CACHE_MANAGE
579  kmp_uint32 next_child_tid = child_tid + (1 << level);
580  // Prefetch next thread's arrived count
581  if (child + 1 < branch_factor && next_child_tid < num_threads)
582  KMP_CACHE_PREFETCH(
583  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
584 #endif /* KMP_CACHE_MANAGE */
585  KA_TRACE(20,
586  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
587  "arrived(%p) == %llu\n",
588  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
589  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
590  // Wait for child to arrive
591  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
592  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
593  ANNOTATE_BARRIER_END(child_thr);
594  KMP_MB(); // Synchronize parent and child threads.
595 #if USE_ITT_BUILD && USE_ITT_NOTIFY
596  // Barrier imbalance - write min of the thread time and a child time to
597  // the thread.
598  if (__kmp_forkjoin_frames_mode == 2) {
599  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
600  child_thr->th.th_bar_min_time);
601  }
602 #endif
603  if (reduce) {
604  KA_TRACE(100,
605  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
606  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
607  team->t.t_id, child_tid));
608  ANNOTATE_REDUCE_AFTER(reduce);
609  OMPT_REDUCTION_DECL(this_thr, gtid);
610  OMPT_REDUCTION_BEGIN;
611  (*reduce)(this_thr->th.th_local.reduce_data,
612  child_thr->th.th_local.reduce_data);
613  OMPT_REDUCTION_END;
614  ANNOTATE_REDUCE_BEFORE(reduce);
615  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
616  }
617  }
618  }
619 
620  if (KMP_MASTER_TID(tid)) {
621  // Need to update the team arrived pointer if we are the master thread
622  if (new_state == KMP_BARRIER_UNUSED_STATE)
623  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
624  else
625  team->t.t_bar[bt].b_arrived = new_state;
626  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
627  "arrived(%p) = %llu\n",
628  gtid, team->t.t_id, tid, team->t.t_id,
629  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
630  }
631  KA_TRACE(
632  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
633  gtid, team->t.t_id, tid, bt));
634 }
635 
636 // The reverse versions seem to beat the forward versions overall
637 #define KMP_REVERSE_HYPER_BAR
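// (A plausible reason, not stated in the code: in the reverse order the
// children released first are the ones with the largest subtrees below them,
// so their own release fan-out proceeds in parallel with the rest of this
// thread's loop. The comment above only records the empirical result.)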
638 static void __kmp_hyper_barrier_release(
639  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
640  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
641  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
642  kmp_team_t *team;
643  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
644  kmp_info_t **other_threads;
645  kmp_uint32 num_threads;
646  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
647  kmp_uint32 branch_factor = 1 << branch_bits;
648  kmp_uint32 child;
649  kmp_uint32 child_tid;
650  kmp_uint32 offset;
651  kmp_uint32 level;
652 
653  /* Perform a hypercube-embedded tree release for all of the threads that have
654  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
655  are released in the reverse order of the corresponding gather, otherwise
656  threads are released in the same order. */
657  if (KMP_MASTER_TID(tid)) { // master
658  team = __kmp_threads[gtid]->th.th_team;
659  KMP_DEBUG_ASSERT(team != NULL);
660  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
661  "barrier type %d\n",
662  gtid, team->t.t_id, tid, bt));
663 #if KMP_BARRIER_ICV_PUSH
664  if (propagate_icvs) { // master already has ICVs in final destination; copy
665  copy_icvs(&thr_bar->th_fixed_icvs,
666  &team->t.t_implicit_task_taskdata[tid].td_icvs);
667  }
668 #endif
669  } else { // Handle fork barrier workers who aren't part of a team yet
670  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
671  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
672  // Wait for parent thread to release us
673  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
674  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
675  ANNOTATE_BARRIER_END(this_thr);
676 #if USE_ITT_BUILD && USE_ITT_NOTIFY
677  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
678  // In fork barrier where we could not get the object reliably
679  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
680  // Cancel wait on previous parallel region...
681  __kmp_itt_task_starting(itt_sync_obj);
682 
683  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
684  return;
685 
686  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
687  if (itt_sync_obj != NULL)
688  // Call prepare as early as possible for "new" barrier
689  __kmp_itt_task_finished(itt_sync_obj);
690  } else
691 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
692  // Early exit for reaping threads releasing forkjoin barrier
693  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
694  return;
695 
696  // The worker thread may now assume that the team is valid.
697  team = __kmp_threads[gtid]->th.th_team;
698  KMP_DEBUG_ASSERT(team != NULL);
699  tid = __kmp_tid_from_gtid(gtid);
700 
701  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
702  KA_TRACE(20,
703  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
704  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
705  KMP_MB(); // Flush all pending memory write invalidates.
706  }
707  num_threads = this_thr->th.th_team_nproc;
708  other_threads = team->t.t_threads;
709 
710 #ifdef KMP_REVERSE_HYPER_BAR
711  // Count up to correct level for parent
712  for (level = 0, offset = 1;
713  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
714  level += branch_bits, offset <<= branch_bits)
715  ;
716 
717  // Now go down from there
718  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
719  level -= branch_bits, offset >>= branch_bits)
720 #else
721  // Go down the tree, level by level
722  for (level = 0, offset = 1; offset < num_threads;
723  level += branch_bits, offset <<= branch_bits)
724 #endif // KMP_REVERSE_HYPER_BAR
725  {
726 #ifdef KMP_REVERSE_HYPER_BAR
727  /* Now go in reverse order through the children, highest to lowest.
728  Initial setting of child is conservative here. */
729  child = num_threads >> ((level == 0) ? level : level - 1);
730  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
731  child_tid = tid + (child << level);
732  child >= 1; child--, child_tid -= (1 << level))
733 #else
734  if (((tid >> level) & (branch_factor - 1)) != 0)
735  // No need to go lower than this, since this is the level parent would be
736  // notified
737  break;
738  // Iterate through children on this level of the tree
739  for (child = 1, child_tid = tid + (1 << level);
740  child < branch_factor && child_tid < num_threads;
741  child++, child_tid += (1 << level))
742 #endif // KMP_REVERSE_HYPER_BAR
743  {
744  if (child_tid >= num_threads)
745  continue; // Child doesn't exist so keep going
746  else {
747  kmp_info_t *child_thr = other_threads[child_tid];
748  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
749 #if KMP_CACHE_MANAGE
750  kmp_uint32 next_child_tid = child_tid - (1 << level);
751 // Prefetch next thread's go count
752 #ifdef KMP_REVERSE_HYPER_BAR
753  if (child - 1 >= 1 && next_child_tid < num_threads)
754 #else
755  if (child + 1 < branch_factor && next_child_tid < num_threads)
756 #endif // KMP_REVERSE_HYPER_BAR
757  KMP_CACHE_PREFETCH(
758  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
759 #endif /* KMP_CACHE_MANAGE */
760 
761 #if KMP_BARRIER_ICV_PUSH
762  if (propagate_icvs) // push my fixed ICVs to my child
763  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
764 #endif // KMP_BARRIER_ICV_PUSH
765 
766  KA_TRACE(
767  20,
768  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
769  "go(%p): %u => %u\n",
770  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
771  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
772  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
773  // Release child from barrier
774  ANNOTATE_BARRIER_BEGIN(child_thr);
775  kmp_flag_64 flag(&child_bar->b_go, child_thr);
776  flag.release();
777  }
778  }
779  }
780 #if KMP_BARRIER_ICV_PUSH
781  if (propagate_icvs &&
782  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
783  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
784  FALSE);
785  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
786  &thr_bar->th_fixed_icvs);
787  }
788 #endif
789  KA_TRACE(
790  20,
791  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
792  gtid, team->t.t_id, tid, bt));
793 }
794 
795 // Hierarchical Barrier
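// The hierarchical pattern arranges the team into a multi-level hierarchy
// (built by __kmp_get_hierarchy, typically following the machine topology):
// each thread records its level, its parent tid and its set of leaf kids.
// When blocktime is infinite and the barrier is not nested, leaf children
// check in and are released through individual bytes of the parent's 64-bit
// b_arrived/b_go words (the "on-core" barrier), so a whole group of siblings
// is handled with a single flag; otherwise the usual per-thread
// b_arrived/b_go handshake is used level by level.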
796 
797 // Initialize thread barrier data
798 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
799  Performs the minimum amount of initialization required based on how the team
800  has changed. Returns true if leaf children will require both on-core and
801  traditional wake-up mechanisms. For example, if the team size increases,
802  threads already in the team will respond to on-core wakeup on their parent
803  thread, but threads newly added to the team will only be listening on
804  their local b_go. */
805 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
806  kmp_bstate_t *thr_bar,
807  kmp_uint32 nproc, int gtid,
808  int tid, kmp_team_t *team) {
809  // Checks to determine if (re-)initialization is needed
810  bool uninitialized = thr_bar->team == NULL;
811  bool team_changed = team != thr_bar->team;
812  bool team_sz_changed = nproc != thr_bar->nproc;
813  bool tid_changed = tid != thr_bar->old_tid;
814  bool retval = false;
815 
816  if (uninitialized || team_sz_changed) {
817  __kmp_get_hierarchy(nproc, thr_bar);
818  }
819 
820  if (uninitialized || team_sz_changed || tid_changed) {
821  thr_bar->my_level = thr_bar->depth - 1; // default for master
822  thr_bar->parent_tid = -1; // default for master
823  if (!KMP_MASTER_TID(
824  tid)) { // if not master, find parent thread in hierarchy
825  kmp_uint32 d = 0;
826  while (d < thr_bar->depth) { // find parent based on level of thread in
827  // hierarchy, and note level
828  kmp_uint32 rem;
829  if (d == thr_bar->depth - 2) { // reached level right below the master
830  thr_bar->parent_tid = 0;
831  thr_bar->my_level = d;
832  break;
833  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
834  0) { // TODO: can we make this op faster?
835  // thread is not a subtree root at the next level, so d is its max level
836  thr_bar->parent_tid = tid - rem;
837  thr_bar->my_level = d;
838  break;
839  }
840  ++d;
841  }
842  }
843  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
844  thr_bar->old_tid = tid;
845  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
846  thr_bar->team = team;
847  thr_bar->parent_bar =
848  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
849  }
850  if (uninitialized || team_changed || tid_changed) {
851  thr_bar->team = team;
852  thr_bar->parent_bar =
853  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854  retval = true;
855  }
856  if (uninitialized || team_sz_changed || tid_changed) {
857  thr_bar->nproc = nproc;
858  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
859  if (thr_bar->my_level == 0)
860  thr_bar->leaf_kids = 0;
861  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
862  thr_bar->leaf_kids = nproc - tid - 1;
863  thr_bar->leaf_state = 0;
864  for (int i = 0; i < thr_bar->leaf_kids; ++i)
865  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
866  }
867  return retval;
868 }
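// A note on the leaf_state encoding set up above: each leaf child owns one
// byte of its parent's 64-bit flag word (offset = 7 minus its position among
// the siblings), and the parent's leaf_state has exactly those bytes set to
// 1. The parent can therefore wait for all of its leaves with one kmp_flag_64
// wait whose target value has the leaf bytes set, and clear them again with
// a single atomic AND (see KMP_TEST_THEN_AND64 in the gather below).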
869 
870 static void __kmp_hierarchical_barrier_gather(
871  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
873  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874  kmp_team_t *team = this_thr->th.th_team;
875  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876  kmp_uint32 nproc = this_thr->th.th_team_nproc;
877  kmp_info_t **other_threads = team->t.t_threads;
878  kmp_uint64 new_state;
879 
880  int level = team->t.t_level;
881  if (other_threads[0]
882  ->th.th_teams_microtask) // are we inside the teams construct?
883  if (this_thr->th.th_teams_size.nteams > 1)
884  ++level; // level was not increased in teams construct for team_of_masters
885  if (level == 1)
886  thr_bar->use_oncore_barrier = 1;
887  else
888  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
889 
890  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
891  "barrier type %d\n",
892  gtid, team->t.t_id, tid, bt));
893  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
894 
895 #if USE_ITT_BUILD && USE_ITT_NOTIFY
896  // Barrier imbalance - save arrive time to the thread
897  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
899  }
900 #endif
901 
902  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903  team);
904 
905  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
906  kmp_int32 child_tid;
907  new_state =
908  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
909  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910  thr_bar->use_oncore_barrier) {
911  if (thr_bar->leaf_kids) {
912  // First, wait for leaf children to check-in on my b_arrived flag
913  kmp_uint64 leaf_state =
914  KMP_MASTER_TID(tid)
915  ? thr_bar->b_arrived | thr_bar->leaf_state
916  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
918  "for leaf kids\n",
919  gtid, team->t.t_id, tid));
920  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
921  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
922  if (reduce) {
923  ANNOTATE_REDUCE_AFTER(reduce);
924  OMPT_REDUCTION_DECL(this_thr, gtid);
925  OMPT_REDUCTION_BEGIN;
926  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
927  ++child_tid) {
928  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
929  "T#%d(%d:%d)\n",
930  gtid, team->t.t_id, tid,
931  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
932  child_tid));
933  ANNOTATE_BARRIER_END(other_threads[child_tid]);
934  (*reduce)(this_thr->th.th_local.reduce_data,
935  other_threads[child_tid]->th.th_local.reduce_data);
936  }
937  OMPT_REDUCTION_END;
938  ANNOTATE_REDUCE_BEFORE(reduce);
939  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940  }
941  // clear leaf_state bits
942  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943  }
944  // Next, wait for higher level children on each child's b_arrived flag
945  for (kmp_uint32 d = 1; d < thr_bar->my_level;
946  ++d) { // gather lowest level threads first, but skip 0
947  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948  skip = thr_bar->skip_per_level[d];
949  if (last > nproc)
950  last = nproc;
951  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952  kmp_info_t *child_thr = other_threads[child_tid];
953  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955  "T#%d(%d:%d) "
956  "arrived(%p) == %llu\n",
957  gtid, team->t.t_id, tid,
958  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959  child_tid, &child_bar->b_arrived, new_state));
960  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
961  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962  ANNOTATE_BARRIER_END(child_thr);
963  if (reduce) {
964  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
965  "T#%d(%d:%d)\n",
966  gtid, team->t.t_id, tid,
967  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
968  child_tid));
969  ANNOTATE_REDUCE_AFTER(reduce);
970  (*reduce)(this_thr->th.th_local.reduce_data,
971  child_thr->th.th_local.reduce_data);
972  ANNOTATE_REDUCE_BEFORE(reduce);
973  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
974  }
975  }
976  }
977  } else { // Blocktime is not infinite
978  for (kmp_uint32 d = 0; d < thr_bar->my_level;
979  ++d) { // Gather lowest level threads first
980  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981  skip = thr_bar->skip_per_level[d];
982  if (last > nproc)
983  last = nproc;
984  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985  kmp_info_t *child_thr = other_threads[child_tid];
986  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
988  "T#%d(%d:%d) "
989  "arrived(%p) == %llu\n",
990  gtid, team->t.t_id, tid,
991  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992  child_tid, &child_bar->b_arrived, new_state));
993  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
994  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995  ANNOTATE_BARRIER_END(child_thr);
996  if (reduce) {
997  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
998  "T#%d(%d:%d)\n",
999  gtid, team->t.t_id, tid,
1000  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1001  child_tid));
1002  ANNOTATE_REDUCE_AFTER(reduce);
1003  (*reduce)(this_thr->th.th_local.reduce_data,
1004  child_thr->th.th_local.reduce_data);
1005  ANNOTATE_REDUCE_BEFORE(reduce);
1006  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1007  }
1008  }
1009  }
1010  }
1011  }
1012  // All subordinates are gathered; now release parent if not master thread
1013 
1014  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1015  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017  gtid, team->t.t_id, tid,
1018  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1021  /* Mark arrival to parent: After performing this write, a worker thread may
1022  not assume that the team is valid any more - it could be deallocated by
1023  the master thread at any time. */
1024  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1026  // flag; release it
1027  ANNOTATE_BARRIER_BEGIN(this_thr);
1028  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1029  flag.release();
1030  } else {
1031  // Leaf does special release on "offset" bits of parent's b_arrived flag
1032  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1033  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1034  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1035  flag.release();
1036  }
1037  } else { // Master thread needs to update the team's b_arrived value
1038  team->t.t_bar[bt].b_arrived = new_state;
1039  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1040  "arrived(%p) = %llu\n",
1041  gtid, team->t.t_id, tid, team->t.t_id,
1042  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1043  }
1044  // Is the team access below unsafe or just technically invalid?
1045  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1046  "barrier type %d\n",
1047  gtid, team->t.t_id, tid, bt));
1048 }
1049 
1050 static void __kmp_hierarchical_barrier_release(
1051  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1052  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1053  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1054  kmp_team_t *team;
1055  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1056  kmp_uint32 nproc;
1057  bool team_change = false; // indicates on-core barrier shouldn't be used
1058 
1059  if (KMP_MASTER_TID(tid)) {
1060  team = __kmp_threads[gtid]->th.th_team;
1061  KMP_DEBUG_ASSERT(team != NULL);
1062  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1063  "entered barrier type %d\n",
1064  gtid, team->t.t_id, tid, bt));
1065  } else { // Worker threads
1066  // Wait for parent thread to release me
1067  if (!thr_bar->use_oncore_barrier ||
1068  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1069  thr_bar->team == NULL) {
1070  // Use traditional method of waiting on my own b_go flag
1071  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1072  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1073  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1074  ANNOTATE_BARRIER_END(this_thr);
1075  TCW_8(thr_bar->b_go,
1076  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1077  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1078  // infinite, not nested
1079  // Wait on my "offset" bits on parent's b_go flag
1080  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1081  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1082  thr_bar->offset, bt,
1083  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1084  flag.wait(this_thr, TRUE);
1085  if (thr_bar->wait_flag ==
1086  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1087  TCW_8(thr_bar->b_go,
1088  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1089  } else { // Reset my bits on parent's b_go flag
1090  (RCAST(volatile char *,
1091  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1092  }
1093  }
1094  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1095  // Early exit for reaping threads releasing forkjoin barrier
1096  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1097  return;
1098  // The worker thread may now assume that the team is valid.
1099  team = __kmp_threads[gtid]->th.th_team;
1100  KMP_DEBUG_ASSERT(team != NULL);
1101  tid = __kmp_tid_from_gtid(gtid);
1102 
1103  KA_TRACE(
1104  20,
1105  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1106  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1107  KMP_MB(); // Flush all pending memory write invalidates.
1108  }
1109 
1110  nproc = this_thr->th.th_team_nproc;
1111  int level = team->t.t_level;
1112  if (team->t.t_threads[0]
1113  ->th.th_teams_microtask) { // are we inside the teams construct?
1114  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1115  this_thr->th.th_teams_level == level)
1116  ++level; // level was not increased in teams construct for team_of_workers
1117  if (this_thr->th.th_teams_size.nteams > 1)
1118  ++level; // level was not increased in teams construct for team_of_masters
1119  }
1120  if (level == 1)
1121  thr_bar->use_oncore_barrier = 1;
1122  else
1123  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1124 
1125  // If the team size has increased, we still communicate with old leaves via
1126  // oncore barrier.
1127  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1128  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1129  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1130  tid, team);
1131  // But if the entire team changes, we won't use oncore barrier at all
1132  if (team_change)
1133  old_leaf_kids = 0;
1134 
1135 #if KMP_BARRIER_ICV_PUSH
1136  if (propagate_icvs) {
1137  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1138  FALSE);
1139  if (KMP_MASTER_TID(
1140  tid)) { // master already has copy in final destination; copy
1141  copy_icvs(&thr_bar->th_fixed_icvs,
1142  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1143  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1144  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1145  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1146  // leaves (on-core children) pull parent's fixed ICVs directly to local
1147  // ICV store
1148  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149  &thr_bar->parent_bar->th_fixed_icvs);
1150  // non-leaves will get ICVs piggybacked with b_go via NGO store
1151  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1152  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1153  // access
1154  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1155  else // leaves copy parent's fixed ICVs directly to local ICV store
1156  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1157  &thr_bar->parent_bar->th_fixed_icvs);
1158  }
1159  }
1160 #endif // KMP_BARRIER_ICV_PUSH
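// Summary of the ICV push above: the master seeds its th_fixed_icvs from its
// implicit task. In the infinite-blocktime/on-core case, non-leaf children
// receive the ICVs piggybacked on the same cache line as b_go (see the
// ngo_store_go calls below) while leaves pull them straight from the parent's
// th_fixed_icvs; with finite blocktime, each non-leaf copies the parent's
// fixed ICVs so its own children can read them (and copies them into its
// implicit task at the end of the release), and leaves again copy directly.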
1161 
1162  // Now, release my children
1163  if (thr_bar->my_level) { // not a leaf
1164  kmp_int32 child_tid;
1165  kmp_uint32 last;
1166  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1167  thr_bar->use_oncore_barrier) {
1168  if (KMP_MASTER_TID(tid)) { // do a flat release
1169  // Set local b_go to bump children via NGO store of the cache line
1170  // containing ICVs and b_go.
1171  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1172  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1173  // the cache line
1174  ngo_load(&thr_bar->th_fixed_icvs);
1175  // This loops over all the threads skipping only the leaf nodes in the
1176  // hierarchy
1177  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1178  child_tid += thr_bar->skip_per_level[1]) {
1179  kmp_bstate_t *child_bar =
1180  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1181  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1182  "releasing T#%d(%d:%d)"
1183  " go(%p): %u => %u\n",
1184  gtid, team->t.t_id, tid,
1185  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1186  child_tid, &child_bar->b_go, child_bar->b_go,
1187  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1188  // Use ngo store (if available) to both store ICVs and release child
1189  // via child's b_go
1190  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1191  }
1192  ngo_sync();
1193  }
1194  TCW_8(thr_bar->b_go,
1195  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1196  // Now, release leaf children
1197  if (thr_bar->leaf_kids) { // if there are any
1198  // We test team_change on the off-chance that the level 1 team changed.
1199  if (team_change ||
1200  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1201  if (old_leaf_kids) { // release old leaf kids
1202  thr_bar->b_go |= old_leaf_state;
1203  }
1204  // Release new leaf kids
1205  last = tid + thr_bar->skip_per_level[1];
1206  if (last > nproc)
1207  last = nproc;
1208  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1209  ++child_tid) { // skip_per_level[0]=1
1210  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1211  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1212  KA_TRACE(
1213  20,
1214  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1215  " T#%d(%d:%d) go(%p): %u => %u\n",
1216  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1217  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1218  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1219  // Release child using child's b_go flag
1220  ANNOTATE_BARRIER_BEGIN(child_thr);
1221  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1222  flag.release();
1223  }
1224  } else { // Release all children at once with leaf_state bits on my own
1225  // b_go flag
1226  thr_bar->b_go |= thr_bar->leaf_state;
1227  }
1228  }
1229  } else { // Blocktime is not infinite; do a simple hierarchical release
1230  for (int d = thr_bar->my_level - 1; d >= 0;
1231  --d) { // Release highest level threads first
1232  last = tid + thr_bar->skip_per_level[d + 1];
1233  kmp_uint32 skip = thr_bar->skip_per_level[d];
1234  if (last > nproc)
1235  last = nproc;
1236  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1237  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1238  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1239  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1240  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1241  gtid, team->t.t_id, tid,
1242  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1243  child_tid, &child_bar->b_go, child_bar->b_go,
1244  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1245  // Release child using child's b_go flag
1246  ANNOTATE_BARRIER_BEGIN(child_thr);
1247  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1248  flag.release();
1249  }
1250  }
1251  }
1252 #if KMP_BARRIER_ICV_PUSH
1253  if (propagate_icvs && !KMP_MASTER_TID(tid))
1254  // non-leaves copy ICVs from fixed ICVs to local dest
1255  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1256  &thr_bar->th_fixed_icvs);
1257 #endif // KMP_BARRIER_ICV_PUSH
1258  }
1259  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1260  "barrier type %d\n",
1261  gtid, team->t.t_id, tid, bt));
1262 }
1263 
1264 // End of Barrier Algorithms
1265 
1266 // type traits for cancellable value
1267 // if cancellable is true, then is_cancellable is a normal boolean variable
1268 // if cancellable is false, then is_cancellable is a compile time constant
1269 template <bool cancellable> struct is_cancellable {};
1270 template <> struct is_cancellable<true> {
1271  bool value;
1272  is_cancellable() : value(false) {}
1273  is_cancellable(bool b) : value(b) {}
1274  is_cancellable &operator=(bool b) {
1275  value = b;
1276  return *this;
1277  }
1278  operator bool() const { return value; }
1279 };
1280 template <> struct is_cancellable<false> {
1281  is_cancellable &operator=(bool b) { return *this; }
1282  constexpr operator bool() const { return false; }
1283 };
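// Illustrative sketch of why the specializations matter: inside
// __kmp_barrier_template<false>, a test such as
//   if (cancelled) { ... }
// goes through the constexpr conversion of is_cancellable<false> and is a
// compile-time false, so the cancellation branches cost nothing in the
// ordinary instantiation; only is_cancellable<true> carries a real flag.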
1284 
1285 // Internal function to do a barrier.
1286 /* If is_split is true, do a split barrier, otherwise, do a plain barrier.
1287  If reduce is non-NULL, do a split reduction barrier, otherwise, do a
1288  regular barrier.
1289  When cancellable = false,
1290  Returns 0 if master thread, 1 if worker thread.
1291  When cancellable = true
1292  Returns 0 if not cancelled, 1 if cancelled. */
1293 template <bool cancellable = false>
1294 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1295  size_t reduce_size, void *reduce_data,
1296  void (*reduce)(void *, void *)) {
1297  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1298  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1299  int tid = __kmp_tid_from_gtid(gtid);
1300  kmp_info_t *this_thr = __kmp_threads[gtid];
1301  kmp_team_t *team = this_thr->th.th_team;
1302  int status = 0;
1303  is_cancellable<cancellable> cancelled;
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
1305  ompt_data_t *my_task_data;
1306  ompt_data_t *my_parallel_data;
1307  void *return_address;
1308  ompt_sync_region_t barrier_kind;
1309 #endif
1310 
1311  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1312  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1313 
1314  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1315 #if OMPT_SUPPORT
1316  if (ompt_enabled.enabled) {
1317 #if OMPT_OPTIONAL
1318  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1319  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1320  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1321  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1322  if (ompt_enabled.ompt_callback_sync_region) {
1323  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1324  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1325  return_address);
1326  }
1327  if (ompt_enabled.ompt_callback_sync_region_wait) {
1328  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1329  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1330  return_address);
1331  }
1332 #endif
1333  // It is OK to report the barrier state after the barrier begin callback.
1334  // According to the OMPT specification, a compliant implementation may
1335  // even delay reporting this state until the barrier begins to wait.
1336  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1337  }
1338 #endif
1339 
1340  if (!team->t.t_serialized) {
1341 #if USE_ITT_BUILD
1342  // This value will be used in itt notify events below.
1343  void *itt_sync_obj = NULL;
1344 #if USE_ITT_NOTIFY
1345  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1346  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1347 #endif
1348 #endif /* USE_ITT_BUILD */
1349  if (__kmp_tasking_mode == tskm_extra_barrier) {
1350  __kmp_tasking_barrier(team, this_thr, gtid);
1351  KA_TRACE(15,
1352  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1353  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1354  }
1355 
1356  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1357  access it when the team struct is not guaranteed to exist. */
1358  // See note about the corresponding code in __kmp_join_barrier() being
1359  // performance-critical.
1360  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1361 #if KMP_USE_MONITOR
1362  this_thr->th.th_team_bt_intervals =
1363  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1364  this_thr->th.th_team_bt_set =
1365  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1366 #else
1367  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1368 #endif
1369  }
1370 
1371 #if USE_ITT_BUILD
1372  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1373  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1374 #endif /* USE_ITT_BUILD */
1375 #if USE_DEBUGGER
1376  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1377  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1378  team->t.t_bar[bt].b_master_arrived += 1;
1379  } else {
1380  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1381  } // if
1382 #endif /* USE_DEBUGGER */
1383  if (reduce != NULL) {
1384  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1385  this_thr->th.th_local.reduce_data = reduce_data;
1386  }
1387 
1388  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1389  // use 0 to only setup the current team if nthreads > 1
1390  __kmp_task_team_setup(this_thr, team, 0);
1391 
1392  if (cancellable) {
1393  cancelled = __kmp_linear_barrier_gather_cancellable(
1394  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1395  } else {
1396  switch (__kmp_barrier_gather_pattern[bt]) {
1397  case bp_hyper_bar: {
1398  // don't set branch bits to 0; use linear
1399  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1401  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1402  break;
1403  }
1404  case bp_hierarchical_bar: {
1405  __kmp_hierarchical_barrier_gather(
1406  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407  break;
1408  }
1409  case bp_tree_bar: {
1410  // don't set branch bits to 0; use linear
1411  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1412  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1413  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1414  break;
1415  }
1416  default: {
1417  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1418  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1419  }
1420  }
1421  }
1422 
1423  KMP_MB();
1424 
1425  if (KMP_MASTER_TID(tid)) {
1426  status = 0;
1427  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1428  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1429  }
1430 #if USE_DEBUGGER
1431  // Let the debugger know: all threads have arrived and are starting to leave
1432  // the barrier.
1433  team->t.t_bar[bt].b_team_arrived += 1;
1434 #endif
1435 
1436  if (__kmp_omp_cancellation) {
1437  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1438  // Reset cancellation flag for worksharing constructs
1439  if (cancel_request == cancel_loop ||
1440  cancel_request == cancel_sections) {
1441  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1442  }
1443  }
1444 #if USE_ITT_BUILD
1445  /* TODO: In case of split reduction barrier, master thread may send
1446  acquired event early, before the final summation into the shared
1447  variable is done (final summation can be a long operation for array
1448  reductions). */
1449  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1450  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1451 #endif /* USE_ITT_BUILD */
1452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1453  // Barrier - report frame end (only if active_level == 1)
1454  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1455  __kmp_forkjoin_frames_mode &&
1456  this_thr->th.th_teams_microtask == NULL &&
1457  team->t.t_active_level == 1) {
1458  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1459  kmp_uint64 cur_time = __itt_get_timestamp();
1460  kmp_info_t **other_threads = team->t.t_threads;
1461  int nproc = this_thr->th.th_team_nproc;
1462  int i;
1463  switch (__kmp_forkjoin_frames_mode) {
1464  case 1:
1465  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1466  loc, nproc);
1467  this_thr->th.th_frame_time = cur_time;
1468  break;
1469  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1470  // be fixed)
1471  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1472  1, loc, nproc);
1473  break;
1474  case 3:
1475  if (__itt_metadata_add_ptr) {
1476  // Initialize with master's wait time
1477  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1478  // Set arrive time to zero to be able to check it in
1479  // __kmp_invoke_task(); the same is done inside the loop below
1480  this_thr->th.th_bar_arrive_time = 0;
1481  for (i = 1; i < nproc; ++i) {
1482  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1483  other_threads[i]->th.th_bar_arrive_time = 0;
1484  }
1485  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1486  cur_time, delta,
1487  (kmp_uint64)(reduce != NULL));
1488  }
1489  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1490  loc, nproc);
1491  this_thr->th.th_frame_time = cur_time;
1492  break;
1493  }
1494  }
1495 #endif /* USE_ITT_BUILD */
1496  } else {
1497  status = 1;
1498 #if USE_ITT_BUILD
1499  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1500  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1501 #endif /* USE_ITT_BUILD */
1502  }
1503  if ((status == 1 || !is_split) && !cancelled) {
1504  if (cancellable) {
1505  cancelled = __kmp_linear_barrier_release_cancellable(
1506  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1507  } else {
1508  switch (__kmp_barrier_release_pattern[bt]) {
1509  case bp_hyper_bar: {
1510  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1511  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1512  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1513  break;
1514  }
1515  case bp_hierarchical_bar: {
1516  __kmp_hierarchical_barrier_release(
1517  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1518  break;
1519  }
1520  case bp_tree_bar: {
1521  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1522  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1523  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1524  break;
1525  }
1526  default: {
1527  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1528  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1529  }
1530  }
1531  }
1532  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1533  __kmp_task_team_sync(this_thr, team);
1534  }
1535  }
1536 
1537 #if USE_ITT_BUILD
1538  /* GEH: TODO: Move this under if-condition above and also include in
1539  __kmp_end_split_barrier(). This will more accurately represent the actual
1540  release time of the threads for split barriers. */
1541  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1542  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1543 #endif /* USE_ITT_BUILD */
1544  } else { // Team is serialized.
1545  status = 0;
1546  if (__kmp_tasking_mode != tskm_immediate_exec) {
1547  if (this_thr->th.th_task_team != NULL) {
1548 #if USE_ITT_NOTIFY
1549  void *itt_sync_obj = NULL;
1550  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1551  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1552  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1553  }
1554 #endif
1555 
1556  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1557  TRUE);
1558  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1559  __kmp_task_team_setup(this_thr, team, 0);
1560 
1561 #if USE_ITT_BUILD
1562  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1563  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1564 #endif /* USE_ITT_BUILD */
1565  }
1566  }
1567  }
1568  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1569  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1570  __kmp_tid_from_gtid(gtid), status));
1571 
1572 #if OMPT_SUPPORT
1573  if (ompt_enabled.enabled) {
1574 #if OMPT_OPTIONAL
1575  if (ompt_enabled.ompt_callback_sync_region_wait) {
1576  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1577  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1578  return_address);
1579  }
1580  if (ompt_enabled.ompt_callback_sync_region) {
1581  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1582  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1583  return_address);
1584  }
1585 #endif
1586  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1587  }
1588 #endif
1589  ANNOTATE_BARRIER_END(&team->t.t_bar);
1590 
1591  if (cancellable)
1592  return (int)cancelled;
1593  return status;
1594 }
1595 
1596 // Returns 0 if master thread, 1 if worker thread.
1597 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1598  size_t reduce_size, void *reduce_data,
1599  void (*reduce)(void *, void *)) {
1600  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1601  reduce);
1602 }
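
// How callers typically drive this entry point, in a minimal hedged sketch
// (the call sites and the use of bs_reduction_barrier below are assumptions
// for illustration, not verified against the rest of the runtime): a plain
// barrier passes no reduction data, while a split reduction barrier passes
// is_split = TRUE plus the reduction buffer and combiner, and is finished
// later by __kmp_end_split_barrier().
//
//   // Plain, non-split barrier on behalf of "#pragma omp barrier":
//   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
//
//   // Split reduction barrier: the gather phase combines reduce_data with
//   // reduce_func; the release phase is deferred for the master thread.
//   int is_master = (__kmp_barrier(bs_reduction_barrier, gtid, TRUE,
//                                  reduce_size, reduce_data, reduce_func) == 0);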
1603 
1604 #if defined(KMP_GOMP_COMPAT)
1605 // Returns 1 if cancelled, 0 otherwise
1606 int __kmp_barrier_gomp_cancel(int gtid) {
1607  if (__kmp_omp_cancellation) {
1608  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1609  0, NULL, NULL);
1610  if (cancelled) {
1611  int tid = __kmp_tid_from_gtid(gtid);
1612  kmp_info_t *this_thr = __kmp_threads[gtid];
1613  if (KMP_MASTER_TID(tid)) {
1614  // Master does not need to revert anything
1615  } else {
1616  // Workers need to revert their private b_arrived flag
1617  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1618  KMP_BARRIER_STATE_BUMP;
1619  }
1620  }
1621  return cancelled;
1622  }
1623  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1624  return FALSE;
1625 }
1626 #endif
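
// A hedged usage sketch for the GOMP-compatibility path (the wrapper below is
// illustrative only; the real shim lives elsewhere in the runtime): a
// GOMP-style cancellation barrier can simply forward this routine's result,
// since __kmp_barrier_gomp_cancel() already reverts the worker's b_arrived
// counter when the barrier is cancelled.
//
//   static bool example_gomp_barrier_cancel(void) { // hypothetical wrapper
//     int gtid = __kmp_entry_gtid();
//     return __kmp_barrier_gomp_cancel(gtid) != 0;  // true if cancelled
//   }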
1627 
1628 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1629  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1630  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1631  int tid = __kmp_tid_from_gtid(gtid);
1632  kmp_info_t *this_thr = __kmp_threads[gtid];
1633  kmp_team_t *team = this_thr->th.th_team;
1634 
1635  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1636  if (!team->t.t_serialized) {
1637  if (KMP_MASTER_GTID(gtid)) {
1638  switch (__kmp_barrier_release_pattern[bt]) {
1639  case bp_hyper_bar: {
1640  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1641  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1642  FALSE USE_ITT_BUILD_ARG(NULL));
1643  break;
1644  }
1645  case bp_hierarchical_bar: {
1646  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1647  FALSE USE_ITT_BUILD_ARG(NULL));
1648  break;
1649  }
1650  case bp_tree_bar: {
1651  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1652  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1653  FALSE USE_ITT_BUILD_ARG(NULL));
1654  break;
1655  }
1656  default: {
1657  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1658  FALSE USE_ITT_BUILD_ARG(NULL));
1659  }
1660  }
1661  if (__kmp_tasking_mode != tskm_immediate_exec) {
1662  __kmp_task_team_sync(this_thr, team);
1663  } // if
1664  }
1665  }
1666  ANNOTATE_BARRIER_END(&team->t.t_bar);
1667 }
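
// The split-barrier protocol in brief: the gather half runs inside
// __kmp_barrier() with is_split = TRUE; the master returns 0 without waiting
// in the release half, finishes any serialized work (e.g. the final reduction
// combine), and then calls __kmp_end_split_barrier() to let the workers go.
// A hedged sketch (finalize_reduction is a hypothetical helper):
//
//   if (__kmp_barrier(bs_reduction_barrier, gtid, TRUE, reduce_size,
//                     reduce_data, reduce_func) == 0) {
//     // Master only: final combination into the shared reduction variable.
//     finalize_reduction(reduce_data);
//     __kmp_end_split_barrier(bs_reduction_barrier, gtid);
//   }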
1668 
1669 void __kmp_join_barrier(int gtid) {
1670  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1671  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1672  kmp_info_t *this_thr = __kmp_threads[gtid];
1673  kmp_team_t *team;
1674  kmp_uint nproc;
1675  kmp_info_t *master_thread;
1676  int tid;
1677 #ifdef KMP_DEBUG
1678  int team_id;
1679 #endif /* KMP_DEBUG */
1680 #if USE_ITT_BUILD
1681  void *itt_sync_obj = NULL;
1682 #if USE_ITT_NOTIFY
1683  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine unless needed
1684  // Get object created at fork_barrier
1685  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1686 #endif
1687 #endif /* USE_ITT_BUILD */
1688  KMP_MB();
1689 
1690  // Get current info
1691  team = this_thr->th.th_team;
1692  nproc = this_thr->th.th_team_nproc;
1693  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1694  tid = __kmp_tid_from_gtid(gtid);
1695 #ifdef KMP_DEBUG
1696  team_id = team->t.t_id;
1697 #endif /* KMP_DEBUG */
1698  master_thread = this_thr->th.th_team_master;
1699 #ifdef KMP_DEBUG
1700  if (master_thread != team->t.t_threads[0]) {
1701  __kmp_print_structure();
1702  }
1703 #endif /* KMP_DEBUG */
1704  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1705  KMP_MB();
1706 
1707  // Verify state
1708  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1709  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1710  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1711  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1712  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1713  gtid, team_id, tid));
1714 
1715  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1716 #if OMPT_SUPPORT
1717  if (ompt_enabled.enabled) {
1718 #if OMPT_OPTIONAL
1719  ompt_data_t *my_task_data;
1720  ompt_data_t *my_parallel_data;
1721  void *codeptr = NULL;
1722  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1723  if (KMP_MASTER_TID(ds_tid) &&
1724  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1725  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1726  codeptr = team->t.ompt_team_info.master_return_address;
1727  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1728  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1729  if (ompt_enabled.ompt_callback_sync_region) {
1730  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1731  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1732  my_task_data, codeptr);
1733  }
1734  if (ompt_enabled.ompt_callback_sync_region_wait) {
1735  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1736  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1737  my_task_data, codeptr);
1738  }
1739  if (!KMP_MASTER_TID(ds_tid))
1740  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1741 #endif
1742  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1743  }
1744 #endif
1745 
1746  if (__kmp_tasking_mode == tskm_extra_barrier) {
1747  __kmp_tasking_barrier(team, this_thr, gtid);
1748  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1749  team_id, tid));
1750  }
1751 #ifdef KMP_DEBUG
1752  if (__kmp_tasking_mode != tskm_immediate_exec) {
1753  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1754  "%p, th_task_team = %p\n",
1755  __kmp_gtid_from_thread(this_thr), team_id,
1756  team->t.t_task_team[this_thr->th.th_task_state],
1757  this_thr->th.th_task_team));
1758  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1759  team->t.t_task_team[this_thr->th.th_task_state]);
1760  }
1761 #endif /* KMP_DEBUG */
1762 
1763  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1764  access it when the team struct is not guaranteed to exist. Doing these
1765  loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
1766  we do not perform the copy if blocktime=infinite, since the values are not
1767  used by __kmp_wait_template() in that case. */
1768  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1769 #if KMP_USE_MONITOR
1770  this_thr->th.th_team_bt_intervals =
1771  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1772  this_thr->th.th_team_bt_set =
1773  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1774 #else
1775  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1776 #endif
1777  }
1778 
1779 #if USE_ITT_BUILD
1780  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1781  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1782 #endif /* USE_ITT_BUILD */
1783 
1784  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1785  case bp_hyper_bar: {
1786  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1787  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1788  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1789  break;
1790  }
1791  case bp_hierarchical_bar: {
1792  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1793  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1794  break;
1795  }
1796  case bp_tree_bar: {
1797  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1798  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1799  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1800  break;
1801  }
1802  default: {
1803  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1804  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1805  }
1806  }
1807 
1808  /* From this point on, the team data structure may be deallocated at any time
1809  by the master thread - it is unsafe to reference it in any of the worker
1810  threads. Any per-team data items that need to be referenced before the
1811  end of the barrier should be moved to the kmp_task_team_t structs. */
1812  if (KMP_MASTER_TID(tid)) {
1813  if (__kmp_tasking_mode != tskm_immediate_exec) {
1814  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1815  }
1816  if (__kmp_display_affinity) {
1817  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1818  }
1819 #if KMP_STATS_ENABLED
1820  // Have the master thread flag the workers to indicate they are now waiting
1821  // for the next parallel region. Also wake them up so they switch their
1822  // timers to idle.
1823  for (int i = 0; i < team->t.t_nproc; ++i) {
1824  kmp_info_t *team_thread = team->t.t_threads[i];
1825  if (team_thread == this_thr)
1826  continue;
1827  team_thread->th.th_stats->setIdleFlag();
1828  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1829  team_thread->th.th_sleep_loc != NULL)
1830  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1831  team_thread->th.th_sleep_loc);
1832  }
1833 #endif
1834 #if USE_ITT_BUILD
1835  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1836  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1837 #endif /* USE_ITT_BUILD */
1838 
1839 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1840  // Join barrier - report frame end
1841  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1842  __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1843  team->t.t_active_level == 1) {
1844  kmp_uint64 cur_time = __itt_get_timestamp();
1845  ident_t *loc = team->t.t_ident;
1846  kmp_info_t **other_threads = team->t.t_threads;
1847  int nproc = this_thr->th.th_team_nproc;
1848  int i;
1849  switch (__kmp_forkjoin_frames_mode) {
1850  case 1:
1851  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1852  loc, nproc);
1853  break;
1854  case 2:
1855  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1856  loc, nproc);
1857  break;
1858  case 3:
1859  if (__itt_metadata_add_ptr) {
1860  // Initialize with master's wait time
1861  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1862  // Set arrive time to zero to be able to check it in
1863  // __kmp_invoke_task(); the same is done inside the loop below
1864  this_thr->th.th_bar_arrive_time = 0;
1865  for (i = 1; i < nproc; ++i) {
1866  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1867  other_threads[i]->th.th_bar_arrive_time = 0;
1868  }
1869  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1870  cur_time, delta, 0);
1871  }
1872  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1873  loc, nproc);
1874  this_thr->th.th_frame_time = cur_time;
1875  break;
1876  }
1877  }
1878 #endif /* USE_ITT_BUILD */
1879  }
1880 #if USE_ITT_BUILD
1881  else {
1882  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1883  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1884  }
1885 #endif /* USE_ITT_BUILD */
1886 
1887 #if KMP_DEBUG
1888  if (KMP_MASTER_TID(tid)) {
1889  KA_TRACE(
1890  15,
1891  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1892  gtid, team_id, tid, nproc));
1893  }
1894 #endif /* KMP_DEBUG */
1895 
1896  // TODO now, mark worker threads as done so they may be disbanded
1897  KMP_MB(); // Flush all pending memory write invalidates.
1898  KA_TRACE(10,
1899  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1900 
1901  ANNOTATE_BARRIER_END(&team->t.t_bar);
1902 }
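
// Taken together, the join and fork barriers bracket the time a worker spends
// idle between parallel regions. A deliberately simplified, hedged sketch of
// a worker's steady-state loop (the real loop is __kmp_launch_thread(), which
// also handles reaping, monitor interaction, and task teams):
//
//   for (;;) {
//     __kmp_fork_barrier(gtid, tid);     // wait for the master to release work
//     if (TCR_4(__kmp_global.g.g_done))
//       break;                           // runtime is shutting down
//     /* invoke the microtask for the new parallel region */
//     __kmp_join_barrier(gtid);          // signal completion to the master
//   }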
1903 
1904 // TODO release worker threads' fork barriers as we are ready instead of all at
1905 // once
1906 void __kmp_fork_barrier(int gtid, int tid) {
1907  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1908  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1909  kmp_info_t *this_thr = __kmp_threads[gtid];
1910  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1911 #if USE_ITT_BUILD
1912  void *itt_sync_obj = NULL;
1913 #endif /* USE_ITT_BUILD */
1914  if (team)
1915  ANNOTATE_BARRIER_END(&team->t.t_bar);
1916 
1917  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1918  (team != NULL) ? team->t.t_id : -1, tid));
1919 
1920  // th_team pointer only valid for master thread here
1921  if (KMP_MASTER_TID(tid)) {
1922 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1923  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1924  // Create itt barrier object
1925  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1926  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1927  }
1928 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1929 
1930 #ifdef KMP_DEBUG
1931  kmp_info_t **other_threads = team->t.t_threads;
1932  int i;
1933 
1934  // Verify state
1935  KMP_MB();
1936 
1937  for (i = 1; i < team->t.t_nproc; ++i) {
1938  KA_TRACE(500,
1939  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1940  "== %u.\n",
1941  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1942  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1943  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1944  KMP_DEBUG_ASSERT(
1945  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1946  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1947  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1948  }
1949 #endif
1950 
1951  if (__kmp_tasking_mode != tskm_immediate_exec) {
1952  // 0 indicates setup current task team if nthreads > 1
1953  __kmp_task_team_setup(this_thr, team, 0);
1954  }
1955 
1956  /* The master thread may have changed its blocktime between the join barrier
1957  and the fork barrier. Copy the blocktime info to the thread, where
1958  __kmp_wait_template() can access it when the team struct is not
1959  guaranteed to exist. */
1960  // See note about the corresponding code in __kmp_join_barrier() being
1961  // performance-critical
1962  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1963 #if KMP_USE_MONITOR
1964  this_thr->th.th_team_bt_intervals =
1965  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1966  this_thr->th.th_team_bt_set =
1967  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1968 #else
1969  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1970 #endif
1971  }
1972  } // master
1973 
1974  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1975  case bp_hyper_bar: {
1976  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1977  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1978  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1979  break;
1980  }
1981  case bp_hierarchical_bar: {
1982  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1983  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1984  break;
1985  }
1986  case bp_tree_bar: {
1987  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1988  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1989  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1990  break;
1991  }
1992  default: {
1993  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1994  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1995  }
1996  }
1997 
1998 #if OMPT_SUPPORT
1999  if (ompt_enabled.enabled &&
2000  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2001  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2002  ompt_data_t *task_data = (team)
2003  ? OMPT_CUR_TASK_DATA(this_thr)
2004  : &(this_thr->th.ompt_thread_info.task_data);
2005  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2006 #if OMPT_OPTIONAL
2007  void *codeptr = NULL;
2008  if (KMP_MASTER_TID(ds_tid) &&
2009  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2010  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2011  codeptr = team->t.ompt_team_info.master_return_address;
2012  if (ompt_enabled.ompt_callback_sync_region_wait) {
2013  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2014  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2015  codeptr);
2016  }
2017  if (ompt_enabled.ompt_callback_sync_region) {
2018  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2019  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2020  codeptr);
2021  }
2022 #endif
2023  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2024  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2025  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2026  }
2027  }
2028 #endif
2029 
2030  // Early exit for reaping threads releasing forkjoin barrier
2031  if (TCR_4(__kmp_global.g.g_done)) {
2032  this_thr->th.th_task_team = NULL;
2033 
2034 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2035  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2036  if (!KMP_MASTER_TID(tid)) {
2037  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2038  if (itt_sync_obj)
2039  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2040  }
2041  }
2042 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2043  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2044  return;
2045  }
2046 
2047  /* We can now assume that a valid team structure has been allocated by the
2048  master and propagated to all worker threads. The current thread, however,
2049  may not be part of the team, so we can't blindly assume that the team
2050  pointer is non-null. */
2051  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2052  KMP_DEBUG_ASSERT(team != NULL);
2053  tid = __kmp_tid_from_gtid(gtid);
2054 
2055 #if KMP_BARRIER_ICV_PULL
2056  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2057  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2058  implicit task has this data before this function is called. We cannot
2059  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2060  struct, because it is not always the case that the threads arrays have
2061  been allocated when __kmp_fork_call() is executed. */
2062  {
2063  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2064  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2065  // Copy the initial ICVs from the master's thread struct to the implicit
2066  // task for this tid.
2067  KA_TRACE(10,
2068  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2069  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2070  tid, FALSE);
2071  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2072  &team->t.t_threads[0]
2073  ->th.th_bar[bs_forkjoin_barrier]
2074  .bb.th_fixed_icvs);
2075  }
2076  }
2077 #endif // KMP_BARRIER_ICV_PULL
2078 
2079  if (__kmp_tasking_mode != tskm_immediate_exec) {
2080  __kmp_task_team_sync(this_thr, team);
2081  }
2082 
2083 #if KMP_AFFINITY_SUPPORTED
2084  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2085  if (proc_bind == proc_bind_intel) {
2086  // Call dynamic affinity settings
2087  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2088  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2089  }
2090  } else if (proc_bind != proc_bind_false) {
2091  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2092  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2093  __kmp_gtid_from_thread(this_thr),
2094  this_thr->th.th_current_place));
2095  } else {
2096  __kmp_affinity_set_place(gtid);
2097  }
2098  }
2099 #endif // KMP_AFFINITY_SUPPORTED
2100  // Perform the display affinity functionality
2101  if (__kmp_display_affinity) {
2102  if (team->t.t_display_affinity
2103 #if KMP_AFFINITY_SUPPORTED
2104  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2105 #endif
2106  ) {
2107  // NULL means use the affinity-format-var ICV
2108  __kmp_aux_display_affinity(gtid, NULL);
2109  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2110  this_thr->th.th_prev_level = team->t.t_level;
2111  }
2112  }
2113  if (!KMP_MASTER_TID(tid))
2114  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2115 
2116 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2117  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2118  if (!KMP_MASTER_TID(tid)) {
2119  // Get correct barrier object
2120  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2121  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2122  } // (prepare called inside barrier_release)
2123  }
2124 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2125  ANNOTATE_BARRIER_END(&team->t.t_bar);
2126  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2127  team->t.t_id, tid));
2128 }
2129 
2130 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2131  kmp_internal_control_t *new_icvs, ident_t *loc) {
2132  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2133 
2134  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2135  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2136 
2137 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2138  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2139  implicit task has this data before this function is called. */
2140 #if KMP_BARRIER_ICV_PULL
2141 /* Copy the ICVs into th_fixed_icvs in the master's thread structure (which
2142  remains untouched), where all of the worker threads can access them and
2143  make their own copies after the barrier. */
2144  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2145  // allocated at this point
2146  copy_icvs(
2147  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2148  new_icvs);
2149  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2150  team->t.t_threads[0], team));
2151 #elif KMP_BARRIER_ICV_PUSH
2152  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2153  // done here.
2154  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2155  team->t.t_threads[0], team));
2156 #else
2157  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2158  // time.
2159  ngo_load(new_icvs);
2160  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2161  // allocated at this point
2162  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2163  // TODO: GEH - pass in better source location info since usually NULL here
2164  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2165  f, team->t.t_threads[f], team));
2166  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2167  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2168  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2169  f, team->t.t_threads[f], team));
2170  }
2171  ngo_sync();
2172 #endif // KMP_BARRIER_ICV_PULL
2173 }
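
// A hedged recap of the three ICV propagation strategies chosen above (the
// selection is made at compile time by the KMP_BARRIER_ICV_* macros):
//
//   KMP_BARRIER_ICV_PULL : master publishes ICVs once into th_fixed_icvs;
//                          workers copy them in __kmp_fork_barrier(), so this
//                          routine does O(1) work.
//   KMP_BARRIER_ICV_PUSH : ICVs travel with the fork-barrier release pattern;
//                          nothing to do here.
//   (neither)            : plain linear copy into each implicit task here,
//                          O(nthreads) work on the master.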