LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it does not use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
131  to a parallel region, this returns KMP_GTID_DNE to force the caller to run
132  serial_initialize. Every call-site must either handle KMP_GTID_DNE or else
133  guarantee __kmp_init_gtid for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  // __kmp_gtid_get_specific can return a negative value because this
182  // function can be called from a thread destructor. However, before the
183  // thread destructor is called, the value of the corresponding
184  // thread-specific data will be reset to NULL.
185  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
186  __kmp_gtid_get_specific() == i);
187  return i;
188  }
189  }
190  }
191 
192  /* fall back to the thread-specific value to try to determine our gtid */
193  KA_TRACE(1000,
194  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
195  "thread, using TLS\n"));
196  i = __kmp_gtid_get_specific();
197 
198  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
199 
200  /* if we haven't been assigned a gtid, return the error code */
201  if (i < 0)
202  return i;
203 
204  // other_threads[i] can be nullptr at this point because the corresponding
205  // thread could already have been destroyed. This can happen when this function
206  // is called from the library shutdown routine.
207  if (!TCR_SYNC_PTR(other_threads[i]))
208  return i;
209 
210  /* dynamically updated stack window for uber threads to avoid get_specific
211  call */
212  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
213  KMP_FATAL(StackOverflow, i);
214  }
215 
216  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
217  if (stack_addr > stack_base) {
218  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
219  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
220  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
221  stack_base);
222  } else {
223  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
224  stack_base - stack_addr);
225  }
226 
227  /* Reprint stack bounds for ubermaster since they have been refined */
228  if (__kmp_storage_map) {
229  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
230  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
231  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
232  other_threads[i]->th.th_info.ds.ds_stacksize,
233  "th_%d stack (refinement)", i);
234  }
235  return i;
236 }
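/* A minimal, self-contained sketch of the stack-window lookup performed above,
   assuming a hypothetical registry of per-thread stack descriptors
   (example_thread_desc and example_find_self are illustrative names, not
   runtime types): each thread records [stack_base - stack_size, stack_base),
   and the caller is the thread whose window contains the address of one of
   its own locals. */
#if 0 /* illustrative sketch only; not compiled into the runtime */
#include <stddef.h>

typedef struct {
  char *stack_base;  /* highest address of the thread's stack */
  size_t stack_size; /* bytes of stack below stack_base */
} example_thread_desc;

static int example_find_self(const example_thread_desc *threads, int count) {
  char local; /* lives on the calling thread's stack */
  char *addr = &local;
  for (int i = 0; i < count; ++i) {
    char *base = threads[i].stack_base;
    /* The stack grows down: a live frame sits at or below base, and no
       farther below it than the recorded stack size. */
    if (addr <= base && (size_t)(base - addr) <= threads[i].stack_size)
      return i;
  }
  return -1; /* analogous to KMP_GTID_DNE */
}
#endif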
237 
238 int __kmp_get_global_thread_id_reg() {
239  int gtid;
240 
241  if (!__kmp_init_serial) {
242  gtid = KMP_GTID_DNE;
243  } else
244 #ifdef KMP_TDATA_GTID
245  if (TCR_4(__kmp_gtid_mode) >= 3) {
246  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
247  gtid = __kmp_gtid;
248  } else
249 #endif
250  if (TCR_4(__kmp_gtid_mode) >= 2) {
251  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
252  gtid = __kmp_gtid_get_specific();
253  } else {
254  KA_TRACE(1000,
255  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
256  gtid = __kmp_get_global_thread_id();
257  }
258 
259  /* we must be a new uber master sibling thread */
260  if (gtid == KMP_GTID_DNE) {
261  KA_TRACE(10,
262  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
263  "Registering a new gtid.\n"));
264  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
265  if (!__kmp_init_serial) {
266  __kmp_do_serial_initialize();
267  gtid = __kmp_gtid_get_specific();
268  } else {
269  gtid = __kmp_register_root(FALSE);
270  }
271  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
272  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
273  }
274 
275  KMP_DEBUG_ASSERT(gtid >= 0);
276 
277  return gtid;
278 }
279 
280 /* caller must hold forkjoin_lock */
281 void __kmp_check_stack_overlap(kmp_info_t *th) {
282  int f;
283  char *stack_beg = NULL;
284  char *stack_end = NULL;
285  int gtid;
286 
287  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
288  if (__kmp_storage_map) {
289  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
290  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
291 
292  gtid = __kmp_gtid_from_thread(th);
293 
294  if (gtid == KMP_GTID_MONITOR) {
295  __kmp_print_storage_map_gtid(
296  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
297  "th_%s stack (%s)", "mon",
298  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
299  } else {
300  __kmp_print_storage_map_gtid(
301  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
302  "th_%d stack (%s)", gtid,
303  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
304  }
305  }
306 
307  /* No point in checking ubermaster threads since they use refinement and
308  * cannot overlap */
309  gtid = __kmp_gtid_from_thread(th);
310  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
311  KA_TRACE(10,
312  ("__kmp_check_stack_overlap: performing extensive checking\n"));
313  if (stack_beg == NULL) {
314  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
315  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
316  }
317 
318  for (f = 0; f < __kmp_threads_capacity; f++) {
319  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
320 
321  if (f_th && f_th != th) {
322  char *other_stack_end =
323  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
324  char *other_stack_beg =
325  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
326  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
327  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
328 
329  /* Print the other stack values before the abort */
330  if (__kmp_storage_map)
331  __kmp_print_storage_map_gtid(
332  -1, other_stack_beg, other_stack_end,
333  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
334  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
335 
336  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
337  __kmp_msg_null);
338  }
339  }
340  }
341  }
342  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
343 }
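/* The collision test above reduces to a half-open interval check: two stacks,
   viewed as address ranges [beg, end), overlap exactly when each begins below
   the other's end. A tiny sketch of that predicate (the helper name is
   illustrative, not part of the runtime): */
#if 0 /* illustrative sketch only; not compiled into the runtime */
static int example_ranges_overlap(const char *beg1, const char *end1,
                                  const char *beg2, const char *end2) {
  /* Non-overlap means one range ends at or before the other begins;
     negating that gives the condition below. */
  return beg1 < end2 && beg2 < end1;
}
#endif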
344 
345 /* ------------------------------------------------------------------------ */
346 
347 void __kmp_infinite_loop(void) {
348  static int done = FALSE;
349 
350  while (!done) {
351  KMP_YIELD(TRUE);
352  }
353 }
354 
355 #define MAX_MESSAGE 512
356 
357 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
358  char const *format, ...) {
359  char buffer[MAX_MESSAGE];
360  va_list ap;
361 
362  va_start(ap, format);
363  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
364  p2, (unsigned long)size, format);
365  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
366  __kmp_vprintf(kmp_err, buffer, ap);
367 #if KMP_PRINT_DATA_PLACEMENT
368  int node;
369  if (gtid >= 0) {
370  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
371  if (__kmp_storage_map_verbose) {
372  node = __kmp_get_host_node(p1);
373  if (node < 0) /* doesn't work, so don't try this next time */
374  __kmp_storage_map_verbose = FALSE;
375  else {
376  char *last;
377  int lastNode;
378  int localProc = __kmp_get_cpu_from_gtid(gtid);
379 
380  const int page_size = KMP_GET_PAGE_SIZE();
381 
382  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
383  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
384  if (localProc >= 0)
385  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
386  localProc >> 1);
387  else
388  __kmp_printf_no_lock(" GTID %d\n", gtid);
389 #if KMP_USE_PRCTL
390  /* The more elaborate format is disabled for now because of the prctl
391  * hanging bug. */
392  do {
393  last = p1;
394  lastNode = node;
395  /* This loop collates adjacent pages with the same host node. */
396  do {
397  p1 = (char *)p1 + page_size;
398  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
399  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
400  lastNode);
401  } while (p1 <= p2);
402 #else
403  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
404  (char *)p1 + (page_size - 1),
405  __kmp_get_host_node(p1));
406  if (p1 < p2) {
407  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
408  (char *)p2 + (page_size - 1),
409  __kmp_get_host_node(p2));
410  }
411 #endif
412  }
413  }
414  } else
415  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
416  }
417 #endif /* KMP_PRINT_DATA_PLACEMENT */
418  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420  va_end(ap);
421 }
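/* The data-placement report above rounds addresses down to a page boundary
   with the mask (size_t)p & ~(page_size - 1), which is valid only when
   page_size is a power of two, as that mask already assumes. A small
   standalone sketch of the rounding (illustrative name only): */
#if 0 /* illustrative sketch only; not compiled into the runtime */
#include <stddef.h>
#include <stdint.h>

static void *example_page_align_down(void *p, size_t page_size) {
  /* Clears the low log2(page_size) bits, i.e. rounds down to the start of
     the page containing p. Assumes page_size is a power of two. */
  return (void *)((uintptr_t)p & ~((uintptr_t)page_size - 1));
}
#endif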
422 
423 void __kmp_warn(char const *format, ...) {
424  char buffer[MAX_MESSAGE];
425  va_list ap;
426 
427  if (__kmp_generate_warnings == kmp_warnings_off) {
428  return;
429  }
430 
431  va_start(ap, format);
432 
433  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
434  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
435  __kmp_vprintf(kmp_err, buffer, ap);
436  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
437 
438  va_end(ap);
439 }
440 
441 void __kmp_abort_process() {
442  // Later threads may stall here, but that's ok because abort() will kill them.
443  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
444 
445  if (__kmp_debug_buf) {
446  __kmp_dump_debug_buffer();
447  }
448 
449  if (KMP_OS_WINDOWS) {
450  // Let other threads know of abnormal termination and prevent deadlock
451  // if abort happened during library initialization or shutdown
452  __kmp_global.g.g_abort = SIGABRT;
453 
454  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
455  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
456  boxes. _set_abort_behavior() works well, but this function is not
457  available in VS7 (this is not a problem for the DLL, but it is a problem
458  for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
459  does not help, at least in some versions of the MS C RTL.
460 
461  The following sequence seems to be the only way to simulate abort() and
462  avoid the pop-up error box. */
463  raise(SIGABRT);
464  _exit(3); // Just in case, if signal ignored, exit anyway.
465  } else {
466  __kmp_unregister_library();
467  abort();
468  }
469 
470  __kmp_infinite_loop();
471  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
472 
473 } // __kmp_abort_process
474 
475 void __kmp_abort_thread(void) {
476  // TODO: Eliminate g_abort global variable and this function.
477  // In case of abort just call abort(), it will kill all the threads.
478  __kmp_infinite_loop();
479 } // __kmp_abort_thread
480 
481 /* Print out the storage map for the major kmp_info_t thread data structures
482  that are allocated together. */
483 
484 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
485  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
486  gtid);
487 
488  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
489  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
490 
491  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
492  sizeof(kmp_local_t), "th_%d.th_local", gtid);
493 
494  __kmp_print_storage_map_gtid(
495  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
496  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
497 
498  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
499  &thr->th.th_bar[bs_plain_barrier + 1],
500  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
501  gtid);
502 
503  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
504  &thr->th.th_bar[bs_forkjoin_barrier + 1],
505  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
506  gtid);
507 
508 #if KMP_FAST_REDUCTION_BARRIER
509  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
510  &thr->th.th_bar[bs_reduction_barrier + 1],
511  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
512  gtid);
513 #endif // KMP_FAST_REDUCTION_BARRIER
514 }
515 
516 /* Print out the storage map for the major kmp_team_t team data structures
517  that are allocated together. */
518 
519 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
520  int team_id, int num_thr) {
521  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
522  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
523  header, team_id);
524 
525  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
526  &team->t.t_bar[bs_last_barrier],
527  sizeof(kmp_balign_team_t) * bs_last_barrier,
528  "%s_%d.t_bar", header, team_id);
529 
530  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
531  &team->t.t_bar[bs_plain_barrier + 1],
532  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
533  header, team_id);
534 
535  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
536  &team->t.t_bar[bs_forkjoin_barrier + 1],
537  sizeof(kmp_balign_team_t),
538  "%s_%d.t_bar[forkjoin]", header, team_id);
539 
540 #if KMP_FAST_REDUCTION_BARRIER
541  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
542  &team->t.t_bar[bs_reduction_barrier + 1],
543  sizeof(kmp_balign_team_t),
544  "%s_%d.t_bar[reduction]", header, team_id);
545 #endif // KMP_FAST_REDUCTION_BARRIER
546 
547  __kmp_print_storage_map_gtid(
548  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
549  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
550 
551  __kmp_print_storage_map_gtid(
552  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
553  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
554 
555  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
556  &team->t.t_disp_buffer[num_disp_buff],
557  sizeof(dispatch_shared_info_t) * num_disp_buff,
558  "%s_%d.t_disp_buffer", header, team_id);
559 }
560 
561 static void __kmp_init_allocator() {
562  __kmp_init_memkind();
563  __kmp_init_target_mem();
564 }
565 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
566 
567 /* ------------------------------------------------------------------------ */
568 
569 #if ENABLE_LIBOMPTARGET
570 static void __kmp_init_omptarget() {
571  __kmp_init_target_task();
572 }
573 #endif
574 
575 /* ------------------------------------------------------------------------ */
576 
577 #if KMP_DYNAMIC_LIB
578 #if KMP_OS_WINDOWS
579 
580 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
581  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
582 
583  switch (fdwReason) {
584 
585  case DLL_PROCESS_ATTACH:
586  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
587 
588  return TRUE;
589 
590  case DLL_PROCESS_DETACH:
591  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
592 
593  // According to Windows* documentation for DllMain entry point:
594  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
595  // lpReserved == NULL when FreeLibrary() is called,
596  // lpReserved != NULL when the process is terminated.
597  // When FreeLibrary() is called, worker threads remain alive. So the
598  // runtime's state is consistent and executing proper shutdown is OK.
599  // When the process is terminated, worker threads have exited or been
600  // forcefully terminated by the OS and only the shutdown thread remains.
601  // This can leave the runtime in an inconsistent state.
602  // Hence, only attempt proper cleanup when FreeLibrary() is called.
603  // Otherwise, rely on OS to reclaim resources.
604  if (lpReserved == NULL)
605  __kmp_internal_end_library(__kmp_gtid_get_specific());
606 
607  return TRUE;
608 
609  case DLL_THREAD_ATTACH:
610  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
611 
612  /* if we wanted to register new sibling threads on every thread attach,
613  * we would call __kmp_get_gtid() here */
614  return TRUE;
615 
616  case DLL_THREAD_DETACH:
617  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
618 
619  __kmp_internal_end_thread(__kmp_gtid_get_specific());
620  return TRUE;
621  }
622 
623  return TRUE;
624 }
625 
626 #endif /* KMP_OS_WINDOWS */
627 #endif /* KMP_DYNAMIC_LIB */
628 
629 /* __kmp_parallel_deo -- Wait until it's our turn. */
630 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
631  int gtid = *gtid_ref;
632 #ifdef BUILD_PARALLEL_ORDERED
633  kmp_team_t *team = __kmp_team_from_gtid(gtid);
634 #endif /* BUILD_PARALLEL_ORDERED */
635 
636  if (__kmp_env_consistency_check) {
637  if (__kmp_threads[gtid]->th.th_root->r.r_active)
638 #if KMP_USE_DYNAMIC_LOCK
639  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
640 #else
641  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
642 #endif
643  }
644 #ifdef BUILD_PARALLEL_ORDERED
645  if (!team->t.t_serialized) {
646  KMP_MB();
647  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
648  NULL);
649  KMP_MB();
650  }
651 #endif /* BUILD_PARALLEL_ORDERED */
652 }
653 
654 /* __kmp_parallel_dxo -- Signal the next task. */
655 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
656  int gtid = *gtid_ref;
657 #ifdef BUILD_PARALLEL_ORDERED
658  int tid = __kmp_tid_from_gtid(gtid);
659  kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
661 
662  if (__kmp_env_consistency_check) {
663  if (__kmp_threads[gtid]->th.th_root->r.r_active)
664  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
665  }
666 #ifdef BUILD_PARALLEL_ORDERED
667  if (!team->t.t_serialized) {
668  KMP_MB(); /* Flush all pending memory write invalidates. */
669 
670  /* use the tid of the next thread in this team */
671  /* TODO replace with general release procedure */
672  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
673 
674  KMP_MB(); /* Flush all pending memory write invalidates. */
675  }
676 #endif /* BUILD_PARALLEL_ORDERED */
677 }
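/* __kmp_parallel_deo/__kmp_parallel_dxo implement ordered execution as a
   ticket-style handoff: each thread waits until the team's shared counter
   equals its tid, then on exit publishes (tid + 1) % nproc to release the
   next thread. A minimal sketch of that protocol using std::atomic (the
   names are illustrative, not runtime APIs): */
#if 0 /* illustrative sketch only; not compiled into the runtime */
#include <atomic>
#include <thread>

struct example_ordered {
  std::atomic<int> turn{0};
  int nproc;
  explicit example_ordered(int n) : nproc(n) {}

  void enter(int tid) { /* analogous to __kmp_parallel_deo */
    while (turn.load(std::memory_order_acquire) != tid)
      std::this_thread::yield();
  }
  void exit(int tid) { /* analogous to __kmp_parallel_dxo */
    turn.store((tid + 1) % nproc, std::memory_order_release);
  }
};
#endif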
678 
679 /* ------------------------------------------------------------------------ */
680 /* The BARRIER for a SINGLE process section is always explicit */
681 
682 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
683  int status;
684  kmp_info_t *th;
685  kmp_team_t *team;
686 
687  if (!TCR_4(__kmp_init_parallel))
688  __kmp_parallel_initialize();
689  __kmp_resume_if_soft_paused();
690 
691  th = __kmp_threads[gtid];
692  team = th->th.th_team;
693  status = 0;
694 
695  th->th.th_ident = id_ref;
696 
697  if (team->t.t_serialized) {
698  status = 1;
699  } else {
700  kmp_int32 old_this = th->th.th_local.this_construct;
701 
702  ++th->th.th_local.this_construct;
703  /* try to set team count to thread count--success means thread got the
704  single block */
705  /* TODO: Should this be acquire or release? */
706  if (team->t.t_construct == old_this) {
707  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
708  th->th.th_local.this_construct);
709  }
710 #if USE_ITT_BUILD
711  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
712  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
713  team->t.t_active_level == 1) {
714  // Only report metadata by primary thread of active team at level 1
715  __kmp_itt_metadata_single(id_ref);
716  }
717 #endif /* USE_ITT_BUILD */
718  }
719 
720  if (__kmp_env_consistency_check) {
721  if (status && push_ws) {
722  __kmp_push_workshare(gtid, ct_psingle, id_ref);
723  } else {
724  __kmp_check_workshare(gtid, ct_psingle, id_ref);
725  }
726  }
727 #if USE_ITT_BUILD
728  if (status) {
729  __kmp_itt_single_start(gtid);
730  }
731 #endif /* USE_ITT_BUILD */
732  return status;
733 }
734 
735 void __kmp_exit_single(int gtid) {
736 #if USE_ITT_BUILD
737  __kmp_itt_single_end(gtid);
738 #endif /* USE_ITT_BUILD */
739  if (__kmp_env_consistency_check)
740  __kmp_pop_workshare(gtid, ct_psingle, NULL);
741 }
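/* __kmp_enter_single elects the thread that executes the single block with an
   atomic compare-and-store: every thread advances its private construct
   counter, and the one that succeeds in moving the shared team counter from
   the old value to the new one wins. A minimal sketch of that election, under
   the assumption that all private counters start in sync (illustrative names
   only): */
#if 0 /* illustrative sketch only; not compiled into the runtime */
#include <atomic>

/* Returns true for exactly one caller per construct instance. */
static bool example_enter_single(std::atomic<int> &team_construct,
                                 int &my_construct) {
  int old_value = my_construct++; /* the count before this construct */
  return team_construct.compare_exchange_strong(old_value, my_construct);
}
#endif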
742 
743 /* Determine whether we can go parallel or must use a serialized parallel
744  * region, and how many threads we can use.
745  * set_nthreads is the number of threads requested for the team.
746  * Returns 1 if we should serialize or only use one thread,
747  * otherwise the number of threads to use.
748  * The forkjoin lock is held by the caller. */
749 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
750  int master_tid, int set_nthreads,
751  int enter_teams) {
752  int capacity;
753  int new_nthreads;
754  KMP_DEBUG_ASSERT(__kmp_init_serial);
755  KMP_DEBUG_ASSERT(root && parent_team);
756  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
757 
758  // If dyn-var is set, dynamically adjust the number of desired threads,
759  // according to the method specified by dynamic_mode.
760  new_nthreads = set_nthreads;
761  if (!get__dynamic_2(parent_team, master_tid)) {
762  ;
763  }
764 #ifdef USE_LOAD_BALANCE
765  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
766  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
767  if (new_nthreads == 1) {
768  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
769  "reservation to 1 thread\n",
770  master_tid));
771  return 1;
772  }
773  if (new_nthreads < set_nthreads) {
774  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
775  "reservation to %d threads\n",
776  master_tid, new_nthreads));
777  }
778  }
779 #endif /* USE_LOAD_BALANCE */
780  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
781  new_nthreads = __kmp_avail_proc - __kmp_nth +
782  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
783  if (new_nthreads <= 1) {
784  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
785  "reservation to 1 thread\n",
786  master_tid));
787  return 1;
788  }
789  if (new_nthreads < set_nthreads) {
790  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
791  "reservation to %d threads\n",
792  master_tid, new_nthreads));
793  } else {
794  new_nthreads = set_nthreads;
795  }
796  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
797  if (set_nthreads > 2) {
798  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
799  new_nthreads = (new_nthreads % set_nthreads) + 1;
800  if (new_nthreads == 1) {
801  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
802  "reservation to 1 thread\n",
803  master_tid));
804  return 1;
805  }
806  if (new_nthreads < set_nthreads) {
807  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
808  "reservation to %d threads\n",
809  master_tid, new_nthreads));
810  }
811  }
812  } else {
813  KMP_ASSERT(0);
814  }
815 
816  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
817  if (__kmp_nth + new_nthreads -
818  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
819  __kmp_max_nth) {
820  int tl_nthreads = __kmp_max_nth - __kmp_nth +
821  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
822  if (tl_nthreads <= 0) {
823  tl_nthreads = 1;
824  }
825 
826  // If dyn-var is false, emit a 1-time warning.
827  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
828  __kmp_reserve_warn = 1;
829  __kmp_msg(kmp_ms_warning,
830  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
831  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
832  }
833  if (tl_nthreads == 1) {
834  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
835  "reduced reservation to 1 thread\n",
836  master_tid));
837  return 1;
838  }
839  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
840  "reservation to %d threads\n",
841  master_tid, tl_nthreads));
842  new_nthreads = tl_nthreads;
843  }
844 
845  // Respect OMP_THREAD_LIMIT
846  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
847  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
848  if (cg_nthreads + new_nthreads -
849  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
850  max_cg_threads) {
851  int tl_nthreads = max_cg_threads - cg_nthreads +
852  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
853  if (tl_nthreads <= 0) {
854  tl_nthreads = 1;
855  }
856 
857  // If dyn-var is false, emit a 1-time warning.
858  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
859  __kmp_reserve_warn = 1;
860  __kmp_msg(kmp_ms_warning,
861  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
862  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
863  }
864  if (tl_nthreads == 1) {
865  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
866  "reduced reservation to 1 thread\n",
867  master_tid));
868  return 1;
869  }
870  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
871  "reservation to %d threads\n",
872  master_tid, tl_nthreads));
873  new_nthreads = tl_nthreads;
874  }
875 
876  // Check if the threads array is large enough, or needs expanding.
877  // See comment in __kmp_register_root() about the adjustment if
878  // __kmp_threads[0] == NULL.
879  capacity = __kmp_threads_capacity;
880  if (TCR_PTR(__kmp_threads[0]) == NULL) {
881  --capacity;
882  }
883  // If it is not for initializing the hidden helper team, we need to take
884  // __kmp_hidden_helper_threads_num out of the capacity because it is included
885  // in __kmp_threads_capacity.
886  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
887  capacity -= __kmp_hidden_helper_threads_num;
888  }
889  if (__kmp_nth + new_nthreads -
890  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
891  capacity) {
892  // Expand the threads array.
893  int slotsRequired = __kmp_nth + new_nthreads -
894  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
895  capacity;
896  int slotsAdded = __kmp_expand_threads(slotsRequired);
897  if (slotsAdded < slotsRequired) {
898  // The threads array was not expanded enough.
899  new_nthreads -= (slotsRequired - slotsAdded);
900  KMP_ASSERT(new_nthreads >= 1);
901 
902  // If dyn-var is false, emit a 1-time warning.
903  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
904  __kmp_reserve_warn = 1;
905  if (__kmp_tp_cached) {
906  __kmp_msg(kmp_ms_warning,
907  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
908  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
909  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
910  } else {
911  __kmp_msg(kmp_ms_warning,
912  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
913  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
914  }
915  }
916  }
917  }
918 
919 #ifdef KMP_DEBUG
920  if (new_nthreads == 1) {
921  KC_TRACE(10,
922  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
923  "dead roots and rechecking; requested %d threads\n",
924  __kmp_get_gtid(), set_nthreads));
925  } else {
926  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
927  " %d threads\n",
928  __kmp_get_gtid(), new_nthreads, set_nthreads));
929  }
930 #endif // KMP_DEBUG
931  return new_nthreads;
932 }
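/* Each limit applied in __kmp_reserve_threads follows the same clamp: the new
   team reuses threads that are already counted against the limit (the root's
   active thread or the hot team), so the headroom is
   limit - currently_counted + reused, never less than 1. A small sketch of
   that arithmetic (illustrative names only): */
#if 0 /* illustrative sketch only; not compiled into the runtime */
static int example_clamp_team_size(int requested, int limit,
                                   int currently_counted, int reused) {
  int headroom = limit - currently_counted + reused;
  if (headroom <= 0)
    headroom = 1; /* the primary thread can always run alone */
  return requested < headroom ? requested : headroom;
}
#endif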
933 
934 /* Allocate threads from the thread pool and assign them to the new team. We
935  are assured that there are enough threads available, because we checked
936  earlier, inside the forkjoin critical section. */
937 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
938  kmp_info_t *master_th, int master_gtid,
939  int fork_teams_workers) {
940  int i;
941  int use_hot_team;
942 
943  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
944  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
945  KMP_MB();
946 
947  /* first, let's set up the primary thread */
948  master_th->th.th_info.ds.ds_tid = 0;
949  master_th->th.th_team = team;
950  master_th->th.th_team_nproc = team->t.t_nproc;
951  master_th->th.th_team_master = master_th;
952  master_th->th.th_team_serialized = FALSE;
953  master_th->th.th_dispatch = &team->t.t_dispatch[0];
954 
955 /* make sure we are not the optimized hot team */
956 #if KMP_NESTED_HOT_TEAMS
957  use_hot_team = 0;
958  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
959  if (hot_teams) { // hot teams array is not allocated if
960  // KMP_HOT_TEAMS_MAX_LEVEL=0
961  int level = team->t.t_active_level - 1; // index in array of hot teams
962  if (master_th->th.th_teams_microtask) { // are we inside the teams?
963  if (master_th->th.th_teams_size.nteams > 1) {
964  ++level; // level was not increased in teams construct for
965  // team_of_masters
966  }
967  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
968  master_th->th.th_teams_level == team->t.t_level) {
969  ++level; // level was not increased in teams construct for
970  // team_of_workers before the parallel
971  } // team->t.t_level will be increased inside parallel
972  }
973  if (level < __kmp_hot_teams_max_level) {
974  if (hot_teams[level].hot_team) {
975  // hot team has already been allocated for given level
976  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
977  use_hot_team = 1; // the team is ready to use
978  } else {
979  use_hot_team = 0; // AC: threads are not allocated yet
980  hot_teams[level].hot_team = team; // remember new hot team
981  hot_teams[level].hot_team_nth = team->t.t_nproc;
982  }
983  } else {
984  use_hot_team = 0;
985  }
986  }
987 #else
988  use_hot_team = team == root->r.r_hot_team;
989 #endif
990  if (!use_hot_team) {
991 
992  /* install the primary thread */
993  team->t.t_threads[0] = master_th;
994  __kmp_initialize_info(master_th, team, 0, master_gtid);
995 
996  /* now, install the worker threads */
997  for (i = 1; i < team->t.t_nproc; i++) {
998 
999  /* fork or reallocate a new thread and install it in team */
1000  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1001  team->t.t_threads[i] = thr;
1002  KMP_DEBUG_ASSERT(thr);
1003  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1004  /* align team and thread arrived states */
1005  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1006  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1007  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1008  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1009  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1010  team->t.t_bar[bs_plain_barrier].b_arrived));
1011  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1012  thr->th.th_teams_level = master_th->th.th_teams_level;
1013  thr->th.th_teams_size = master_th->th.th_teams_size;
1014  { // Initialize threads' barrier data.
1015  int b;
1016  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1017  for (b = 0; b < bs_last_barrier; ++b) {
1018  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1019  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1020 #if USE_DEBUGGER
1021  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1022 #endif
1023  }
1024  }
1025  }
1026 
1027 #if KMP_AFFINITY_SUPPORTED
1028  // Do not partition the places list for teams construct workers who
1029  // haven't actually been forked to do real work yet. This partitioning
1030  // will take place in the parallel region nested within the teams construct.
1031  if (!fork_teams_workers) {
1032  __kmp_partition_places(team);
1033  }
1034 #endif
1035 
1036  if (team->t.t_nproc > 1 &&
1037  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1038  team->t.b->update_num_threads(team->t.t_nproc);
1039  __kmp_add_threads_to_team(team, team->t.t_nproc);
1040  }
1041  }
1042 
1043  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1044  for (i = 0; i < team->t.t_nproc; i++) {
1045  kmp_info_t *thr = team->t.t_threads[i];
1046  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1047  thr->th.th_prev_level != team->t.t_level) {
1048  team->t.t_display_affinity = 1;
1049  break;
1050  }
1051  }
1052  }
1053 
1054  KMP_MB();
1055 }
1056 
1057 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1058 // Propagate any changes to the floating point control registers out to the team.
1059 // We try to avoid unnecessary writes to the relevant cache line in the team
1060 // structure, so we don't make changes unless they are needed.
1061 inline static void propagateFPControl(kmp_team_t *team) {
1062  if (__kmp_inherit_fp_control) {
1063  kmp_int16 x87_fpu_control_word;
1064  kmp_uint32 mxcsr;
1065 
1066  // Get primary thread's values of FPU control flags (both X87 and vector)
1067  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1068  __kmp_store_mxcsr(&mxcsr);
1069  mxcsr &= KMP_X86_MXCSR_MASK;
1070 
1071  // There is no point looking at t_fp_control_saved here.
1072  // If it is TRUE, we still have to update the values if they are different
1073  // from those we now have. If it is FALSE we didn't save anything yet, but
1074  // our objective is the same. We have to ensure that the values in the team
1075  // are the same as those we have.
1076  // So, this code achieves what we need whether or not t_fp_control_saved is
1077  // true. By checking whether the value needs updating we avoid unnecessary
1078  // writes that would put the cache-line into a written state, causing all
1079  // threads in the team to have to read it again.
1080  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1081  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1082  // Although we don't use this value, other code in the runtime wants to know
1083  // whether it should restore them. So we must ensure it is correct.
1084  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1085  } else {
1086  // Similarly here. Don't write to this cache-line in the team structure
1087  // unless we have to.
1088  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1089  }
1090 }
1091 
1092 // Do the opposite, setting the hardware registers to the updated values from
1093 // the team.
1094 inline static void updateHWFPControl(kmp_team_t *team) {
1095  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1096  // Only reset the fp control regs if they have been changed in the team by
1097  // the parallel region that we are exiting.
1098  kmp_int16 x87_fpu_control_word;
1099  kmp_uint32 mxcsr;
1100  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1101  __kmp_store_mxcsr(&mxcsr);
1102  mxcsr &= KMP_X86_MXCSR_MASK;
1103 
1104  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1105  __kmp_clear_x87_fpu_status_word();
1106  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1107  }
1108 
1109  if (team->t.t_mxcsr != mxcsr) {
1110  __kmp_load_mxcsr(&team->t.t_mxcsr);
1111  }
1112  }
1113 }
1114 #else
1115 #define propagateFPControl(x) ((void)0)
1116 #define updateHWFPControl(x) ((void)0)
1117 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
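/* propagateFPControl and updateHWFPControl rely on a check-before-write
   pattern (KMP_CHECK_UPDATE): a field that every worker reads is stored to
   only when its value would actually change, so the shared cache line is not
   needlessly pulled into a modified state. A minimal sketch of that idiom
   (illustrative name only): */
#if 0 /* illustrative sketch only; not compiled into the runtime */
template <typename T>
static inline void example_check_update(T &shared_field, const T &new_value) {
  /* Read first; write only on a real change, so readers on other cores keep
     their cached copy valid. */
  if (shared_field != new_value)
    shared_field = new_value;
}
#endif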
1118 
1119 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1120  int realloc); // forward declaration
1121 
1122 /* Run a parallel region that has been serialized, so it runs only in a team
1123  of the single primary thread. */
1124 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1125  kmp_info_t *this_thr;
1126  kmp_team_t *serial_team;
1127 
1128  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1129 
1130  /* Skip all this code for autopar serialized loops since it results in
1131  unacceptable overhead */
1132  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1133  return;
1134 
1135  if (!TCR_4(__kmp_init_parallel))
1136  __kmp_parallel_initialize();
1137  __kmp_resume_if_soft_paused();
1138 
1139  this_thr = __kmp_threads[global_tid];
1140  serial_team = this_thr->th.th_serial_team;
1141 
1142  /* utilize the serialized team held by this thread */
1143  KMP_DEBUG_ASSERT(serial_team);
1144  KMP_MB();
1145 
1146  if (__kmp_tasking_mode != tskm_immediate_exec) {
1147  KMP_DEBUG_ASSERT(
1148  this_thr->th.th_task_team ==
1149  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1150  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1151  NULL);
1152  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1153  "team %p, new task_team = NULL\n",
1154  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1155  this_thr->th.th_task_team = NULL;
1156  }
1157 
1158  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1159  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1160  proc_bind = proc_bind_false;
1161  } else if (proc_bind == proc_bind_default) {
1162  // No proc_bind clause was specified, so use the current value
1163  // of proc-bind-var for this parallel region.
1164  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1165  }
1166  // Reset for next parallel region
1167  this_thr->th.th_set_proc_bind = proc_bind_default;
1168 
1169  // Reset num_threads for next parallel region
1170  this_thr->th.th_set_nproc = 0;
1171 
1172 #if OMPT_SUPPORT
1173  ompt_data_t ompt_parallel_data = ompt_data_none;
1174  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1175  if (ompt_enabled.enabled &&
1176  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1177 
1178  ompt_task_info_t *parent_task_info;
1179  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1180 
1181  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1182  if (ompt_enabled.ompt_callback_parallel_begin) {
1183  int team_size = 1;
1184 
1185  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1186  &(parent_task_info->task_data), &(parent_task_info->frame),
1187  &ompt_parallel_data, team_size,
1188  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1189  }
1190  }
1191 #endif // OMPT_SUPPORT
1192 
1193  if (this_thr->th.th_team != serial_team) {
1194  // Nested level will be an index in the nested nthreads array
1195  int level = this_thr->th.th_team->t.t_level;
1196 
1197  if (serial_team->t.t_serialized) {
1198  /* this serial team was already used
1199  TODO: increase performance by making these locks more specific */
1200  kmp_team_t *new_team;
1201 
1202  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1203 
1204  new_team =
1205  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1206 #if OMPT_SUPPORT
1207  ompt_parallel_data,
1208 #endif
1209  proc_bind, &this_thr->th.th_current_task->td_icvs,
1210  0 USE_NESTED_HOT_ARG(NULL));
1211  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1212  KMP_ASSERT(new_team);
1213 
1214  /* setup new serialized team and install it */
1215  new_team->t.t_threads[0] = this_thr;
1216  new_team->t.t_parent = this_thr->th.th_team;
1217  serial_team = new_team;
1218  this_thr->th.th_serial_team = serial_team;
1219 
1220  KF_TRACE(
1221  10,
1222  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1223  global_tid, serial_team));
1224 
1225  /* TODO: the above breaks the requirement that, if we run out of resources,
1226  we can still guarantee that serialized teams are OK, since we may
1227  need to allocate a new one */
1228  } else {
1229  KF_TRACE(
1230  10,
1231  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1232  global_tid, serial_team));
1233  }
1234 
1235  /* we have to initialize this serial team */
1236  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1237  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1238  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1239  serial_team->t.t_ident = loc;
1240  serial_team->t.t_serialized = 1;
1241  serial_team->t.t_nproc = 1;
1242  serial_team->t.t_parent = this_thr->th.th_team;
1243  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1244  this_thr->th.th_team = serial_team;
1245  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1246 
1247  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1248  this_thr->th.th_current_task));
1249  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1250  this_thr->th.th_current_task->td_flags.executing = 0;
1251 
1252  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1253 
1254  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1255  implicit task for each serialized task represented by
1256  team->t.t_serialized? */
1257  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1258  &this_thr->th.th_current_task->td_parent->td_icvs);
1259 
1260  // Thread value exists in the nested nthreads array for the next nested
1261  // level
1262  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1263  this_thr->th.th_current_task->td_icvs.nproc =
1264  __kmp_nested_nth.nth[level + 1];
1265  }
1266 
1267  if (__kmp_nested_proc_bind.used &&
1268  (level + 1 < __kmp_nested_proc_bind.used)) {
1269  this_thr->th.th_current_task->td_icvs.proc_bind =
1270  __kmp_nested_proc_bind.bind_types[level + 1];
1271  }
1272 
1273 #if USE_DEBUGGER
1274  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1275 #endif
1276  this_thr->th.th_info.ds.ds_tid = 0;
1277 
1278  /* set thread cache values */
1279  this_thr->th.th_team_nproc = 1;
1280  this_thr->th.th_team_master = this_thr;
1281  this_thr->th.th_team_serialized = 1;
1282 
1283  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1284  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1285  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1286 
1287  propagateFPControl(serial_team);
1288 
1289  /* check if we need to allocate dispatch buffers stack */
1290  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1291  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1292  serial_team->t.t_dispatch->th_disp_buffer =
1293  (dispatch_private_info_t *)__kmp_allocate(
1294  sizeof(dispatch_private_info_t));
1295  }
1296  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1297 
1298  KMP_MB();
1299 
1300  } else {
1301  /* this serialized team is already being used,
1302  * that's fine, just add another nested level */
1303  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1304  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1305  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1306  ++serial_team->t.t_serialized;
1307  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1308 
1309  // Nested level will be an index in the nested nthreads array
1310  int level = this_thr->th.th_team->t.t_level;
1311  // Thread value exists in the nested nthreads array for the next nested
1312  // level
1313  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1314  this_thr->th.th_current_task->td_icvs.nproc =
1315  __kmp_nested_nth.nth[level + 1];
1316  }
1317  serial_team->t.t_level++;
1318  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1319  "of serial team %p to %d\n",
1320  global_tid, serial_team, serial_team->t.t_level));
1321 
1322  /* allocate/push dispatch buffers stack */
1323  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1324  {
1325  dispatch_private_info_t *disp_buffer =
1326  (dispatch_private_info_t *)__kmp_allocate(
1327  sizeof(dispatch_private_info_t));
1328  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1329  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1330  }
1331  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1332 
1333  KMP_MB();
1334  }
1335  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1336 
1337  // Perform the display affinity functionality for
1338  // serialized parallel regions
1339  if (__kmp_display_affinity) {
1340  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1341  this_thr->th.th_prev_num_threads != 1) {
1342  // NULL means use the affinity-format-var ICV
1343  __kmp_aux_display_affinity(global_tid, NULL);
1344  this_thr->th.th_prev_level = serial_team->t.t_level;
1345  this_thr->th.th_prev_num_threads = 1;
1346  }
1347  }
1348 
1349  if (__kmp_env_consistency_check)
1350  __kmp_push_parallel(global_tid, NULL);
1351 #if OMPT_SUPPORT
1352  serial_team->t.ompt_team_info.master_return_address = codeptr;
1353  if (ompt_enabled.enabled &&
1354  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1355  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1356  OMPT_GET_FRAME_ADDRESS(0);
1357 
1358  ompt_lw_taskteam_t lw_taskteam;
1359  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1360  &ompt_parallel_data, codeptr);
1361 
1362  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1363  // Don't use lw_taskteam after linking. Content was swapped.
1364 
1365  /* OMPT implicit task begin */
1366  if (ompt_enabled.ompt_callback_implicit_task) {
1367  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1368  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1369  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1370  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1371  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1372  __kmp_tid_from_gtid(global_tid);
1373  }
1374 
1375  /* OMPT state */
1376  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1377  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1378  OMPT_GET_FRAME_ADDRESS(0);
1379  }
1380 #endif
1381 }
1382 
1383 // Test if this fork is for a team closely nested in a teams construct
1384 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1385  microtask_t microtask, int level,
1386  int teams_level, kmp_va_list ap) {
1387  return (master_th->th.th_teams_microtask && ap &&
1388  microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1389 }
1390 
1391 // Test if this fork is for the teams construct, i.e. to form the outer league
1392 // of teams
1393 static inline bool __kmp_is_entering_teams(int active_level, int level,
1394  int teams_level, kmp_va_list ap) {
1395  return ((ap == NULL && active_level == 0) ||
1396  (ap && teams_level > 0 && teams_level == level));
1397 }
1398 
1399 // AC: This is the start of a parallel region nested inside a teams construct.
1400 // The team is actual (hot): all workers are ready at the fork barrier.
1401 // No lock is needed to initialize the team a bit and then release the workers.
1402 static inline int
1403 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1404  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1405  enum fork_context_e call_context, microtask_t microtask,
1406  launch_t invoker, int master_set_numthreads, int level,
1407 #if OMPT_SUPPORT
1408  ompt_data_t ompt_parallel_data, void *return_address,
1409 #endif
1410  kmp_va_list ap) {
1411  void **argv;
1412  int i;
1413 
1414  parent_team->t.t_ident = loc;
1415  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1416  parent_team->t.t_argc = argc;
1417  argv = (void **)parent_team->t.t_argv;
1418  for (i = argc - 1; i >= 0; --i) {
1419  *argv++ = va_arg(kmp_va_deref(ap), void *);
1420  }
1421  // Increment our nested depth level, but do not increase the serialization count
1422  if (parent_team == master_th->th.th_serial_team) {
1423  // AC: we are in serialized parallel
1424  __kmpc_serialized_parallel(loc, gtid);
1425  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1426 
1427  if (call_context == fork_context_gnu) {
1428  // AC: need to decrement t_serialized for enquiry functions to work
1429  // correctly, will restore at join time
1430  parent_team->t.t_serialized--;
1431  return TRUE;
1432  }
1433 
1434 #if OMPD_SUPPORT
1435  parent_team->t.t_pkfn = microtask;
1436 #endif
1437 
1438 #if OMPT_SUPPORT
1439  void *dummy;
1440  void **exit_frame_p;
1441  ompt_data_t *implicit_task_data;
1442  ompt_lw_taskteam_t lw_taskteam;
1443 
1444  if (ompt_enabled.enabled) {
1445  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1446  &ompt_parallel_data, return_address);
1447  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1448 
1449  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1450  // Don't use lw_taskteam after linking. Content was swapped.
1451 
1452  /* OMPT implicit task begin */
1453  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1454  if (ompt_enabled.ompt_callback_implicit_task) {
1455  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1456  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1457  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1458  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1459  }
1460 
1461  /* OMPT state */
1462  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1463  } else {
1464  exit_frame_p = &dummy;
1465  }
1466 #endif
1467 
1468  // AC: need to decrement t_serialized for enquiry functions to work
1469  // correctly, will restore at join time
1470  parent_team->t.t_serialized--;
1471 
1472  {
1473  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1474  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1475  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1476 #if OMPT_SUPPORT
1477  ,
1478  exit_frame_p
1479 #endif
1480  );
1481  }
1482 
1483 #if OMPT_SUPPORT
1484  if (ompt_enabled.enabled) {
1485  *exit_frame_p = NULL;
1486  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1487  if (ompt_enabled.ompt_callback_implicit_task) {
1488  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1489  ompt_scope_end, NULL, implicit_task_data, 1,
1490  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1491  }
1492  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1493  __ompt_lw_taskteam_unlink(master_th);
1494  if (ompt_enabled.ompt_callback_parallel_end) {
1495  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1496  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1497  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1498  }
1499  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1500  }
1501 #endif
1502  return TRUE;
1503  }
1504 
1505  parent_team->t.t_pkfn = microtask;
1506  parent_team->t.t_invoke = invoker;
1507  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1508  parent_team->t.t_active_level++;
1509  parent_team->t.t_level++;
1510  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1511 
1512  // If the threads allocated to the team are less than the thread limit, update
1513  // the thread limit here. th_teams_size.nth is specific to this team nested
1514  // in a teams construct, the team is fully created, and we're about to do
1515  // the actual fork. Best to do this here so that the subsequent uses below
1516  // and in the join have the correct value.
1517  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1518 
1519 #if OMPT_SUPPORT
1520  if (ompt_enabled.enabled) {
1521  ompt_lw_taskteam_t lw_taskteam;
1522  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1523  return_address);
1524  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1525  }
1526 #endif
1527 
1528  /* Change number of threads in the team if requested */
1529  if (master_set_numthreads) { // The parallel has num_threads clause
1530  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1531  // AC: we can only reduce the number of threads dynamically; we can't increase it
1532  kmp_info_t **other_threads = parent_team->t.t_threads;
1533  // NOTE: if using distributed barrier, we need to run this code block
1534  // even when the team size appears not to have changed from the max.
1535  int old_proc = master_th->th.th_teams_size.nth;
1536  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1537  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1538  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1539  }
1540  parent_team->t.t_nproc = master_set_numthreads;
1541  for (i = 0; i < master_set_numthreads; ++i) {
1542  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1543  }
1544  }
1545  // Keep extra threads hot in the team for possible next parallels
1546  master_th->th.th_set_nproc = 0;
1547  }
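  // Illustrative sketch, not part of this file: the num_threads clause on a
  // parallel nested in a teams construct can only shrink the team below
  // th_teams_size.nth, never grow it. Assuming the thread_limit below is
  // honored, user code hitting this path looks roughly like:
  //
  //   #pragma omp teams num_teams(2) thread_limit(8)
  //   {
  //     #pragma omp parallel num_threads(4)   // team reduced from 8 to 4
  //     { /* ... */ }
  //     #pragma omp parallel num_threads(16)  // request above the limit:
  //     { /* ... */ }                         //   size stays at most 8
  //   }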
1548 
1549 #if USE_DEBUGGER
1550  if (__kmp_debugging) { // Let debugger override number of threads.
1551  int nth = __kmp_omp_num_threads(loc);
1552  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1553  master_set_numthreads = nth;
1554  }
1555  }
1556 #endif
1557 
1558  // Figure out the proc_bind policy for the nested parallel within teams
1559  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1560  // proc_bind_default means don't update
1561  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1562  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1563  proc_bind = proc_bind_false;
1564  } else {
1565  // No proc_bind clause specified; use current proc-bind-var
1566  if (proc_bind == proc_bind_default) {
1567  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1568  }
1569  /* else: The proc_bind policy was specified explicitly on parallel clause.
1570  This overrides proc-bind-var for this parallel region, but does not
1571  change proc-bind-var. */
1572  // Figure the value of proc-bind-var for the child threads.
1573  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1574  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1575  master_th->th.th_current_task->td_icvs.proc_bind)) {
1576  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1577  }
1578  }
1579  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1580  // Need to change the bind-var ICV to correct value for each implicit task
1581  if (proc_bind_icv != proc_bind_default &&
1582  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1583  kmp_info_t **other_threads = parent_team->t.t_threads;
1584  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1585  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1586  }
1587  }
1588  // Reset for next parallel region
1589  master_th->th.th_set_proc_bind = proc_bind_default;
1590 
1591 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1592  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1593  KMP_ITT_DEBUG) &&
1594  __kmp_forkjoin_frames_mode == 3 &&
1595  parent_team->t.t_active_level == 1 // only report frames at level 1
1596  && master_th->th.th_teams_size.nteams == 1) {
1597  kmp_uint64 tmp_time = __itt_get_timestamp();
1598  master_th->th.th_frame_time = tmp_time;
1599  parent_team->t.t_region_time = tmp_time;
1600  }
1601  if (__itt_stack_caller_create_ptr) {
1602  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1603  // create new stack stitching id before entering fork barrier
1604  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1605  }
1606 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1607 #if KMP_AFFINITY_SUPPORTED
1608  __kmp_partition_places(parent_team);
1609 #endif
1610 
1611  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1612  "master_th=%p, gtid=%d\n",
1613  root, parent_team, master_th, gtid));
1614  __kmp_internal_fork(loc, gtid, parent_team);
1615  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1616  "master_th=%p, gtid=%d\n",
1617  root, parent_team, master_th, gtid));
1618 
1619  if (call_context == fork_context_gnu)
1620  return TRUE;
1621 
1622  /* Invoke microtask for PRIMARY thread */
1623  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1624  parent_team->t.t_id, parent_team->t.t_pkfn));
1625 
1626  if (!parent_team->t.t_invoke(gtid)) {
1627  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1628  }
1629  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1630  parent_team->t.t_id, parent_team->t.t_pkfn));
1631  KMP_MB(); /* Flush all pending memory write invalidates. */
1632 
1633  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1634 
1635  return TRUE;
1636 }
1637 
1638 // Create a serialized parallel region
1639 static inline int
1640 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1641  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1642  kmp_info_t *master_th, kmp_team_t *parent_team,
1643 #if OMPT_SUPPORT
1644  ompt_data_t *ompt_parallel_data, void **return_address,
1645  ompt_data_t **parent_task_data,
1646 #endif
1647  kmp_va_list ap) {
1648  kmp_team_t *team;
1649  int i;
1650  void **argv;
1651 
1652 /* josh todo: hypothetical question: what do we do for OS X*? */
1653 #if KMP_OS_LINUX && \
1654  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1655  void *args[argc];
1656 #else
1657  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1658 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1659  KMP_ARCH_AARCH64) */
1660 
1661  KA_TRACE(
1662  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1663 
1664  __kmpc_serialized_parallel(loc, gtid);
1665 
1666 #if OMPD_SUPPORT
1667  master_th->th.th_serial_team->t.t_pkfn = microtask;
1668 #endif
1669 
1670  if (call_context == fork_context_intel) {
1671  /* TODO this sucks, use the compiler itself to pass args! :) */
1672  master_th->th.th_serial_team->t.t_ident = loc;
1673  if (!ap) {
1674  // revert change made in __kmpc_serialized_parallel()
1675  master_th->th.th_serial_team->t.t_level--;
1676 // Get args from parent team for teams construct
1677 
1678 #if OMPT_SUPPORT
1679  void *dummy;
1680  void **exit_frame_p;
1681  ompt_task_info_t *task_info;
1682  ompt_lw_taskteam_t lw_taskteam;
1683 
1684  if (ompt_enabled.enabled) {
1685  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1686  ompt_parallel_data, *return_address);
1687 
1688  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1689  // don't use lw_taskteam after linking. content was swapped
1690  task_info = OMPT_CUR_TASK_INFO(master_th);
1691  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1692  if (ompt_enabled.ompt_callback_implicit_task) {
1693  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1694  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1695  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1696  &(task_info->task_data), 1,
1697  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1698  }
1699 
1700  /* OMPT state */
1701  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1702  } else {
1703  exit_frame_p = &dummy;
1704  }
1705 #endif
1706 
1707  {
1708  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1709  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1710  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1711 #if OMPT_SUPPORT
1712  ,
1713  exit_frame_p
1714 #endif
1715  );
1716  }
1717 
1718 #if OMPT_SUPPORT
1719  if (ompt_enabled.enabled) {
1720  *exit_frame_p = NULL;
1721  if (ompt_enabled.ompt_callback_implicit_task) {
1722  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1723  ompt_scope_end, NULL, &(task_info->task_data), 1,
1724  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1725  }
1726  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1727  __ompt_lw_taskteam_unlink(master_th);
1728  if (ompt_enabled.ompt_callback_parallel_end) {
1729  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1730  ompt_parallel_data, *parent_task_data,
1731  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1732  }
1733  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1734  }
1735 #endif
1736  } else if (microtask == (microtask_t)__kmp_teams_master) {
1737  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1738  team = master_th->th.th_team;
1739  // team->t.t_pkfn = microtask;
1740  team->t.t_invoke = invoker;
1741  __kmp_alloc_argv_entries(argc, team, TRUE);
1742  team->t.t_argc = argc;
1743  argv = (void **)team->t.t_argv;
1744  if (ap) {
1745  for (i = argc - 1; i >= 0; --i)
1746  *argv++ = va_arg(kmp_va_deref(ap), void *);
1747  } else {
1748  for (i = 0; i < argc; ++i)
1749  // Get args from parent team for teams construct
1750  argv[i] = parent_team->t.t_argv[i];
1751  }
1752  // AC: revert change made in __kmpc_serialized_parallel()
1753  // because initial code in teams should have level=0
1754  team->t.t_level--;
1755  // AC: call special invoker for outer "parallel" of teams construct
1756  invoker(gtid);
1757 #if OMPT_SUPPORT
1758  if (ompt_enabled.enabled) {
1759  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1760  if (ompt_enabled.ompt_callback_implicit_task) {
1761  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1762  ompt_scope_end, NULL, &(task_info->task_data), 0,
1763  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1764  }
1765  if (ompt_enabled.ompt_callback_parallel_end) {
1766  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1767  ompt_parallel_data, *parent_task_data,
1768  OMPT_INVOKER(call_context) | ompt_parallel_league,
1769  *return_address);
1770  }
1771  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1772  }
1773 #endif
1774  } else {
1775  argv = args;
1776  for (i = argc - 1; i >= 0; --i)
1777  *argv++ = va_arg(kmp_va_deref(ap), void *);
1778  KMP_MB();
1779 
1780 #if OMPT_SUPPORT
1781  void *dummy;
1782  void **exit_frame_p;
1783  ompt_task_info_t *task_info;
1784  ompt_lw_taskteam_t lw_taskteam;
1785  ompt_data_t *implicit_task_data;
1786 
1787  if (ompt_enabled.enabled) {
1788  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1789  ompt_parallel_data, *return_address);
1790  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1791  // don't use lw_taskteam after linking. content was swapped
1792  task_info = OMPT_CUR_TASK_INFO(master_th);
1793  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1794 
1795  /* OMPT implicit task begin */
1796  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1797  if (ompt_enabled.ompt_callback_implicit_task) {
1798  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1799  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1800  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1801  ompt_task_implicit);
1802  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1803  }
1804 
1805  /* OMPT state */
1806  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1807  } else {
1808  exit_frame_p = &dummy;
1809  }
1810 #endif
1811 
1812  {
1813  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1814  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1815  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1816 #if OMPT_SUPPORT
1817  ,
1818  exit_frame_p
1819 #endif
1820  );
1821  }
1822 
1823 #if OMPT_SUPPORT
1824  if (ompt_enabled.enabled) {
1825  *exit_frame_p = NULL;
1826  if (ompt_enabled.ompt_callback_implicit_task) {
1827  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828  ompt_scope_end, NULL, &(task_info->task_data), 1,
1829  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1830  }
1831 
1832  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1833  __ompt_lw_taskteam_unlink(master_th);
1834  if (ompt_enabled.ompt_callback_parallel_end) {
1835  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1836  ompt_parallel_data, *parent_task_data,
1837  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1838  }
1839  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1840  }
1841 #endif
1842  }
1843  } else if (call_context == fork_context_gnu) {
1844 #if OMPT_SUPPORT
1845  if (ompt_enabled.enabled) {
1846  ompt_lw_taskteam_t lwt;
1847  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1848  *return_address);
1849 
1850  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1851  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1852  }
1853 // don't use lw_taskteam after linking. content was swapped
1854 #endif
1855 
1856  // we were called from GNU native code
1857  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1858  return FALSE;
1859  } else {
1860  KMP_ASSERT2(call_context < fork_context_last,
1861  "__kmp_serial_fork_call: unknown fork_context parameter");
1862  }
1863 
1864  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1865  KMP_MB();
1866  return FALSE;
1867 }
1868 
1869 /* most of the work for a fork */
1870 /* return true if we really went parallel, false if serialized */
1871 int __kmp_fork_call(ident_t *loc, int gtid,
1872  enum fork_context_e call_context, // Intel, GNU, ...
1873  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1874  kmp_va_list ap) {
1875  void **argv;
1876  int i;
1877  int master_tid;
1878  int master_this_cons;
1879  kmp_team_t *team;
1880  kmp_team_t *parent_team;
1881  kmp_info_t *master_th;
1882  kmp_root_t *root;
1883  int nthreads;
1884  int master_active;
1885  int master_set_numthreads;
1886  int task_thread_limit = 0;
1887  int level;
1888  int active_level;
1889  int teams_level;
1890 #if KMP_NESTED_HOT_TEAMS
1891  kmp_hot_team_ptr_t **p_hot_teams;
1892 #endif
1893  { // KMP_TIME_BLOCK
1894  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1895  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1896 
1897  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1898  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1899  /* Some systems prefer the stack for the root thread(s) to start with */
1900  /* some gap from the parent stack to prevent false sharing. */
1901  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1902  /* These 2 lines below are so this does not get optimized out */
1903  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1904  __kmp_stkpadding += (short)((kmp_int64)dummy);
1905  }
1906 
1907  /* initialize if needed */
1908  KMP_DEBUG_ASSERT(
1909  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1910  if (!TCR_4(__kmp_init_parallel))
1911  __kmp_parallel_initialize();
1912  __kmp_resume_if_soft_paused();
1913 
1914  /* setup current data */
1915  // AC: potentially unsafe, not in sync with library shutdown,
1916  // __kmp_threads can be freed
1917  master_th = __kmp_threads[gtid];
1918 
1919  parent_team = master_th->th.th_team;
1920  master_tid = master_th->th.th_info.ds.ds_tid;
1921  master_this_cons = master_th->th.th_local.this_construct;
1922  root = master_th->th.th_root;
1923  master_active = root->r.r_active;
1924  master_set_numthreads = master_th->th.th_set_nproc;
1925  task_thread_limit =
1926  master_th->th.th_current_task->td_icvs.task_thread_limit;
1927 
1928 #if OMPT_SUPPORT
1929  ompt_data_t ompt_parallel_data = ompt_data_none;
1930  ompt_data_t *parent_task_data;
1931  ompt_frame_t *ompt_frame;
1932  void *return_address = NULL;
1933 
1934  if (ompt_enabled.enabled) {
1935  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1936  NULL, NULL);
1937  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1938  }
1939 #endif
1940 
1941  // Assign affinity to root thread if it hasn't happened yet
1942  __kmp_assign_root_init_mask();
1943 
1944  // Nested level will be an index in the nested nthreads array
1945  level = parent_team->t.t_level;
1946  // used to launch non-serial teams even if nested is not allowed
1947  active_level = parent_team->t.t_active_level;
1948  // needed to check nesting inside the teams
1949  teams_level = master_th->th.th_teams_level;
1950 #if KMP_NESTED_HOT_TEAMS
1951  p_hot_teams = &master_th->th.th_hot_teams;
1952  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1953  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1954  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1955  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1956  // it is either the actual value or not needed (when active_level > 0)
1957  (*p_hot_teams)[0].hot_team_nth = 1;
1958  }
1959 #endif
1960 
1961 #if OMPT_SUPPORT
1962  if (ompt_enabled.enabled) {
1963  if (ompt_enabled.ompt_callback_parallel_begin) {
1964  int team_size = master_set_numthreads
1965  ? master_set_numthreads
1966  : get__nproc_2(parent_team, master_tid);
1967  int flags = OMPT_INVOKER(call_context) |
1968  ((microtask == (microtask_t)__kmp_teams_master)
1969  ? ompt_parallel_league
1970  : ompt_parallel_team);
1971  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1972  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1973  return_address);
1974  }
1975  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1976  }
1977 #endif
1978 
1979  master_th->th.th_ident = loc;
1980 
1981  // Parallel closely nested in teams construct:
1982  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1983  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1984  call_context, microtask, invoker,
1985  master_set_numthreads, level,
1986 #if OMPT_SUPPORT
1987  ompt_parallel_data, return_address,
1988 #endif
1989  ap);
1990  } // End parallel closely nested in teams construct
1991 
1992 #if KMP_DEBUG
1993  if (__kmp_tasking_mode != tskm_immediate_exec) {
1994  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1995  parent_team->t.t_task_team[master_th->th.th_task_state]);
1996  }
1997 #endif
1998 
1999  // Need this to happen before we determine the number of threads, not while
2000  // we are allocating the team
2001  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2002 
2003  // Determine the number of threads
2004  int enter_teams =
2005  __kmp_is_entering_teams(active_level, level, teams_level, ap);
2006  if ((!enter_teams &&
2007  (parent_team->t.t_active_level >=
2008  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2009  (__kmp_library == library_serial)) {
2010  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2011  nthreads = 1;
2012  } else {
2013  nthreads = master_set_numthreads
2014  ? master_set_numthreads
2015  // TODO: get nproc directly from current task
2016  : get__nproc_2(parent_team, master_tid);
2017  // Use the thread_limit set for the current target task if it exists;
2018  // otherwise go with the deduced nthreads
2019  nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2020  ? task_thread_limit
2021  : nthreads;
2022  // Check whether we need to take the forkjoin lock (no need for a
2023  // serialized parallel outside of a teams construct).
2024  if (nthreads > 1) {
2025  /* determine how many new threads we can use */
2026  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2027  /* AC: If we execute teams from parallel region (on host), then teams
2028  should be created but each can only have 1 thread if nesting is
2029  disabled. If teams called from serial region, then teams and their
2030  threads should be created regardless of the nesting setting. */
2031  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2032  nthreads, enter_teams);
2033  if (nthreads == 1) {
2034  // Free lock for single thread execution here; for multi-thread
2035  // execution it will be freed later after team of threads created
2036  // and initialized
2037  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2038  }
2039  }
2040  }
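  // Illustrative sketch, not part of this file: the task_thread_limit ICV
  // consulted above typically originates from a thread_limit clause on a
  // target construct (OpenMP 5.1) and can only lower the deduced nthreads:
  //
  //   #pragma omp target thread_limit(4)
  //   #pragma omp parallel          // capped at 4 threads here, even if
  //   { /* ... */ }                 //   nthreads-var is larger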
2041  KMP_DEBUG_ASSERT(nthreads > 0);
2042 
2043  // If we temporarily changed the set number of threads then restore it now
2044  master_th->th.th_set_nproc = 0;
2045 
2046  if (nthreads == 1) {
2047  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2048  invoker, master_th, parent_team,
2049 #if OMPT_SUPPORT
2050  &ompt_parallel_data, &return_address,
2051  &parent_task_data,
2052 #endif
2053  ap);
2054  } // if (nthreads == 1)
2055 
2056  // GEH: only modify the executing flag in the case when not serialized
2057  // serialized case is handled in kmpc_serialized_parallel
2058  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2059  "curtask=%p, curtask_max_aclevel=%d\n",
2060  parent_team->t.t_active_level, master_th,
2061  master_th->th.th_current_task,
2062  master_th->th.th_current_task->td_icvs.max_active_levels));
2063  // TODO: GEH - cannot do this assertion because root thread not set up as
2064  // executing
2065  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2066  master_th->th.th_current_task->td_flags.executing = 0;
2067 
2068  if (!master_th->th.th_teams_microtask || level > teams_level) {
2069  /* Increment our nested depth level */
2070  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2071  }
2072 
2073  // See if we need to make a copy of the ICVs.
2074  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2075  if ((level + 1 < __kmp_nested_nth.used) &&
2076  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2077  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2078  } else {
2079  nthreads_icv = 0; // don't update
2080  }
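  // Illustrative sketch, not part of this file: __kmp_nested_nth is normally
  // populated from a comma-separated OMP_NUM_THREADS list, one value per
  // nesting level. For example, with OMP_NUM_THREADS=4,2 the outer parallel
  // uses nthreads-var 4 and this lookup yields nthreads_icv = 2 for the
  // implicit tasks of the new team:
  //
  //   #pragma omp parallel          // level 0: 4 threads
  //   {
  //     #pragma omp parallel        // level 1: 2 threads (if nesting is active)
  //     { /* ... */ }
  //   }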
2081 
2082  // Figure out the proc_bind_policy for the new team.
2083  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2084  // proc_bind_default means don't update
2085  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2086  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2087  proc_bind = proc_bind_false;
2088  } else {
2089  // No proc_bind clause specified; use current proc-bind-var for this
2090  // parallel region
2091  if (proc_bind == proc_bind_default) {
2092  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2093  }
2094  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2095  if (master_th->th.th_teams_microtask &&
2096  microtask == (microtask_t)__kmp_teams_master) {
2097  proc_bind = __kmp_teams_proc_bind;
2098  }
2099  /* else: The proc_bind policy was specified explicitly on parallel clause.
2100  This overrides proc-bind-var for this parallel region, but does not
2101  change proc-bind-var. */
2102  // Figure the value of proc-bind-var for the child threads.
2103  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2104  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2105  master_th->th.th_current_task->td_icvs.proc_bind)) {
2106  // Do not modify the proc bind icv for the two teams construct forks
2107  // They just let the proc bind icv pass through
2108  if (!master_th->th.th_teams_microtask ||
2109  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2110  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2111  }
2112  }
2113 
2114  // Reset for next parallel region
2115  master_th->th.th_set_proc_bind = proc_bind_default;
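  // Illustrative sketch, not part of this file: __kmp_nested_proc_bind is
  // normally filled from an OMP_PROC_BIND list, one bind kind per nesting
  // level. For example, with OMP_PROC_BIND=spread,close the outer parallel
  // binds with "spread" while proc_bind_icv becomes "close", so an inner
  // parallel without its own proc_bind clause binds with "close".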
2116 
2117  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2118  kmp_internal_control_t new_icvs;
2119  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2120  new_icvs.next = NULL;
2121  if (nthreads_icv > 0) {
2122  new_icvs.nproc = nthreads_icv;
2123  }
2124  if (proc_bind_icv != proc_bind_default) {
2125  new_icvs.proc_bind = proc_bind_icv;
2126  }
2127 
2128  /* allocate a new parallel team */
2129  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2130  team = __kmp_allocate_team(root, nthreads, nthreads,
2131 #if OMPT_SUPPORT
2132  ompt_parallel_data,
2133 #endif
2134  proc_bind, &new_icvs,
2135  argc USE_NESTED_HOT_ARG(master_th));
2136  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2137  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2138  } else {
2139  /* allocate a new parallel team */
2140  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2141  team = __kmp_allocate_team(root, nthreads, nthreads,
2142 #if OMPT_SUPPORT
2143  ompt_parallel_data,
2144 #endif
2145  proc_bind,
2146  &master_th->th.th_current_task->td_icvs,
2147  argc USE_NESTED_HOT_ARG(master_th));
2148  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2149  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2150  &master_th->th.th_current_task->td_icvs);
2151  }
2152  KF_TRACE(
2153  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2154 
2155  /* setup the new team */
2156  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2157  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2158  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2159  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2160  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2161 #if OMPT_SUPPORT
2162  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2163  return_address);
2164 #endif
2165  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2166  // TODO: parent_team->t.t_level == INT_MAX ???
2167  if (!master_th->th.th_teams_microtask || level > teams_level) {
2168  int new_level = parent_team->t.t_level + 1;
2169  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2170  new_level = parent_team->t.t_active_level + 1;
2171  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2172  } else {
2173  // AC: Do not increase parallel level at start of the teams construct
2174  int new_level = parent_team->t.t_level;
2175  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2176  new_level = parent_team->t.t_active_level;
2177  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2178  }
2179  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2180  // set primary thread's schedule as new run-time schedule
2181  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2182 
2183  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2184  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2185 
2186  // Update the floating point rounding in the team if required.
2187  propagateFPControl(team);
2188 #if OMPD_SUPPORT
2189  if (ompd_state & OMPD_ENABLE_BP)
2190  ompd_bp_parallel_begin();
2191 #endif
2192 
2193  if (__kmp_tasking_mode != tskm_immediate_exec) {
2194  // Set primary thread's task team to team's task team. Unless this is hot
2195  // team, it should be NULL.
2196  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2197  parent_team->t.t_task_team[master_th->th.th_task_state]);
2198  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2199  "%p, new task_team %p / team %p\n",
2200  __kmp_gtid_from_thread(master_th),
2201  master_th->th.th_task_team, parent_team,
2202  team->t.t_task_team[master_th->th.th_task_state], team));
2203 
2204  if (active_level || master_th->th.th_task_team) {
2205  // Take a memo of primary thread's task_state
2206  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2207  if (master_th->th.th_task_state_top >=
2208  master_th->th.th_task_state_stack_sz) { // increase size
2209  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2210  kmp_uint8 *old_stack, *new_stack;
2211  kmp_uint32 i;
2212  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2213  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2214  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2215  }
2216  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2217  ++i) { // zero-init rest of stack
2218  new_stack[i] = 0;
2219  }
2220  old_stack = master_th->th.th_task_state_memo_stack;
2221  master_th->th.th_task_state_memo_stack = new_stack;
2222  master_th->th.th_task_state_stack_sz = new_size;
2223  __kmp_free(old_stack);
2224  }
2225  // Store primary thread's task_state on stack
2226  master_th->th
2227  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2228  master_th->th.th_task_state;
2229  master_th->th.th_task_state_top++;
2230 #if KMP_NESTED_HOT_TEAMS
2231  if (master_th->th.th_hot_teams &&
2232  active_level < __kmp_hot_teams_max_level &&
2233  team == master_th->th.th_hot_teams[active_level].hot_team) {
2234  // Restore primary thread's nested state if nested hot team
2235  master_th->th.th_task_state =
2236  master_th->th
2237  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2238  } else {
2239 #endif
2240  master_th->th.th_task_state = 0;
2241 #if KMP_NESTED_HOT_TEAMS
2242  }
2243 #endif
2244  }
2245 #if !KMP_NESTED_HOT_TEAMS
2246  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2247  (team == root->r.r_hot_team));
2248 #endif
2249  }
2250 
2251  KA_TRACE(
2252  20,
2253  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2254  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2255  team->t.t_nproc));
2256  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2257  (team->t.t_master_tid == 0 &&
2258  (team->t.t_parent == root->r.r_root_team ||
2259  team->t.t_parent->t.t_serialized)));
2260  KMP_MB();
2261 
2262  /* now, setup the arguments */
2263  argv = (void **)team->t.t_argv;
2264  if (ap) {
2265  for (i = argc - 1; i >= 0; --i) {
2266  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2267  KMP_CHECK_UPDATE(*argv, new_argv);
2268  argv++;
2269  }
2270  } else {
2271  for (i = 0; i < argc; ++i) {
2272  // Get args from parent team for teams construct
2273  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2274  }
2275  }
2276 
2277  /* now actually fork the threads */
2278  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2279  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2280  root->r.r_active = TRUE;
2281 
2282  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2283  __kmp_setup_icv_copy(team, nthreads,
2284  &master_th->th.th_current_task->td_icvs, loc);
2285 
2286 #if OMPT_SUPPORT
2287  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2288 #endif
2289 
2290  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2291 
2292 #if USE_ITT_BUILD
2293  if (team->t.t_active_level == 1 // only report frames at level 1
2294  && !master_th->th.th_teams_microtask) { // not in teams construct
2295 #if USE_ITT_NOTIFY
2296  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2297  (__kmp_forkjoin_frames_mode == 3 ||
2298  __kmp_forkjoin_frames_mode == 1)) {
2299  kmp_uint64 tmp_time = 0;
2300  if (__itt_get_timestamp_ptr)
2301  tmp_time = __itt_get_timestamp();
2302  // Internal fork - report frame begin
2303  master_th->th.th_frame_time = tmp_time;
2304  if (__kmp_forkjoin_frames_mode == 3)
2305  team->t.t_region_time = tmp_time;
2306  } else
2307 // only one notification scheme (either "submit" or "forking/joined", not both)
2308 #endif /* USE_ITT_NOTIFY */
2309  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2310  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2311  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2312  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2313  }
2314  }
2315 #endif /* USE_ITT_BUILD */
2316 
2317  /* now go on and do the work */
2318  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2319  KMP_MB();
2320  KF_TRACE(10,
2321  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2322  root, team, master_th, gtid));
2323 
2324 #if USE_ITT_BUILD
2325  if (__itt_stack_caller_create_ptr) {
2326  // create new stack stitching id before entering fork barrier
2327  if (!enter_teams) {
2328  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2329  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2330  } else if (parent_team->t.t_serialized) {
2331  // keep stack stitching id in the serialized parent_team;
2332  // current team will be used for parallel inside the teams;
2333  // if parent_team is active, then it already keeps stack stitching id
2334  // for the league of teams
2335  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2336  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2337  }
2338  }
2339 #endif /* USE_ITT_BUILD */
2340 
2341  // AC: skip __kmp_internal_fork at teams construct, let only primary
2342  // threads execute
2343  if (ap) {
2344  __kmp_internal_fork(loc, gtid, team);
2345  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2346  "master_th=%p, gtid=%d\n",
2347  root, team, master_th, gtid));
2348  }
2349 
2350  if (call_context == fork_context_gnu) {
2351  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2352  return TRUE;
2353  }
2354 
2355  /* Invoke microtask for PRIMARY thread */
2356  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2357  team->t.t_id, team->t.t_pkfn));
2358  } // END of timer KMP_fork_call block
2359 
2360 #if KMP_STATS_ENABLED
2361  // If beginning a teams construct, then change thread state
2362  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2363  if (!ap) {
2364  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2365  }
2366 #endif
2367 
2368  if (!team->t.t_invoke(gtid)) {
2369  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2370  }
2371 
2372 #if KMP_STATS_ENABLED
2373  // If was beginning of a teams construct, then reset thread state
2374  if (!ap) {
2375  KMP_SET_THREAD_STATE(previous_state);
2376  }
2377 #endif
2378 
2379  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2380  team->t.t_id, team->t.t_pkfn));
2381  KMP_MB(); /* Flush all pending memory write invalidates. */
2382 
2383  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2384 #if OMPT_SUPPORT
2385  if (ompt_enabled.enabled) {
2386  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2387  }
2388 #endif
2389 
2390  return TRUE;
2391 }
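// Illustrative sketch, not part of this file: compilers targeting this
// runtime typically lower a parallel construct to a call to
// __kmpc_fork_call(), which wraps the outlined body in a microtask and
// forwards it to __kmp_fork_call() above. Roughly (details vary by compiler):
//
//   // user code
//   #pragma omp parallel
//   { body(); }
//
//   // schematic lowering
//   void outlined(kmp_int32 *gtid, kmp_int32 *btid /*, shared vars... */) {
//     body();
//   }
//   ...
//   __kmpc_fork_call(&loc, /*argc=*/0, (kmpc_micro)outlined);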
2392 
2393 #if OMPT_SUPPORT
2394 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2395  kmp_team_t *team) {
2396  // restore state outside the region
2397  thread->th.ompt_thread_info.state =
2398  ((team->t.t_serialized) ? ompt_state_work_serial
2399  : ompt_state_work_parallel);
2400 }
2401 
2402 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2403  kmp_team_t *team, ompt_data_t *parallel_data,
2404  int flags, void *codeptr) {
2405  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2406  if (ompt_enabled.ompt_callback_parallel_end) {
2407  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2408  parallel_data, &(task_info->task_data), flags, codeptr);
2409  }
2410 
2411  task_info->frame.enter_frame = ompt_data_none;
2412  __kmp_join_restore_state(thread, team);
2413 }
2414 #endif
2415 
2416 void __kmp_join_call(ident_t *loc, int gtid
2417 #if OMPT_SUPPORT
2418  ,
2419  enum fork_context_e fork_context
2420 #endif
2421  ,
2422  int exit_teams) {
2423  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2424  kmp_team_t *team;
2425  kmp_team_t *parent_team;
2426  kmp_info_t *master_th;
2427  kmp_root_t *root;
2428  int master_active;
2429 
2430  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2431 
2432  /* setup current data */
2433  master_th = __kmp_threads[gtid];
2434  root = master_th->th.th_root;
2435  team = master_th->th.th_team;
2436  parent_team = team->t.t_parent;
2437 
2438  master_th->th.th_ident = loc;
2439 
2440 #if OMPT_SUPPORT
2441  void *team_microtask = (void *)team->t.t_pkfn;
2442  // For GOMP interface with serialized parallel, need the
2443  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2444  // and end-parallel events.
2445  if (ompt_enabled.enabled &&
2446  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2447  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2448  }
2449 #endif
2450 
2451 #if KMP_DEBUG
2452  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2453  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2454  "th_task_team = %p\n",
2455  __kmp_gtid_from_thread(master_th), team,
2456  team->t.t_task_team[master_th->th.th_task_state],
2457  master_th->th.th_task_team));
2458  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2459  team->t.t_task_team[master_th->th.th_task_state]);
2460  }
2461 #endif
2462 
2463  if (team->t.t_serialized) {
2464  if (master_th->th.th_teams_microtask) {
2465  // We are in teams construct
2466  int level = team->t.t_level;
2467  int tlevel = master_th->th.th_teams_level;
2468  if (level == tlevel) {
2469  // AC: we haven't incremented it earlier at start of teams construct,
2470  // so do it here - at the end of teams construct
2471  team->t.t_level++;
2472  } else if (level == tlevel + 1) {
2473  // AC: we are exiting parallel inside teams, need to increment
2474  // serialization in order to restore it in the next call to
2475  // __kmpc_end_serialized_parallel
2476  team->t.t_serialized++;
2477  }
2478  }
2479  __kmpc_end_serialized_parallel(loc, gtid);
2480 
2481 #if OMPT_SUPPORT
2482  if (ompt_enabled.enabled) {
2483  if (fork_context == fork_context_gnu) {
2484  __ompt_lw_taskteam_unlink(master_th);
2485  }
2486  __kmp_join_restore_state(master_th, parent_team);
2487  }
2488 #endif
2489 
2490  return;
2491  }
2492 
2493  master_active = team->t.t_master_active;
2494 
2495  if (!exit_teams) {
2496  // AC: No barrier for internal teams at exit from teams construct.
2497  // But there is barrier for external team (league).
2498  __kmp_internal_join(loc, gtid, team);
2499 #if USE_ITT_BUILD
2500  if (__itt_stack_caller_create_ptr) {
2501  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2502  // destroy the stack stitching id after join barrier
2503  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2504  team->t.t_stack_id = NULL;
2505  }
2506 #endif
2507  } else {
2508  master_th->th.th_task_state =
2509  0; // AC: no tasking in teams (out of any parallel)
2510 #if USE_ITT_BUILD
2511  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2512  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2513  // destroy the stack stitching id on exit from the teams construct
2514  // if parent_team is active, then the id will be destroyed later on
2515  // by master of the league of teams
2516  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2517  parent_team->t.t_stack_id = NULL;
2518  }
2519 #endif
2520  }
2521 
2522  KMP_MB();
2523 
2524 #if OMPT_SUPPORT
2525  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2526  void *codeptr = team->t.ompt_team_info.master_return_address;
2527 #endif
2528 
2529 #if USE_ITT_BUILD
2530  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2531  if (team->t.t_active_level == 1 &&
2532  (!master_th->th.th_teams_microtask || /* not in teams construct */
2533  master_th->th.th_teams_size.nteams == 1)) {
2534  master_th->th.th_ident = loc;
2535  // only one notification scheme (either "submit" or "forking/joined", not
2536  // both)
2537  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2538  __kmp_forkjoin_frames_mode == 3)
2539  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2540  master_th->th.th_frame_time, 0, loc,
2541  master_th->th.th_team_nproc, 1);
2542  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2543  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2544  __kmp_itt_region_joined(gtid);
2545  } // active_level == 1
2546 #endif /* USE_ITT_BUILD */
2547 
2548 #if KMP_AFFINITY_SUPPORTED
2549  if (!exit_teams) {
2550  // Restore master thread's partition.
2551  master_th->th.th_first_place = team->t.t_first_place;
2552  master_th->th.th_last_place = team->t.t_last_place;
2553  }
2554 #endif // KMP_AFFINITY_SUPPORTED
2555 
2556  if (master_th->th.th_teams_microtask && !exit_teams &&
2557  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2558  team->t.t_level == master_th->th.th_teams_level + 1) {
2559 // AC: We need to leave the team structure intact at the end of parallel
2560 // inside the teams construct, so that at the next parallel same (hot) team
2561 // works, only adjust nesting levels
2562 #if OMPT_SUPPORT
2563  ompt_data_t ompt_parallel_data = ompt_data_none;
2564  if (ompt_enabled.enabled) {
2565  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2566  if (ompt_enabled.ompt_callback_implicit_task) {
2567  int ompt_team_size = team->t.t_nproc;
2568  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2569  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2570  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2571  }
2572  task_info->frame.exit_frame = ompt_data_none;
2573  task_info->task_data = ompt_data_none;
2574  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2575  __ompt_lw_taskteam_unlink(master_th);
2576  }
2577 #endif
2578  /* Decrement our nested depth level */
2579  team->t.t_level--;
2580  team->t.t_active_level--;
2581  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2582 
2583  // Restore number of threads in the team if needed. This code relies on
2584  // the proper adjustment of th_teams_size.nth after the fork in
2585  // __kmp_teams_master on each teams primary thread in the case that
2586  // __kmp_reserve_threads reduced it.
2587  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2588  int old_num = master_th->th.th_team_nproc;
2589  int new_num = master_th->th.th_teams_size.nth;
2590  kmp_info_t **other_threads = team->t.t_threads;
2591  team->t.t_nproc = new_num;
2592  for (int i = 0; i < old_num; ++i) {
2593  other_threads[i]->th.th_team_nproc = new_num;
2594  }
2595  // Adjust states of non-used threads of the team
2596  for (int i = old_num; i < new_num; ++i) {
2597  // Re-initialize thread's barrier data.
2598  KMP_DEBUG_ASSERT(other_threads[i]);
2599  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2600  for (int b = 0; b < bs_last_barrier; ++b) {
2601  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2602  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2603 #if USE_DEBUGGER
2604  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2605 #endif
2606  }
2607  if (__kmp_tasking_mode != tskm_immediate_exec) {
2608  // Synchronize thread's task state
2609  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2610  }
2611  }
2612  }
2613 
2614 #if OMPT_SUPPORT
2615  if (ompt_enabled.enabled) {
2616  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2617  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2618  }
2619 #endif
2620 
2621  return;
2622  }
2623 
2624  /* do cleanup and restore the parent team */
2625  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2626  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2627 
2628  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2629 
2630  /* jc: The following lock has instructions with REL and ACQ semantics,
2631  separating the parallel user code called in this parallel region
2632  from the serial user code called after this function returns. */
2633  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2634 
2635  if (!master_th->th.th_teams_microtask ||
2636  team->t.t_level > master_th->th.th_teams_level) {
2637  /* Decrement our nested depth level */
2638  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2639  }
2640  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2641 
2642 #if OMPT_SUPPORT
2643  if (ompt_enabled.enabled) {
2644  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2645  if (ompt_enabled.ompt_callback_implicit_task) {
2646  int flags = (team_microtask == (void *)__kmp_teams_master)
2647  ? ompt_task_initial
2648  : ompt_task_implicit;
2649  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2650  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2651  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2652  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2653  }
2654  task_info->frame.exit_frame = ompt_data_none;
2655  task_info->task_data = ompt_data_none;
2656  }
2657 #endif
2658 
2659  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2660  master_th, team));
2661  __kmp_pop_current_task_from_thread(master_th);
2662 
2663  master_th->th.th_def_allocator = team->t.t_def_allocator;
2664 
2665 #if OMPD_SUPPORT
2666  if (ompd_state & OMPD_ENABLE_BP)
2667  ompd_bp_parallel_end();
2668 #endif
2669  updateHWFPControl(team);
2670 
2671  if (root->r.r_active != master_active)
2672  root->r.r_active = master_active;
2673 
2674  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2675  master_th)); // this will free worker threads
2676 
2677  /* this race was fun to find. make sure the following is in the critical
2678  region otherwise assertions may fail occasionally since the old team may be
2679  reallocated and the hierarchy appears inconsistent. it is actually safe to
2680  run and won't cause any bugs, but will cause those assertion failures. it's
2681  only one deref&assign so might as well put this in the critical region */
2682  master_th->th.th_team = parent_team;
2683  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2684  master_th->th.th_team_master = parent_team->t.t_threads[0];
2685  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2686 
2687  /* restore serialized team, if need be */
2688  if (parent_team->t.t_serialized &&
2689  parent_team != master_th->th.th_serial_team &&
2690  parent_team != root->r.r_root_team) {
2691  __kmp_free_team(root,
2692  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2693  master_th->th.th_serial_team = parent_team;
2694  }
2695 
2696  if (__kmp_tasking_mode != tskm_immediate_exec) {
2697  if (master_th->th.th_task_state_top >
2698  0) { // Restore task state from memo stack
2699  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2700  // Remember primary thread's state if we re-use this nested hot team
2701  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2702  master_th->th.th_task_state;
2703  --master_th->th.th_task_state_top; // pop
2704  // Now restore state at this level
2705  master_th->th.th_task_state =
2706  master_th->th
2707  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2708  } else if (team != root->r.r_hot_team) {
2709  // Reset the task state of primary thread if we are not hot team because
2710  // in this case all the worker threads will be free, and their task state
2711  // will be reset. If not reset the primary's, the task state will be
2712  // inconsistent.
2713  master_th->th.th_task_state = 0;
2714  }
2715  // Copy the task team from the parent team to the primary thread
2716  master_th->th.th_task_team =
2717  parent_team->t.t_task_team[master_th->th.th_task_state];
2718  KA_TRACE(20,
2719  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2720  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2721  parent_team));
2722  }
2723 
2724  // TODO: GEH - cannot do this assertion because root thread not set up as
2725  // executing
2726  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2727  master_th->th.th_current_task->td_flags.executing = 1;
2728 
2729  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2730 
2731 #if KMP_AFFINITY_SUPPORTED
2732  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2733  __kmp_reset_root_init_mask(gtid);
2734  }
2735 #endif
2736 #if OMPT_SUPPORT
2737  int flags =
2738  OMPT_INVOKER(fork_context) |
2739  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2740  : ompt_parallel_team);
2741  if (ompt_enabled.enabled) {
2742  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2743  codeptr);
2744  }
2745 #endif
2746 
2747  KMP_MB();
2748  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2749 }
2750 
2751 /* Check whether we should push an internal control record onto the
2752  serial team stack. If so, do it. */
2753 void __kmp_save_internal_controls(kmp_info_t *thread) {
2754 
2755  if (thread->th.th_team != thread->th.th_serial_team) {
2756  return;
2757  }
2758  if (thread->th.th_team->t.t_serialized > 1) {
2759  int push = 0;
2760 
2761  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2762  push = 1;
2763  } else {
2764  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2765  thread->th.th_team->t.t_serialized) {
2766  push = 1;
2767  }
2768  }
2769  if (push) { /* push a record on the serial team's stack */
2770  kmp_internal_control_t *control =
2771  (kmp_internal_control_t *)__kmp_allocate(
2772  sizeof(kmp_internal_control_t));
2773 
2774  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2775 
2776  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2777 
2778  control->next = thread->th.th_team->t.t_control_stack_top;
2779  thread->th.th_team->t.t_control_stack_top = control;
2780  }
2781  }
2782 }
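// Illustrative sketch, not part of this file: the record is pushed only when
// an ICV setter runs on the serial team with t_serialized > 1, i.e. inside a
// nested serialized parallel region, so __kmpc_end_serialized_parallel() can
// restore the enclosing level's values. Assuming the inner region below is
// serialized, user code reaching this path looks roughly like:
//
//   #pragma omp parallel num_threads(1)
//   {
//     #pragma omp parallel num_threads(1)   // nested -> serialized
//     {
//       omp_set_num_threads(4);   // setter calls __kmp_save_internal_controls()
//     }
//   }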
2783 
2784 /* Changes set_nproc */
2785 void __kmp_set_num_threads(int new_nth, int gtid) {
2786  kmp_info_t *thread;
2787  kmp_root_t *root;
2788 
2789  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2790  KMP_DEBUG_ASSERT(__kmp_init_serial);
2791 
2792  if (new_nth < 1)
2793  new_nth = 1;
2794  else if (new_nth > __kmp_max_nth)
2795  new_nth = __kmp_max_nth;
2796 
2797  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2798  thread = __kmp_threads[gtid];
2799  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2800  return; // nothing to do
2801 
2802  __kmp_save_internal_controls(thread);
2803 
2804  set__nproc(thread, new_nth);
2805 
2806  // If this omp_set_num_threads() call will cause the hot team size to be
2807  // reduced (in the absence of a num_threads clause), then reduce it now,
2808  // rather than waiting for the next parallel region.
2809  root = thread->th.th_root;
2810  if (__kmp_init_parallel && (!root->r.r_active) &&
2811  (root->r.r_hot_team->t.t_nproc > new_nth)
2812 #if KMP_NESTED_HOT_TEAMS
2813  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2814 #endif
2815  ) {
2816  kmp_team_t *hot_team = root->r.r_hot_team;
2817  int f;
2818 
2819  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2820 
2821  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2822  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2823  }
2824  // Release the extra threads we don't need any more.
2825  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2826  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2827  if (__kmp_tasking_mode != tskm_immediate_exec) {
2828  // When decreasing team size, threads no longer in the team should unref
2829  // task team.
2830  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2831  }
2832  __kmp_free_thread(hot_team->t.t_threads[f]);
2833  hot_team->t.t_threads[f] = NULL;
2834  }
2835  hot_team->t.t_nproc = new_nth;
2836 #if KMP_NESTED_HOT_TEAMS
2837  if (thread->th.th_hot_teams) {
2838  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2839  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2840  }
2841 #endif
2842 
2843  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2844  hot_team->t.b->update_num_threads(new_nth);
2845  __kmp_add_threads_to_team(hot_team, new_nth);
2846  }
2847 
2848  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2849 
2850  // Update the t_nproc field in the threads that are still active.
2851  for (f = 0; f < new_nth; f++) {
2852  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2853  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2854  }
2855  // Special flag to mark that omp_set_num_threads() changed the team size
2856  hot_team->t.t_size_changed = -1;
2857  }
2858 }
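// Illustrative sketch, not part of this file: this routine backs
// omp_set_num_threads(). If the root thread is idle and the new value is
// smaller than the current hot team, the surplus workers are released
// immediately rather than at the next fork:
//
//   #pragma omp parallel        // suppose this ran with 8 threads
//   { /* ... */ }
//
//   omp_set_num_threads(2);     // hot team trimmed to 2 right here
//
//   #pragma omp parallel        // reuses the smaller hot team
//   { /* ... */ }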
2859 
2860 /* Changes max_active_levels */
2861 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2862  kmp_info_t *thread;
2863 
2864  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2865  "%d = (%d)\n",
2866  gtid, max_active_levels));
2867  KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869  // validate max_active_levels
2870  if (max_active_levels < 0) {
2871  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2872  // We ignore this call if the user has specified a negative value.
2873  // The current setting won't be changed. The last valid setting will be
2874  // used. A warning will be issued (if warnings are allowed as controlled by
2875  // the KMP_WARNINGS env var).
2876  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2877  "max_active_levels for thread %d = (%d)\n",
2878  gtid, max_active_levels));
2879  return;
2880  }
2881  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2882  // it's OK, the max_active_levels is within the valid range: [ 0;
2883  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2884  // We allow a zero value. (implementation defined behavior)
2885  } else {
2886  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2887  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2888  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2889  // Current upper limit is MAX_INT. (implementation defined behavior)
2890  // If the input exceeds the upper limit, we correct the input to be the
2891  // upper limit. (implementation defined behavior)
2892  // Actually, the flow should never get here as long as we use the MAX_INT limit.
2893  }
2894  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2895  "max_active_levels for thread %d = (%d)\n",
2896  gtid, max_active_levels));
2897 
2898  thread = __kmp_threads[gtid];
2899 
2900  __kmp_save_internal_controls(thread);
2901 
2902  set__max_active_levels(thread, max_active_levels);
2903 }
2904 
2905 /* Gets max_active_levels */
2906 int __kmp_get_max_active_levels(int gtid) {
2907  kmp_info_t *thread;
2908 
2909  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2910  KMP_DEBUG_ASSERT(__kmp_init_serial);
2911 
2912  thread = __kmp_threads[gtid];
2913  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2914  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2915  "curtask_maxaclevel=%d\n",
2916  gtid, thread->th.th_current_task,
2917  thread->th.th_current_task->td_icvs.max_active_levels));
2918  return thread->th.th_current_task->td_icvs.max_active_levels;
2919 }
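// Illustrative sketch, not part of this file: these two routines back
// omp_set_max_active_levels() and omp_get_max_active_levels(). Negative
// requests are ignored with a warning; oversized requests are clamped:
//
//   omp_set_max_active_levels(2);
//   int n = omp_get_max_active_levels();   // n == 2
//   omp_set_max_active_levels(-1);         // ignored (warning); still 2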
2920 
2921 // nteams-var per-device ICV
2922 void __kmp_set_num_teams(int num_teams) {
2923  if (num_teams > 0)
2924  __kmp_nteams = num_teams;
2925 }
2926 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2927 // teams-thread-limit-var per-device ICV
2928 void __kmp_set_teams_thread_limit(int limit) {
2929  if (limit > 0)
2930  __kmp_teams_thread_limit = limit;
2931 }
2932 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
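// Illustrative sketch, not part of this file: these per-device ICVs back the
// OpenMP 5.1 routines omp_set_num_teams(), omp_get_max_teams(),
// omp_set_teams_thread_limit() and omp_get_teams_thread_limit(); non-positive
// requests are ignored:
//
//   omp_set_num_teams(4);
//   omp_set_teams_thread_limit(8);
//   #pragma omp teams              // at most 4 teams of at most 8 threads
//   { /* ... */ }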
2933 
2934 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2935 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2936 
2937 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2938 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2939  kmp_info_t *thread;
2940  kmp_sched_t orig_kind;
2941  // kmp_team_t *team;
2942 
2943  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2944  gtid, (int)kind, chunk));
2945  KMP_DEBUG_ASSERT(__kmp_init_serial);
2946 
2947  // Check if the kind parameter is valid, correct if needed.
2948  // Valid parameters should fit in one of two intervals - standard or extended:
2949  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2950  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2951  orig_kind = kind;
2952  kind = __kmp_sched_without_mods(kind);
2953 
2954  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2955  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2956  // TODO: Hint needs attention in case we change the default schedule.
2957  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2958  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2959  __kmp_msg_null);
2960  kind = kmp_sched_default;
2961  chunk = 0; // ignore chunk value in case of bad kind
2962  }
2963 
2964  thread = __kmp_threads[gtid];
2965 
2966  __kmp_save_internal_controls(thread);
2967 
2968  if (kind < kmp_sched_upper_std) {
2969  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2970  // differentiate static chunked vs. unchunked: chunk should be invalid to
2971  // indicate an unchunked schedule (which is the default)
2972  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2973  } else {
2974  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2975  __kmp_sch_map[kind - kmp_sched_lower - 1];
2976  }
2977  } else {
2978  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2979  // kmp_sched_lower - 2 ];
2980  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2982  kmp_sched_lower - 2];
2983  }
2984  __kmp_sched_apply_mods_intkind(
2985  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2986  if (kind == kmp_sched_auto || chunk < 1) {
2987  // ignore parameter chunk for schedule auto
2988  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2989  } else {
2990  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2991  }
2992 }
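
// Illustrative sketch (not part of the original source): the kind-validation
// rule used above, written as a standalone predicate. With modifiers stripped,
// a kind is accepted only if it lies strictly inside the standard interval
// (kmp_sched_lower, kmp_sched_upper_std) or the extended interval
// (kmp_sched_lower_ext, kmp_sched_upper); anything else is replaced by
// kmp_sched_default with chunk 0.
#if 0
static bool __kmp_sched_kind_is_valid_sketch(kmp_sched_t kind) {
  kind = __kmp_sched_without_mods(kind);
  bool in_std = (kind > kmp_sched_lower && kind < kmp_sched_upper_std);
  bool in_ext = (kind > kmp_sched_lower_ext && kind < kmp_sched_upper);
  return in_std || in_ext;
}
#endif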
2993 
2994 /* Gets def_sched_var ICV values */
2995 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2996  kmp_info_t *thread;
2997  enum sched_type th_type;
2998 
2999  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3000  KMP_DEBUG_ASSERT(__kmp_init_serial);
3001 
3002  thread = __kmp_threads[gtid];
3003 
3004  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3005  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3006  case kmp_sch_static:
3007  case kmp_sch_static_greedy:
3008  case kmp_sch_static_balanced:
3009  *kind = kmp_sched_static;
3010  __kmp_sched_apply_mods_stdkind(kind, th_type);
3011  *chunk = 0; // chunk was not set, try to show this fact via zero value
3012  return;
3013  case kmp_sch_static_chunked:
3014  *kind = kmp_sched_static;
3015  break;
3016  case kmp_sch_dynamic_chunked:
3017  *kind = kmp_sched_dynamic;
3018  break;
3019  case kmp_sch_guided_chunked:
3020  case kmp_sch_guided_iterative_chunked:
3021  case kmp_sch_guided_analytical_chunked:
3022  *kind = kmp_sched_guided;
3023  break;
3024  case kmp_sch_auto:
3025  *kind = kmp_sched_auto;
3026  break;
3027  case kmp_sch_trapezoidal:
3028  *kind = kmp_sched_trapezoidal;
3029  break;
3030 #if KMP_STATIC_STEAL_ENABLED
3031  case kmp_sch_static_steal:
3032  *kind = kmp_sched_static_steal;
3033  break;
3034 #endif
3035  default:
3036  KMP_FATAL(UnknownSchedulingType, th_type);
3037  }
3038 
3039  __kmp_sched_apply_mods_stdkind(kind, th_type);
3040  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3041 }
3042 
3043 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3044 
3045  int ii, dd;
3046  kmp_team_t *team;
3047  kmp_info_t *thr;
3048 
3049  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3050  KMP_DEBUG_ASSERT(__kmp_init_serial);
3051 
3052  // validate level
3053  if (level == 0)
3054  return 0;
3055  if (level < 0)
3056  return -1;
3057  thr = __kmp_threads[gtid];
3058  team = thr->th.th_team;
3059  ii = team->t.t_level;
3060  if (level > ii)
3061  return -1;
3062 
3063  if (thr->th.th_teams_microtask) {
3064  // AC: we are in a teams region where multiple nested teams share the same level
3065  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3066  if (level <=
3067  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3068  KMP_DEBUG_ASSERT(ii >= tlevel);
3069  // AC: As we need to pass by the teams league, we need to artificially
3070  // increase ii
3071  if (ii == tlevel) {
3072  ii += 2; // three teams have same level
3073  } else {
3074  ii++; // two teams have same level
3075  }
3076  }
3077  }
3078 
3079  if (ii == level)
3080  return __kmp_tid_from_gtid(gtid);
3081 
3082  dd = team->t.t_serialized;
3083  level++;
3084  while (ii > level) {
3085  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3086  }
3087  if ((team->t.t_serialized) && (!dd)) {
3088  team = team->t.t_parent;
3089  continue;
3090  }
3091  if (ii > level) {
3092  team = team->t.t_parent;
3093  dd = team->t.t_serialized;
3094  ii--;
3095  }
3096  }
3097 
3098  return (dd > 1) ? (0) : (team->t.t_master_tid);
3099 }
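
// Illustrative sketch (not part of the original source): the user-level view
// of the query above. Inside two nested parallel regions,
// omp_get_ancestor_thread_num(level) returns the thread number this thread's
// ancestor had at that nesting level; level 0 is always 0, and out-of-range
// levels yield -1, mirroring the early returns above.
#if 0
#include <omp.h>
#include <stdio.h>
static void ancestor_demo_sketch(void) {
#pragma omp parallel num_threads(2)
#pragma omp parallel num_threads(2)
  {
    printf("self=%d parent=%d initial=%d\n",
           omp_get_ancestor_thread_num(2),  // this thread's own thread num
           omp_get_ancestor_thread_num(1),  // its parent's thread num
           omp_get_ancestor_thread_num(0)); // always 0 (the initial thread)
  }
}
#endif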
3100 
3101 int __kmp_get_team_size(int gtid, int level) {
3102 
3103  int ii, dd;
3104  kmp_team_t *team;
3105  kmp_info_t *thr;
3106 
3107  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3108  KMP_DEBUG_ASSERT(__kmp_init_serial);
3109 
3110  // validate level
3111  if (level == 0)
3112  return 1;
3113  if (level < 0)
3114  return -1;
3115  thr = __kmp_threads[gtid];
3116  team = thr->th.th_team;
3117  ii = team->t.t_level;
3118  if (level > ii)
3119  return -1;
3120 
3121  if (thr->th.th_teams_microtask) {
3122  // AC: we are in a teams region where multiple nested teams share the same level
3123  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3124  if (level <=
3125  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3126  KMP_DEBUG_ASSERT(ii >= tlevel);
3127  // AC: As we need to pass by the teams league, we need to artificially
3128  // increase ii
3129  if (ii == tlevel) {
3130  ii += 2; // three teams have same level
3131  } else {
3132  ii++; // two teams have same level
3133  }
3134  }
3135  }
3136 
3137  while (ii > level) {
3138  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3139  }
3140  if (team->t.t_serialized && (!dd)) {
3141  team = team->t.t_parent;
3142  continue;
3143  }
3144  if (ii > level) {
3145  team = team->t.t_parent;
3146  ii--;
3147  }
3148  }
3149 
3150  return team->t.t_nproc;
3151 }
3152 
3153 kmp_r_sched_t __kmp_get_schedule_global() {
3154  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3155  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3156  // independently, so one can get the updated schedule here.
3157 
3158  kmp_r_sched_t r_sched;
3159 
3160  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3161  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3162  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3163  // different roots (even in OMP 2.5)
3164  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3165  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3166  if (s == kmp_sch_static) {
3167  // replace STATIC with more detailed schedule (balanced or greedy)
3168  r_sched.r_sched_type = __kmp_static;
3169  } else if (s == kmp_sch_guided_chunked) {
3170  // replace GUIDED with more detailed schedule (iterative or analytical)
3171  r_sched.r_sched_type = __kmp_guided;
3172  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3173  r_sched.r_sched_type = __kmp_sched;
3174  }
3175  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3176 
3177  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3178  // __kmp_chunk may be wrong here (if it was not ever set)
3179  r_sched.chunk = KMP_DEFAULT_CHUNK;
3180  } else {
3181  r_sched.chunk = __kmp_chunk;
3182  }
3183 
3184  return r_sched;
3185 }
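
// Illustrative sketch (not part of the original source): the refinement rule
// applied above, in isolation. Plain "static" and "guided" are replaced by the
// detailed variants selected at initialization (__kmp_static, __kmp_guided);
// every other kind passes through unchanged, and an unset chunk is normalized
// to KMP_DEFAULT_CHUNK.
#if 0
static enum sched_type __kmp_refine_sched_kind_sketch(enum sched_type s) {
  if (s == kmp_sch_static)
    return __kmp_static; // balanced or greedy
  if (s == kmp_sch_guided_chunked)
    return __kmp_guided; // iterative or analytical
  return s; // STATIC_CHUNKED, DYNAMIC_CHUNKED, etc.
}
#endif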
3186 
3187 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3188  at least argc *t_argv entries for the requested team. */
3189 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3190 
3191  KMP_DEBUG_ASSERT(team);
3192  if (!realloc || argc > team->t.t_max_argc) {
3193 
3194  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3195  "current entries=%d\n",
3196  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3197  /* if previously allocated heap space for args, free them */
3198  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3199  __kmp_free((void *)team->t.t_argv);
3200 
3201  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3202  /* use unused space in the cache line for arguments */
3203  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3204  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3205  "argv entries\n",
3206  team->t.t_id, team->t.t_max_argc));
3207  team->t.t_argv = &team->t.t_inline_argv[0];
3208  if (__kmp_storage_map) {
3209  __kmp_print_storage_map_gtid(
3210  -1, &team->t.t_inline_argv[0],
3211  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3212  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3213  team->t.t_id);
3214  }
3215  } else {
3216  /* allocate space for arguments in the heap */
3217  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3218  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3219  : 2 * argc;
3220  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3221  "argv entries\n",
3222  team->t.t_id, team->t.t_max_argc));
3223  team->t.t_argv =
3224  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3225  if (__kmp_storage_map) {
3226  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3227  &team->t.t_argv[team->t.t_max_argc],
3228  sizeof(void *) * team->t.t_max_argc,
3229  "team_%d.t_argv", team->t.t_id);
3230  }
3231  }
3232  }
3233 }
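
// Illustrative sketch (not part of the original source): the capacity policy
// used above. Small argument lists reuse the inline space in the team
// structure; larger ones get a heap block of at least
// KMP_MIN_MALLOC_ARGV_ENTRIES entries, or 2*argc once argc exceeds half that
// minimum, so repeated growth does not reallocate on every call.
#if 0
static int __kmp_argv_capacity_sketch(int argc) {
  if (argc <= KMP_INLINE_ARGV_ENTRIES)
    return KMP_INLINE_ARGV_ENTRIES; // inline storage, no heap allocation
  return (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
             ? KMP_MIN_MALLOC_ARGV_ENTRIES
             : 2 * argc;
}
#endif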
3234 
3235 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3236  int i;
3237  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3238  team->t.t_threads =
3239  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3240  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3241  sizeof(dispatch_shared_info_t) * num_disp_buff);
3242  team->t.t_dispatch =
3243  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3244  team->t.t_implicit_task_taskdata =
3245  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3246  team->t.t_max_nproc = max_nth;
3247 
3248  /* setup dispatch buffers */
3249  for (i = 0; i < num_disp_buff; ++i) {
3250  team->t.t_disp_buffer[i].buffer_index = i;
3251  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3252  }
3253 }
3254 
3255 static void __kmp_free_team_arrays(kmp_team_t *team) {
3256  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3257  int i;
3258  for (i = 0; i < team->t.t_max_nproc; ++i) {
3259  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3260  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3261  team->t.t_dispatch[i].th_disp_buffer = NULL;
3262  }
3263  }
3264 #if KMP_USE_HIER_SCHED
3265  __kmp_dispatch_free_hierarchies(team);
3266 #endif
3267  __kmp_free(team->t.t_threads);
3268  __kmp_free(team->t.t_disp_buffer);
3269  __kmp_free(team->t.t_dispatch);
3270  __kmp_free(team->t.t_implicit_task_taskdata);
3271  team->t.t_threads = NULL;
3272  team->t.t_disp_buffer = NULL;
3273  team->t.t_dispatch = NULL;
3274  team->t.t_implicit_task_taskdata = 0;
3275 }
3276 
3277 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3278  kmp_info_t **oldThreads = team->t.t_threads;
3279 
3280  __kmp_free(team->t.t_disp_buffer);
3281  __kmp_free(team->t.t_dispatch);
3282  __kmp_free(team->t.t_implicit_task_taskdata);
3283  __kmp_allocate_team_arrays(team, max_nth);
3284 
3285  KMP_MEMCPY(team->t.t_threads, oldThreads,
3286  team->t.t_nproc * sizeof(kmp_info_t *));
3287 
3288  __kmp_free(oldThreads);
3289 }
3290 
3291 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3292 
3293  kmp_r_sched_t r_sched =
3294  __kmp_get_schedule_global(); // get current state of scheduling globals
3295 
3296  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3297 
3298  kmp_internal_control_t g_icvs = {
3299  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3300  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3301  // adjustment of threads (per thread)
3302  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3303  // whether blocktime is explicitly set
3304  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3305 #if KMP_USE_MONITOR
3306  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3307 // intervals
3308 #endif
3309  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3310  // next parallel region (per thread)
3311  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3312  __kmp_cg_max_nth, // int thread_limit;
3313  __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3314  // on task. This is used in the case of target thread_limit
3315  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3316  // for max_active_levels
3317  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3318  // {sched,chunk} pair
3319  __kmp_nested_proc_bind.bind_types[0],
3320  __kmp_default_device,
3321  NULL // struct kmp_internal_control *next;
3322  };
3323 
3324  return g_icvs;
3325 }
3326 
3327 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3328 
3329  kmp_internal_control_t gx_icvs;
3330  gx_icvs.serial_nesting_level =
3331  0; // probably =team->t.t_serial like in save_inter_controls
3332  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3333  gx_icvs.next = NULL;
3334 
3335  return gx_icvs;
3336 }
3337 
3338 static void __kmp_initialize_root(kmp_root_t *root) {
3339  int f;
3340  kmp_team_t *root_team;
3341  kmp_team_t *hot_team;
3342  int hot_team_max_nth;
3343  kmp_r_sched_t r_sched =
3344  __kmp_get_schedule_global(); // get current state of scheduling globals
3345  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3346  KMP_DEBUG_ASSERT(root);
3347  KMP_ASSERT(!root->r.r_begin);
3348 
3349  /* setup the root state structure */
3350  __kmp_init_lock(&root->r.r_begin_lock);
3351  root->r.r_begin = FALSE;
3352  root->r.r_active = FALSE;
3353  root->r.r_in_parallel = 0;
3354  root->r.r_blocktime = __kmp_dflt_blocktime;
3355 #if KMP_AFFINITY_SUPPORTED
3356  root->r.r_affinity_assigned = FALSE;
3357 #endif
3358 
3359  /* setup the root team for this task */
3360  /* allocate the root team structure */
3361  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3362 
3363  root_team =
3364  __kmp_allocate_team(root,
3365  1, // new_nproc
3366  1, // max_nproc
3367 #if OMPT_SUPPORT
3368  ompt_data_none, // root parallel id
3369 #endif
3370  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3371  0 // argc
3372  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3373  );
3374 #if USE_DEBUGGER
3375  // Non-NULL value should be assigned to make the debugger display the root
3376  // team.
3377  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3378 #endif
3379 
3380  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3381 
3382  root->r.r_root_team = root_team;
3383  root_team->t.t_control_stack_top = NULL;
3384 
3385  /* initialize root team */
3386  root_team->t.t_threads[0] = NULL;
3387  root_team->t.t_nproc = 1;
3388  root_team->t.t_serialized = 1;
3389  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3390  root_team->t.t_sched.sched = r_sched.sched;
3391  KA_TRACE(
3392  20,
3393  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3394  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3395 
3396  /* setup the hot team for this task */
3397  /* allocate the hot team structure */
3398  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3399 
3400  hot_team =
3401  __kmp_allocate_team(root,
3402  1, // new_nproc
3403  __kmp_dflt_team_nth_ub * 2, // max_nproc
3404 #if OMPT_SUPPORT
3405  ompt_data_none, // root parallel id
3406 #endif
3407  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3408  0 // argc
3409  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3410  );
3411  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3412 
3413  root->r.r_hot_team = hot_team;
3414  root_team->t.t_control_stack_top = NULL;
3415 
3416  /* first-time initialization */
3417  hot_team->t.t_parent = root_team;
3418 
3419  /* initialize hot team */
3420  hot_team_max_nth = hot_team->t.t_max_nproc;
3421  for (f = 0; f < hot_team_max_nth; ++f) {
3422  hot_team->t.t_threads[f] = NULL;
3423  }
3424  hot_team->t.t_nproc = 1;
3425  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3426  hot_team->t.t_sched.sched = r_sched.sched;
3427  hot_team->t.t_size_changed = 0;
3428 }
3429 
3430 #ifdef KMP_DEBUG
3431 
3432 typedef struct kmp_team_list_item {
3433  kmp_team_p const *entry;
3434  struct kmp_team_list_item *next;
3435 } kmp_team_list_item_t;
3436 typedef kmp_team_list_item_t *kmp_team_list_t;
3437 
3438 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3439  kmp_team_list_t list, // List of teams.
3440  kmp_team_p const *team // Team to add.
3441 ) {
3442 
3443  // List must terminate with item where both entry and next are NULL.
3444  // Team is added to the list only once.
3445  // List is sorted in ascending order by team id.
3446  // Team id is *not* a key.
3447 
3448  kmp_team_list_t l;
3449 
3450  KMP_DEBUG_ASSERT(list != NULL);
3451  if (team == NULL) {
3452  return;
3453  }
3454 
3455  __kmp_print_structure_team_accum(list, team->t.t_parent);
3456  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3457 
3458  // Search list for the team.
3459  l = list;
3460  while (l->next != NULL && l->entry != team) {
3461  l = l->next;
3462  }
3463  if (l->next != NULL) {
3464  return; // Team has been added before, exit.
3465  }
3466 
3467  // Team is not found. Search list again for insertion point.
3468  l = list;
3469  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3470  l = l->next;
3471  }
3472 
3473  // Insert team.
3474  {
3475  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3476  sizeof(kmp_team_list_item_t));
3477  *item = *l;
3478  l->entry = team;
3479  l->next = item;
3480  }
3481 }
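
// Illustrative sketch (not part of the original source): the list discipline
// used by __kmp_print_structure_team_accum(), with plain ints standing in for
// teams. The list is kept sorted ascending, is terminated by a sentinel node
// whose next pointer is NULL (created by the caller), suppresses duplicates,
// and inserts by copying the current node's contents one slot down.
#if 0
#include <stdlib.h>
typedef struct sorted_item_sketch {
  int key; // stands in for team->t.t_id; unused in the terminating sentinel
  struct sorted_item_sketch *next; // NULL only in the terminating sentinel
} sorted_item_sketch_t;

static void __kmp_sorted_insert_once_sketch(sorted_item_sketch_t *list,
                                            int key) {
  sorted_item_sketch_t *l = list;
  while (l->next != NULL && l->key != key)
    l = l->next;
  if (l->next != NULL)
    return; // already present, nothing to do
  for (l = list; l->next != NULL && l->key <= key; l = l->next)
    ; // find insertion point, keeping ascending order
  sorted_item_sketch_t *item =
      (sorted_item_sketch_t *)malloc(sizeof(sorted_item_sketch_t));
  *item = *l;     // push the old contents (possibly the sentinel) down
  l->key = key;
  l->next = item;
}
#endif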
3482 
3483 static void __kmp_print_structure_team(char const *title,
3484  kmp_team_p const *team) {
3486  __kmp_printf("%s", title);
3487  if (team != NULL) {
3488  __kmp_printf("%2x %p\n", team->t.t_id, team);
3489  } else {
3490  __kmp_printf(" - (nil)\n");
3491  }
3492 }
3493 
3494 static void __kmp_print_structure_thread(char const *title,
3495  kmp_info_p const *thread) {
3496  __kmp_printf("%s", title);
3497  if (thread != NULL) {
3498  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3499  } else {
3500  __kmp_printf(" - (nil)\n");
3501  }
3502 }
3503 
3504 void __kmp_print_structure(void) {
3505 
3506  kmp_team_list_t list;
3507 
3508  // Initialize list of teams.
3509  list =
3510  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3511  list->entry = NULL;
3512  list->next = NULL;
3513 
3514  __kmp_printf("\n------------------------------\nGlobal Thread "
3515  "Table\n------------------------------\n");
3516  {
3517  int gtid;
3518  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3519  __kmp_printf("%2d", gtid);
3520  if (__kmp_threads != NULL) {
3521  __kmp_printf(" %p", __kmp_threads[gtid]);
3522  }
3523  if (__kmp_root != NULL) {
3524  __kmp_printf(" %p", __kmp_root[gtid]);
3525  }
3526  __kmp_printf("\n");
3527  }
3528  }
3529 
3530  // Print out __kmp_threads array.
3531  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3532  "----------\n");
3533  if (__kmp_threads != NULL) {
3534  int gtid;
3535  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3536  kmp_info_t const *thread = __kmp_threads[gtid];
3537  if (thread != NULL) {
3538  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3539  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3540  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3541  __kmp_print_structure_team(" Serial Team: ",
3542  thread->th.th_serial_team);
3543  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3544  __kmp_print_structure_thread(" Primary: ",
3545  thread->th.th_team_master);
3546  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3547  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3548  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3549  __kmp_print_structure_thread(" Next in pool: ",
3550  thread->th.th_next_pool);
3551  __kmp_printf("\n");
3552  __kmp_print_structure_team_accum(list, thread->th.th_team);
3553  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3554  }
3555  }
3556  } else {
3557  __kmp_printf("Threads array is not allocated.\n");
3558  }
3559 
3560  // Print out __kmp_root array.
3561  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3562  "--------\n");
3563  if (__kmp_root != NULL) {
3564  int gtid;
3565  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3566  kmp_root_t const *root = __kmp_root[gtid];
3567  if (root != NULL) {
3568  __kmp_printf("GTID %2d %p:\n", gtid, root);
3569  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3570  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3571  __kmp_print_structure_thread(" Uber Thread: ",
3572  root->r.r_uber_thread);
3573  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3574  __kmp_printf(" In Parallel: %2d\n",
3575  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3576  __kmp_printf("\n");
3577  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3578  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3579  }
3580  }
3581  } else {
3582  __kmp_printf("Ubers array is not allocated.\n");
3583  }
3584 
3585  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3586  "--------\n");
3587  while (list->next != NULL) {
3588  kmp_team_p const *team = list->entry;
3589  int i;
3590  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3591  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3592  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3593  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3594  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3595  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3596  for (i = 0; i < team->t.t_nproc; ++i) {
3597  __kmp_printf(" Thread %2d: ", i);
3598  __kmp_print_structure_thread("", team->t.t_threads[i]);
3599  }
3600  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3601  __kmp_printf("\n");
3602  list = list->next;
3603  }
3604 
3605  // Print out __kmp_thread_pool and __kmp_team_pool.
3606  __kmp_printf("\n------------------------------\nPools\n----------------------"
3607  "--------\n");
3608  __kmp_print_structure_thread("Thread pool: ",
3609  CCAST(kmp_info_t *, __kmp_thread_pool));
3610  __kmp_print_structure_team("Team pool: ",
3611  CCAST(kmp_team_t *, __kmp_team_pool));
3612  __kmp_printf("\n");
3613 
3614  // Free team list.
3615  while (list != NULL) {
3616  kmp_team_list_item_t *item = list;
3617  list = list->next;
3618  KMP_INTERNAL_FREE(item);
3619  }
3620 }
3621 
3622 #endif
3623 
3624 //---------------------------------------------------------------------------
3625 // Stuff for per-thread fast random number generator
3626 // Table of primes
3627 static const unsigned __kmp_primes[] = {
3628  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3629  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3630  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3631  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3632  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3633  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3634  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3635  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3636  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3637  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3638  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3639 
3640 //---------------------------------------------------------------------------
3641 // __kmp_get_random: Get a random number using a linear congruential method.
3642 unsigned short __kmp_get_random(kmp_info_t *thread) {
3643  unsigned x = thread->th.th_x;
3644  unsigned short r = (unsigned short)(x >> 16);
3645 
3646  thread->th.th_x = x * thread->th.th_a + 1;
3647 
3648  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3649  thread->th.th_info.ds.ds_tid, r));
3650 
3651  return r;
3652 }
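
// Illustrative sketch (not part of the original source): the recurrence
// implemented above is a 32-bit linear congruential generator
//   x_{n+1} = a * x_n + 1 (mod 2^32),
// where the multiplier 'a' is a per-thread prime from __kmp_primes. Only the
// upper 16 bits of the previous state are returned, since the low-order bits
// of a power-of-two-modulus LCG have short periods.
#if 0
static unsigned short __kmp_lcg_next_sketch(unsigned *state, unsigned a) {
  unsigned x = *state;
  *state = x * a + 1; // wraps mod 2^32 via unsigned overflow
  return (unsigned short)(x >> 16);
}
#endif
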
3653 //--------------------------------------------------------
3654 // __kmp_init_random: Initialize a random number generator
3655 void __kmp_init_random(kmp_info_t *thread) {
3656  unsigned seed = thread->th.th_info.ds.ds_tid;
3657 
3658  thread->th.th_a =
3659  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3660  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3661  KA_TRACE(30,
3662  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3663 }
3664 
3665 #if KMP_OS_WINDOWS
3666 /* reclaim array entries for root threads that are already dead, returns number
3667  * reclaimed */
3668 static int __kmp_reclaim_dead_roots(void) {
3669  int i, r = 0;
3670 
3671  for (i = 0; i < __kmp_threads_capacity; ++i) {
3672  if (KMP_UBER_GTID(i) &&
3673  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3674  !__kmp_root[i]
3675  ->r.r_active) { // AC: reclaim only roots died in non-active state
3676  r += __kmp_unregister_root_other_thread(i);
3677  }
3678  }
3679  return r;
3680 }
3681 #endif
3682 
3683 /* This function attempts to create free entries in __kmp_threads and
3684  __kmp_root, and returns the number of free entries generated.
3685 
3686  For Windows* OS static library, the first mechanism used is to reclaim array
3687  entries for root threads that are already dead.
3688 
3689  On all platforms, expansion is attempted on the arrays __kmp_threads and
3690  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3691  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3692  threadprivate cache array has been created. Synchronization with
3693  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3694 
3695  After any dead root reclamation, if the clipping value allows array expansion
3696  to result in the generation of a total of nNeed free slots, the function does
3697  that expansion. If not, nothing is done beyond the possible initial root
3698  thread reclamation.
3699 
3700  If any argument is negative, the behavior is undefined. */
3701 static int __kmp_expand_threads(int nNeed) {
3702  int added = 0;
3703  int minimumRequiredCapacity;
3704  int newCapacity;
3705  kmp_info_t **newThreads;
3706  kmp_root_t **newRoot;
3707 
3708  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3709  // resizing __kmp_threads does not need additional protection if foreign
3710  // threads are present
3711 
3712 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3713  /* only for Windows static library */
3714  /* reclaim array entries for root threads that are already dead */
3715  added = __kmp_reclaim_dead_roots();
3716 
3717  if (nNeed) {
3718  nNeed -= added;
3719  if (nNeed < 0)
3720  nNeed = 0;
3721  }
3722 #endif
3723  if (nNeed <= 0)
3724  return added;
3725 
3726  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3727  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3728  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3729  // > __kmp_max_nth in one of two ways:
3730  //
3731  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3732  // may not be reused by another thread, so we may need to increase
3733  // __kmp_threads_capacity to __kmp_max_nth + 1.
3734  //
3735  // 2) New foreign root(s) are encountered. We always register new foreign
3736  // roots. This may cause a smaller # of threads to be allocated at
3737  // subsequent parallel regions, but the worker threads hang around (and
3738  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3739  //
3740  // Anyway, that is the reason for moving the check to see if
3741  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3742  // instead of having it performed here. -BB
3743 
3744  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3745 
3746  /* compute expansion headroom to check if we can expand */
3747  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3748  /* possible expansion too small -- give up */
3749  return added;
3750  }
3751  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3752 
3753  newCapacity = __kmp_threads_capacity;
3754  do {
3755  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3756  : __kmp_sys_max_nth;
3757  } while (newCapacity < minimumRequiredCapacity);
3758  newThreads = (kmp_info_t **)__kmp_allocate(
3759  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3760  newRoot =
3761  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3762  KMP_MEMCPY(newThreads, __kmp_threads,
3763  __kmp_threads_capacity * sizeof(kmp_info_t *));
3764  KMP_MEMCPY(newRoot, __kmp_root,
3765  __kmp_threads_capacity * sizeof(kmp_root_t *));
3766  // Put old __kmp_threads array on a list. Any ongoing references to the old
3767  // list will be valid. This list is cleaned up at library shutdown.
3768  kmp_old_threads_list_t *node =
3769  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3770  node->threads = __kmp_threads;
3771  node->next = __kmp_old_threads_list;
3772  __kmp_old_threads_list = node;
3773 
3774  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3775  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3776  added += newCapacity - __kmp_threads_capacity;
3777  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3778 
3779  if (newCapacity > __kmp_tp_capacity) {
3780  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3781  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3782  __kmp_threadprivate_resize_cache(newCapacity);
3783  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3784  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3785  }
3786  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3787  }
3788 
3789  return added;
3790 }
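
// Illustrative sketch (not part of the original source): the growth rule used
// above. Capacity doubles until it covers the requested minimum, but each step
// is clipped to __kmp_sys_max_nth, so a single expansion can never push the
// arrays past the system-wide thread limit.
#if 0
static int __kmp_grow_capacity_sketch(int current, int minimum_required,
                                      int sys_max) {
  int cap = current;
  do {
    cap = (cap <= (sys_max >> 1)) ? (cap << 1) : sys_max;
  } while (cap < minimum_required);
  return cap;
}
#endif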
3791 
3792 /* Register the current thread as a root thread and obtain our gtid. We must
3793  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3794  thread that calls from __kmp_do_serial_initialize() */
3795 int __kmp_register_root(int initial_thread) {
3796  kmp_info_t *root_thread;
3797  kmp_root_t *root;
3798  int gtid;
3799  int capacity;
3800  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3801  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3802  KMP_MB();
3803 
3804  /* 2007-03-02:
3805  If the initial thread has not invoked the OpenMP RTL yet, and this thread is
3806  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3807  does not work as expected -- it may return false (meaning there is at least
3808  one empty slot in the __kmp_threads array), but it is possible that the only
3809  free slot is #0, which is reserved for the initial thread and so cannot be
3810  used for this one. The following code works around this bug.
3811 
3812  However, the right solution seems to be to not reserve slot #0 for the
3813  initial thread, because:
3814  (1) there is no magic in slot #0,
3815  (2) we cannot detect the initial thread reliably (the first thread which
3816  does serial initialization may not be a real initial thread).
3817  */
3818  capacity = __kmp_threads_capacity;
3819  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3820  --capacity;
3821  }
3822 
3823  // If it is not for initializing the hidden helper team, we need to take
3824  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3825  // in __kmp_threads_capacity.
3826  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3827  capacity -= __kmp_hidden_helper_threads_num;
3828  }
3829 
3830  /* see if there are too many threads */
3831  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3832  if (__kmp_tp_cached) {
3833  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3834  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3835  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3836  } else {
3837  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3838  __kmp_msg_null);
3839  }
3840  }
3841 
3842  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3843  // 0: initial thread, also a regular OpenMP thread.
3844  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3845  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3846  // regular OpenMP threads.
3847  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3848  // Find an available thread slot for hidden helper thread. Slots for hidden
3849  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3850  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3851  gtid <= __kmp_hidden_helper_threads_num;
3852  gtid++)
3853  ;
3854  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3855  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3856  "hidden helper thread: T#%d\n",
3857  gtid));
3858  } else {
3859  /* find an available thread slot */
3860  // Don't reassign the zero slot since we need that to only be used by
3861  // initial thread. Slots for hidden helper threads should also be skipped.
3862  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3863  gtid = 0;
3864  } else {
3865  for (gtid = __kmp_hidden_helper_threads_num + 1;
3866  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3867  ;
3868  }
3869  KA_TRACE(
3870  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3871  KMP_ASSERT(gtid < __kmp_threads_capacity);
3872  }
3873 
3874  /* update global accounting */
3875  __kmp_all_nth++;
3876  TCW_4(__kmp_nth, __kmp_nth + 1);
3877 
3878  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3879  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3880  if (__kmp_adjust_gtid_mode) {
3881  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3882  if (TCR_4(__kmp_gtid_mode) != 2) {
3883  TCW_4(__kmp_gtid_mode, 2);
3884  }
3885  } else {
3886  if (TCR_4(__kmp_gtid_mode) != 1) {
3887  TCW_4(__kmp_gtid_mode, 1);
3888  }
3889  }
3890  }
3891 
3892 #ifdef KMP_ADJUST_BLOCKTIME
3893  /* Adjust blocktime to zero if necessary */
3894  /* Middle initialization might not have occurred yet */
3895  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3896  if (__kmp_nth > __kmp_avail_proc) {
3897  __kmp_zero_bt = TRUE;
3898  }
3899  }
3900 #endif /* KMP_ADJUST_BLOCKTIME */
3901 
3902  /* setup this new hierarchy */
3903  if (!(root = __kmp_root[gtid])) {
3904  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3905  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3906  }
3907 
3908 #if KMP_STATS_ENABLED
3909  // Initialize stats as soon as possible (right after gtid assignment).
3910  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3911  __kmp_stats_thread_ptr->startLife();
3912  KMP_SET_THREAD_STATE(SERIAL_REGION);
3913  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3914 #endif
3915  __kmp_initialize_root(root);
3916 
3917  /* setup new root thread structure */
3918  if (root->r.r_uber_thread) {
3919  root_thread = root->r.r_uber_thread;
3920  } else {
3921  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3922  if (__kmp_storage_map) {
3923  __kmp_print_thread_storage_map(root_thread, gtid);
3924  }
3925  root_thread->th.th_info.ds.ds_gtid = gtid;
3926 #if OMPT_SUPPORT
3927  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3928 #endif
3929  root_thread->th.th_root = root;
3930  if (__kmp_env_consistency_check) {
3931  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3932  }
3933 #if USE_FAST_MEMORY
3934  __kmp_initialize_fast_memory(root_thread);
3935 #endif /* USE_FAST_MEMORY */
3936 
3937 #if KMP_USE_BGET
3938  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3939  __kmp_initialize_bget(root_thread);
3940 #endif
3941  __kmp_init_random(root_thread); // Initialize random number generator
3942  }
3943 
3944  /* setup the serial team held in reserve by the root thread */
3945  if (!root_thread->th.th_serial_team) {
3946  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3947  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3948  root_thread->th.th_serial_team = __kmp_allocate_team(
3949  root, 1, 1,
3950 #if OMPT_SUPPORT
3951  ompt_data_none, // root parallel id
3952 #endif
3953  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3954  }
3955  KMP_ASSERT(root_thread->th.th_serial_team);
3956  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3957  root_thread->th.th_serial_team));
3958 
3959  /* drop root_thread into place */
3960  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3961 
3962  root->r.r_root_team->t.t_threads[0] = root_thread;
3963  root->r.r_hot_team->t.t_threads[0] = root_thread;
3964  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3965  // AC: the team is created in reserve, not for execution (it is unused for now).
3966  root_thread->th.th_serial_team->t.t_serialized = 0;
3967  root->r.r_uber_thread = root_thread;
3968 
3969  /* initialize the thread, get it ready to go */
3970  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3971  TCW_4(__kmp_init_gtid, TRUE);
3972 
3973  /* prepare the primary thread for get_gtid() */
3974  __kmp_gtid_set_specific(gtid);
3975 
3976 #if USE_ITT_BUILD
3977  __kmp_itt_thread_name(gtid);
3978 #endif /* USE_ITT_BUILD */
3979 
3980 #ifdef KMP_TDATA_GTID
3981  __kmp_gtid = gtid;
3982 #endif
3983  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3984  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3985 
3986  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3987  "plain=%u\n",
3988  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3989  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3990  KMP_INIT_BARRIER_STATE));
3991  { // Initialize barrier data.
3992  int b;
3993  for (b = 0; b < bs_last_barrier; ++b) {
3994  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3995 #if USE_DEBUGGER
3996  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3997 #endif
3998  }
3999  }
4000  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4001  KMP_INIT_BARRIER_STATE);
4002 
4003 #if KMP_AFFINITY_SUPPORTED
4004  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4005  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4006  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4007  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4008 #endif /* KMP_AFFINITY_SUPPORTED */
4009  root_thread->th.th_def_allocator = __kmp_def_allocator;
4010  root_thread->th.th_prev_level = 0;
4011  root_thread->th.th_prev_num_threads = 1;
4012 
4013  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4014  tmp->cg_root = root_thread;
4015  tmp->cg_thread_limit = __kmp_cg_max_nth;
4016  tmp->cg_nthreads = 1;
4017  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4018  " cg_nthreads init to 1\n",
4019  root_thread, tmp));
4020  tmp->up = NULL;
4021  root_thread->th.th_cg_roots = tmp;
4022 
4023  __kmp_root_counter++;
4024 
4025 #if OMPT_SUPPORT
4026  if (!initial_thread && ompt_enabled.enabled) {
4027 
4028  kmp_info_t *root_thread = ompt_get_thread();
4029 
4030  ompt_set_thread_state(root_thread, ompt_state_overhead);
4031 
4032  if (ompt_enabled.ompt_callback_thread_begin) {
4033  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4034  ompt_thread_initial, __ompt_get_thread_data_internal());
4035  }
4036  ompt_data_t *task_data;
4037  ompt_data_t *parallel_data;
4038  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4039  NULL);
4040  if (ompt_enabled.ompt_callback_implicit_task) {
4041  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4042  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4043  }
4044 
4045  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4046  }
4047 #endif
4048 #if OMPD_SUPPORT
4049  if (ompd_state & OMPD_ENABLE_BP)
4050  ompd_bp_thread_begin();
4051 #endif
4052 
4053  KMP_MB();
4054  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4055 
4056  return gtid;
4057 }
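
// Illustrative sketch (not part of the original source): the gtid slot layout
// assumed by __kmp_register_root() when hidden helper threads are enabled.
// Slot 0 is reserved for the initial thread, slots
// 1..__kmp_hidden_helper_threads_num for hidden helper threads, and everything
// above that for regular roots and workers.
#if 0
static bool __kmp_is_hidden_helper_slot_sketch(int gtid) {
  return gtid >= 1 && gtid <= __kmp_hidden_helper_threads_num;
}
static int __kmp_first_regular_slot_sketch(void) {
  return __kmp_hidden_helper_threads_num + 1;
}
#endif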
4058 
4059 #if KMP_NESTED_HOT_TEAMS
4060 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4061  const int max_level) {
4062  int i, n, nth;
4063  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4064  if (!hot_teams || !hot_teams[level].hot_team) {
4065  return 0;
4066  }
4067  KMP_DEBUG_ASSERT(level < max_level);
4068  kmp_team_t *team = hot_teams[level].hot_team;
4069  nth = hot_teams[level].hot_team_nth;
4070  n = nth - 1; // primary thread is not freed
4071  if (level < max_level - 1) {
4072  for (i = 0; i < nth; ++i) {
4073  kmp_info_t *th = team->t.t_threads[i];
4074  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4075  if (i > 0 && th->th.th_hot_teams) {
4076  __kmp_free(th->th.th_hot_teams);
4077  th->th.th_hot_teams = NULL;
4078  }
4079  }
4080  }
4081  __kmp_free_team(root, team, NULL);
4082  return n;
4083 }
4084 #endif
4085 
4086 // Resets a root thread and clears its root and hot teams.
4087 // Returns the number of __kmp_threads entries directly and indirectly freed.
4088 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4089  kmp_team_t *root_team = root->r.r_root_team;
4090  kmp_team_t *hot_team = root->r.r_hot_team;
4091  int n = hot_team->t.t_nproc;
4092  int i;
4093 
4094  KMP_DEBUG_ASSERT(!root->r.r_active);
4095 
4096  root->r.r_root_team = NULL;
4097  root->r.r_hot_team = NULL;
4098  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4099  // before call to __kmp_free_team().
4100  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4101 #if KMP_NESTED_HOT_TEAMS
4102  if (__kmp_hot_teams_max_level >
4103  0) { // need to free nested hot teams and their threads if any
4104  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4105  kmp_info_t *th = hot_team->t.t_threads[i];
4106  if (__kmp_hot_teams_max_level > 1) {
4107  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4108  }
4109  if (th->th.th_hot_teams) {
4110  __kmp_free(th->th.th_hot_teams);
4111  th->th.th_hot_teams = NULL;
4112  }
4113  }
4114  }
4115 #endif
4116  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4117 
4118  // Before we can reap the thread, we need to make certain that all other
4119  // threads in the teams that had this root as ancestor have stopped trying to
4120  // steal tasks.
4121  if (__kmp_tasking_mode != tskm_immediate_exec) {
4122  __kmp_wait_to_unref_task_teams();
4123  }
4124 
4125 #if KMP_OS_WINDOWS
4126  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4127  KA_TRACE(
4128  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4129  "\n",
4130  (LPVOID) & (root->r.r_uber_thread->th),
4131  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4132  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4133 #endif /* KMP_OS_WINDOWS */
4134 
4135 #if OMPD_SUPPORT
4136  if (ompd_state & OMPD_ENABLE_BP)
4137  ompd_bp_thread_end();
4138 #endif
4139 
4140 #if OMPT_SUPPORT
4141  ompt_data_t *task_data;
4142  ompt_data_t *parallel_data;
4143  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4144  NULL);
4145  if (ompt_enabled.ompt_callback_implicit_task) {
4146  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4147  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4148  }
4149  if (ompt_enabled.ompt_callback_thread_end) {
4150  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4151  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4152  }
4153 #endif
4154 
4155  TCW_4(__kmp_nth,
4156  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4157  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4158  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4159  " to %d\n",
4160  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4161  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4162  if (i == 1) {
4163  // need to free contention group structure
4164  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4165  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4166  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4167  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4168  root->r.r_uber_thread->th.th_cg_roots = NULL;
4169  }
4170  __kmp_reap_thread(root->r.r_uber_thread, 1);
4171 
4172  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4173  // it instead of freeing it.
4174  root->r.r_uber_thread = NULL;
4175  /* mark root as no longer in use */
4176  root->r.r_begin = FALSE;
4177 
4178  return n;
4179 }
4180 
4181 void __kmp_unregister_root_current_thread(int gtid) {
4182  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4183  /* this lock should be ok, since unregister_root_current_thread is never
4184  called during an abort, only during a normal close. furthermore, if you
4185  have the forkjoin lock, you should never try to get the initz lock */
4186  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4187  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4188  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4189  "exiting T#%d\n",
4190  gtid));
4191  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4192  return;
4193  }
4194  kmp_root_t *root = __kmp_root[gtid];
4195 
4196  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4197  KMP_ASSERT(KMP_UBER_GTID(gtid));
4198  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4199  KMP_ASSERT(root->r.r_active == FALSE);
4200 
4201  KMP_MB();
4202 
4203  kmp_info_t *thread = __kmp_threads[gtid];
4204  kmp_team_t *team = thread->th.th_team;
4205  kmp_task_team_t *task_team = thread->th.th_task_team;
4206 
4207  // we need to wait for the proxy tasks before finishing the thread
4208  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4209  task_team->tt.tt_hidden_helper_task_encountered)) {
4210 #if OMPT_SUPPORT
4211  // the runtime is shutting down so we won't report any events
4212  thread->th.ompt_thread_info.state = ompt_state_undefined;
4213 #endif
4214  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4215  }
4216 
4217  __kmp_reset_root(gtid, root);
4218 
4219  KMP_MB();
4220  KC_TRACE(10,
4221  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4222 
4223  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4224 }
4225 
4226 #if KMP_OS_WINDOWS
4227 /* __kmp_forkjoin_lock must be already held
4228  Unregisters a root thread that is not the current thread. Returns the number
4229  of __kmp_threads entries freed as a result. */
4230 static int __kmp_unregister_root_other_thread(int gtid) {
4231  kmp_root_t *root = __kmp_root[gtid];
4232  int r;
4233 
4234  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4235  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4236  KMP_ASSERT(KMP_UBER_GTID(gtid));
4237  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4238  KMP_ASSERT(root->r.r_active == FALSE);
4239 
4240  r = __kmp_reset_root(gtid, root);
4241  KC_TRACE(10,
4242  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4243  return r;
4244 }
4245 #endif
4246 
4247 #if KMP_DEBUG
4248 void __kmp_task_info() {
4249 
4250  kmp_int32 gtid = __kmp_entry_gtid();
4251  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4252  kmp_info_t *this_thr = __kmp_threads[gtid];
4253  kmp_team_t *steam = this_thr->th.th_serial_team;
4254  kmp_team_t *team = this_thr->th.th_team;
4255 
4256  __kmp_printf(
4257  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4258  "ptask=%p\n",
4259  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4260  team->t.t_implicit_task_taskdata[tid].td_parent);
4261 }
4262 #endif // KMP_DEBUG
4263 
4264 /* TODO optimize with one big memclr, take out what isn't needed, split
4265  responsibility to workers as much as possible, and delay initialization of
4266  features as much as possible */
4267 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4268  int tid, int gtid) {
4269  /* this_thr->th.th_info.ds.ds_gtid is setup in
4270  kmp_allocate_thread/create_worker.
4271  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4272  KMP_DEBUG_ASSERT(this_thr != NULL);
4273  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4274  KMP_DEBUG_ASSERT(team);
4275  KMP_DEBUG_ASSERT(team->t.t_threads);
4276  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4277  kmp_info_t *master = team->t.t_threads[0];
4278  KMP_DEBUG_ASSERT(master);
4279  KMP_DEBUG_ASSERT(master->th.th_root);
4280 
4281  KMP_MB();
4282 
4283  TCW_SYNC_PTR(this_thr->th.th_team, team);
4284 
4285  this_thr->th.th_info.ds.ds_tid = tid;
4286  this_thr->th.th_set_nproc = 0;
4287  if (__kmp_tasking_mode != tskm_immediate_exec)
4288  // When tasking is possible, threads are not safe to reap until they are
4289  // done tasking; this will be set when tasking code is exited in wait
4290  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4291  else // no tasking --> always safe to reap
4292  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4293  this_thr->th.th_set_proc_bind = proc_bind_default;
4294 #if KMP_AFFINITY_SUPPORTED
4295  this_thr->th.th_new_place = this_thr->th.th_current_place;
4296 #endif
4297  this_thr->th.th_root = master->th.th_root;
4298 
4299  /* setup the thread's cache of the team structure */
4300  this_thr->th.th_team_nproc = team->t.t_nproc;
4301  this_thr->th.th_team_master = master;
4302  this_thr->th.th_team_serialized = team->t.t_serialized;
4303 
4304  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4305 
4306  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4307  tid, gtid, this_thr, this_thr->th.th_current_task));
4308 
4309  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4310  team, tid, TRUE);
4311 
4312  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4313  tid, gtid, this_thr, this_thr->th.th_current_task));
4314  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4315  // __kmp_initialize_team()?
4316 
4317  /* TODO no worksharing in speculative threads */
4318  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4319 
4320  this_thr->th.th_local.this_construct = 0;
4321 
4322  if (!this_thr->th.th_pri_common) {
4323  this_thr->th.th_pri_common =
4324  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4325  if (__kmp_storage_map) {
4326  __kmp_print_storage_map_gtid(
4327  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4328  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4329  }
4330  this_thr->th.th_pri_head = NULL;
4331  }
4332 
4333  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4334  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4335  // Make new thread's CG root same as primary thread's
4336  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4337  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4338  if (tmp) {
4339  // worker changes CG, need to check if old CG should be freed
4340  int i = tmp->cg_nthreads--;
4341  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4342  " on node %p of thread %p to %d\n",
4343  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4344  if (i == 1) {
4345  __kmp_free(tmp); // last thread left CG --> free it
4346  }
4347  }
4348  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4349  // Increment new thread's CG root's counter to add the new thread
4350  this_thr->th.th_cg_roots->cg_nthreads++;
4351  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4352  " node %p of thread %p to %d\n",
4353  this_thr, this_thr->th.th_cg_roots,
4354  this_thr->th.th_cg_roots->cg_root,
4355  this_thr->th.th_cg_roots->cg_nthreads));
4356  this_thr->th.th_current_task->td_icvs.thread_limit =
4357  this_thr->th.th_cg_roots->cg_thread_limit;
4358  }
4359 
4360  /* Initialize dynamic dispatch */
4361  {
4362  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4363  // Use team max_nproc since this will never change for the team.
4364  size_t disp_size =
4365  sizeof(dispatch_private_info_t) *
4366  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4367  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4368  team->t.t_max_nproc));
4369  KMP_ASSERT(dispatch);
4370  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4371  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4372 
4373  dispatch->th_disp_index = 0;
4374  dispatch->th_doacross_buf_idx = 0;
4375  if (!dispatch->th_disp_buffer) {
4376  dispatch->th_disp_buffer =
4377  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4378 
4379  if (__kmp_storage_map) {
4380  __kmp_print_storage_map_gtid(
4381  gtid, &dispatch->th_disp_buffer[0],
4382  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4383  ? 1
4384  : __kmp_dispatch_num_buffers],
4385  disp_size,
4386  "th_%d.th_dispatch.th_disp_buffer "
4387  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4388  gtid, team->t.t_id, gtid);
4389  }
4390  } else {
4391  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4392  }
4393 
4394  dispatch->th_dispatch_pr_current = 0;
4395  dispatch->th_dispatch_sh_current = 0;
4396 
4397  dispatch->th_deo_fcn = 0; /* ORDERED */
4398  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4399  }
4400 
4401  this_thr->th.th_next_pool = NULL;
4402 
4403  if (!this_thr->th.th_task_state_memo_stack) {
4404  size_t i;
4405  this_thr->th.th_task_state_memo_stack =
4406  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4407  this_thr->th.th_task_state_top = 0;
4408  this_thr->th.th_task_state_stack_sz = 4;
4409  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4410  ++i) // zero init the stack
4411  this_thr->th.th_task_state_memo_stack[i] = 0;
4412  }
4413 
4414  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4415  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4416 
4417  KMP_MB();
4418 }
4419 
4420 /* Allocate a new thread for the requesting team. This is only called from
4421  within a forkjoin critical section. We will first try to get an available
4422  thread from the thread pool; if none is available, we will fork a new one,
4423  assuming we are able to create one. This should be assured, as the caller
4424  should have checked this first. */
4425 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4426  int new_tid) {
4427  kmp_team_t *serial_team;
4428  kmp_info_t *new_thr;
4429  int new_gtid;
4430 
4431  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4432  KMP_DEBUG_ASSERT(root && team);
4433 #if !KMP_NESTED_HOT_TEAMS
4434  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4435 #endif
4436  KMP_MB();
4437 
4438  /* first, try to get one from the thread pool */
4439  if (__kmp_thread_pool) {
4440  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4441  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4442  if (new_thr == __kmp_thread_pool_insert_pt) {
4443  __kmp_thread_pool_insert_pt = NULL;
4444  }
4445  TCW_4(new_thr->th.th_in_pool, FALSE);
4446  __kmp_suspend_initialize_thread(new_thr);
4447  __kmp_lock_suspend_mx(new_thr);
4448  if (new_thr->th.th_active_in_pool == TRUE) {
4449  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4450  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4451  new_thr->th.th_active_in_pool = FALSE;
4452  }
4453  __kmp_unlock_suspend_mx(new_thr);
4454 
4455  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4456  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4457  KMP_ASSERT(!new_thr->th.th_team);
4458  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4459 
4460  /* setup the thread structure */
4461  __kmp_initialize_info(new_thr, team, new_tid,
4462  new_thr->th.th_info.ds.ds_gtid);
4463  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4464 
4465  TCW_4(__kmp_nth, __kmp_nth + 1);
4466 
4467  new_thr->th.th_task_state = 0;
4468  new_thr->th.th_task_state_top = 0;
4469  new_thr->th.th_task_state_stack_sz = 4;
4470 
4471  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4472  // Make sure pool thread has transitioned to waiting on own thread struct
4473  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4474  // Thread activated in __kmp_allocate_team when increasing team size
4475  }
4476 
4477 #ifdef KMP_ADJUST_BLOCKTIME
4478  /* Adjust blocktime back to zero if necessary */
4479  /* Middle initialization might not have occurred yet */
4480  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4481  if (__kmp_nth > __kmp_avail_proc) {
4482  __kmp_zero_bt = TRUE;
4483  }
4484  }
4485 #endif /* KMP_ADJUST_BLOCKTIME */
4486 
4487 #if KMP_DEBUG
4488  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4489  // KMP_BARRIER_PARENT_FLAG.
4490  int b;
4491  kmp_balign_t *balign = new_thr->th.th_bar;
4492  for (b = 0; b < bs_last_barrier; ++b)
4493  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4494 #endif
4495 
4496  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4497  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4498 
4499  KMP_MB();
4500  return new_thr;
4501  }
4502 
4503  /* none available in the pool, so fork a new one */
4504  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4505  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4506 
4507 #if KMP_USE_MONITOR
4508  // If this is the first worker thread the RTL is creating, then also
4509  // launch the monitor thread. We try to do this as early as possible.
4510  if (!TCR_4(__kmp_init_monitor)) {
4511  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4512  if (!TCR_4(__kmp_init_monitor)) {
4513  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4514  TCW_4(__kmp_init_monitor, 1);
4515  __kmp_create_monitor(&__kmp_monitor);
4516  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4517 #if KMP_OS_WINDOWS
4518  // AC: wait until monitor has started. This is a fix for CQ232808.
4519  // The reason is that if the library is loaded/unloaded in a loop with
4520  // small (parallel) work in between, there is a high probability that the
4521  // monitor thread starts only after library shutdown. At shutdown it is
4522  // too late to cope with the problem: when the primary thread is in
4523  // DllMain (process detach), the monitor has no chance to start (it is
4524  // blocked), and the primary thread has no way to inform the monitor that
4525  // the library is gone, because all the memory the monitor could access
4526  // is about to be released/reset.
4527  while (TCR_4(__kmp_init_monitor) < 2) {
4528  KMP_YIELD(TRUE);
4529  }
4530  KF_TRACE(10, ("after monitor thread has started\n"));
4531 #endif
4532  }
4533  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4534  }
4535 #endif
4536 
4537  KMP_MB();
4538 
4539  {
4540  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4541  ? 1
4542  : __kmp_hidden_helper_threads_num + 1;
4543 
4544  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4545  ++new_gtid) {
4546  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4547  }
4548 
4549  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4550  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4551  }
4552  }
4553 
4554  /* allocate space for it. */
4555  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4556 
4557  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4558 
4559 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4560  // suppress race-condition detection on synchronization flags in debug mode;
4561  // this helps to analyze library internals by eliminating false positives
4562  __itt_suppress_mark_range(
4563  __itt_suppress_range, __itt_suppress_threading_errors,
4564  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4565  __itt_suppress_mark_range(
4566  __itt_suppress_range, __itt_suppress_threading_errors,
4567  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4568 #if KMP_OS_WINDOWS
4569  __itt_suppress_mark_range(
4570  __itt_suppress_range, __itt_suppress_threading_errors,
4571  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4572 #else
4573  __itt_suppress_mark_range(__itt_suppress_range,
4574  __itt_suppress_threading_errors,
4575  &new_thr->th.th_suspend_init_count,
4576  sizeof(new_thr->th.th_suspend_init_count));
4577 #endif
4578  // TODO: check if we need to also suppress b_arrived flags
4579  __itt_suppress_mark_range(__itt_suppress_range,
4580  __itt_suppress_threading_errors,
4581  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4582  sizeof(new_thr->th.th_bar[0].bb.b_go));
4583  __itt_suppress_mark_range(__itt_suppress_range,
4584  __itt_suppress_threading_errors,
4585  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4586  sizeof(new_thr->th.th_bar[1].bb.b_go));
4587  __itt_suppress_mark_range(__itt_suppress_range,
4588  __itt_suppress_threading_errors,
4589  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4590  sizeof(new_thr->th.th_bar[2].bb.b_go));
4591 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4592  if (__kmp_storage_map) {
4593  __kmp_print_thread_storage_map(new_thr, new_gtid);
4594  }
4595 
4596  // add the reserve serialized team, initialized from the team's primary thread
4597  {
4598  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4599  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4600  new_thr->th.th_serial_team = serial_team =
4601  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4602 #if OMPT_SUPPORT
4603  ompt_data_none, // root parallel id
4604 #endif
4605  proc_bind_default, &r_icvs,
4606  0 USE_NESTED_HOT_ARG(NULL));
4607  }
4608  KMP_ASSERT(serial_team);
4609  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4610  // for execution (it is unused for now).
4611  serial_team->t.t_threads[0] = new_thr;
4612  KF_TRACE(10,
4613  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4614  new_thr));
4615 
4616  /* setup the thread structures */
4617  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4618 
4619 #if USE_FAST_MEMORY
4620  __kmp_initialize_fast_memory(new_thr);
4621 #endif /* USE_FAST_MEMORY */
4622 
4623 #if KMP_USE_BGET
4624  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4625  __kmp_initialize_bget(new_thr);
4626 #endif
4627 
4628  __kmp_init_random(new_thr); // Initialize random number generator
4629 
4630  /* Initialize these only once when thread is grabbed for a team allocation */
4631  KA_TRACE(20,
4632  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4633  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4634 
4635  int b;
4636  kmp_balign_t *balign = new_thr->th.th_bar;
4637  for (b = 0; b < bs_last_barrier; ++b) {
4638  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4639  balign[b].bb.team = NULL;
4640  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4641  balign[b].bb.use_oncore_barrier = 0;
4642  }
4643 
4644  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4645  new_thr->th.th_sleep_loc_type = flag_unset;
4646 
4647  new_thr->th.th_spin_here = FALSE;
4648  new_thr->th.th_next_waiting = 0;
4649 #if KMP_OS_UNIX
4650  new_thr->th.th_blocking = false;
4651 #endif
4652 
4653 #if KMP_AFFINITY_SUPPORTED
4654  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4655  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4656  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4657  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4658 #endif
4659  new_thr->th.th_def_allocator = __kmp_def_allocator;
4660  new_thr->th.th_prev_level = 0;
4661  new_thr->th.th_prev_num_threads = 1;
4662 
4663  TCW_4(new_thr->th.th_in_pool, FALSE);
4664  new_thr->th.th_active_in_pool = FALSE;
4665  TCW_4(new_thr->th.th_active, TRUE);
4666 
4667  /* adjust the global counters */
4668  __kmp_all_nth++;
4669  __kmp_nth++;
4670 
4671  // If __kmp_adjust_gtid_mode is set, we use method #1 (sp search) for low
4672  // thread counts and method #2 (keyed API call) for higher thread counts.
4673  if (__kmp_adjust_gtid_mode) {
4674  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4675  if (TCR_4(__kmp_gtid_mode) != 2) {
4676  TCW_4(__kmp_gtid_mode, 2);
4677  }
4678  } else {
4679  if (TCR_4(__kmp_gtid_mode) != 1) {
4680  TCW_4(__kmp_gtid_mode, 1);
4681  }
4682  }
4683  }
4684 
4685 #ifdef KMP_ADJUST_BLOCKTIME
4686  /* Adjust blocktime back to zero if necessary */
4687  /* Middle initialization might not have occurred yet */
4688  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4689  if (__kmp_nth > __kmp_avail_proc) {
4690  __kmp_zero_bt = TRUE;
4691  }
4692  }
4693 #endif /* KMP_ADJUST_BLOCKTIME */
4694 
4695 #if KMP_AFFINITY_SUPPORTED
4696  // Set the affinity and topology information for new thread
4697  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4698 #endif
4699 
4700  /* actually fork it and create the new worker thread */
4701  KF_TRACE(
4702  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4703  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4704  KF_TRACE(10,
4705  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4706 
4707  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4708  new_gtid));
4709  KMP_MB();
4710  return new_thr;
4711 }
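// The pool-then-fork strategy above can be modeled, very roughly, by the
// standalone sketch below. It is illustrative only and kept out of the build
// with #if 0; simple_thread and simple_runtime are hypothetical stand-ins, not
// runtime types, and the real code additionally handles gtid assignment,
// barrier/affinity setup, and monitor startup.
#if 0 // illustrative sketch only; would build as its own small program
#include <cstdio>
#include <vector>

struct simple_thread { // hypothetical stand-in for kmp_info_t
  int gtid;
  bool in_pool;
};

struct simple_runtime { // hypothetical stand-in for the global thread pool
  std::vector<simple_thread *> pool; // like __kmp_thread_pool, sorted by gtid
  int next_gtid = 1;

  // Mirror of the high-level flow of __kmp_allocate_thread: reuse a pooled
  // thread if one exists, otherwise "fork" a new one.
  simple_thread *allocate_thread() {
    if (!pool.empty()) {
      simple_thread *t = pool.front(); // head of the sorted free list
      pool.erase(pool.begin());
      t->in_pool = false;
      return t; // reused: already has a gtid and an OS thread behind it
    }
    // No pooled thread: create a new one (the real code calls
    // __kmp_create_worker and registers the gtid in __kmp_threads).
    return new simple_thread{next_gtid++, false};
  }
};

int main() {
  simple_runtime rt;
  simple_thread *a = rt.allocate_thread(); // forked: pool is empty
  rt.pool.push_back(a);
  a->in_pool = true;
  simple_thread *b = rt.allocate_thread(); // reused from the pool
  std::printf("reused gtid %d (same object: %s)\n", b->gtid,
              a == b ? "yes" : "no");
  delete a;
  return 0;
}
#endif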
4712 
4713 /* Reinitialize team for reuse (see the "write only if changed" sketch after
4714  this function). The hot team code calls this routine at every fork barrier,
4715  so the EPCC barrier tests are extremely sensitive to changes in it, especially
4716  writes to the team struct, which cause a cache invalidation in all threads.
4717  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4718 static void __kmp_reinitialize_team(kmp_team_t *team,
4719  kmp_internal_control_t *new_icvs,
4720  ident_t *loc) {
4721  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4722  team->t.t_threads[0], team));
4723  KMP_DEBUG_ASSERT(team && new_icvs);
4724  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4725  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4726 
4727  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4728  // Copy ICVs to the primary thread's implicit taskdata
4729  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4730  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4731 
4732  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4733  team->t.t_threads[0], team));
4734 }
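// A minimal sketch of the "write only if the value changed" idiom that
// KMP_CHECK_UPDATE expresses in the routine above: skipping redundant stores
// keeps hot-team fields from dirtying cache lines that every worker reads at
// each barrier. Illustrative only; check_update is a hypothetical stand-in for
// the macro, not runtime code.
#if 0 // illustrative sketch only; would build as its own small program
#include <cstdio>

// Skip the store when the field already holds the desired value, so unchanged
// fields do not invalidate the cache line in other threads' caches.
template <typename T> static inline void check_update(T &field, T value) {
  if (field != value)
    field = value;
}

int main() {
  struct { int team_id; int sched; } team = {7, 2};
  check_update(team.sched, 2); // no store: the line stays clean elsewhere
  check_update(team.sched, 3); // value differs: the store happens
  std::printf("team %d sched %d\n", team.team_id, team.sched);
  return 0;
}
#endif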
4735 
4736 /* Initialize the team data structure.
4737  This assumes the t_threads and t_max_nproc are already set.
4738  Also, we don't touch the arguments */
4739 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4740  kmp_internal_control_t *new_icvs,
4741  ident_t *loc) {
4742  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4743 
4744  /* verify */
4745  KMP_DEBUG_ASSERT(team);
4746  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4747  KMP_DEBUG_ASSERT(team->t.t_threads);
4748  KMP_MB();
4749 
4750  team->t.t_master_tid = 0; /* not needed */
4751  /* team->t.t_master_bar; not needed */
4752  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4753  team->t.t_nproc = new_nproc;
4754 
4755  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4756  team->t.t_next_pool = NULL;
4757  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4758  * up hot team */
4759 
4760  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4761  team->t.t_invoke = NULL; /* not needed */
4762 
4763  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4764  team->t.t_sched.sched = new_icvs->sched.sched;
4765 
4766 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4767  team->t.t_fp_control_saved = FALSE; /* not needed */
4768  team->t.t_x87_fpu_control_word = 0; /* not needed */
4769  team->t.t_mxcsr = 0; /* not needed */
4770 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4771 
4772  team->t.t_construct = 0;
4773 
4774  team->t.t_ordered.dt.t_value = 0;
4775  team->t.t_master_active = FALSE;
4776 
4777 #ifdef KMP_DEBUG
4778  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4779 #endif
4780 #if KMP_OS_WINDOWS
4781  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4782 #endif
4783 
4784  team->t.t_control_stack_top = NULL;
4785 
4786  __kmp_reinitialize_team(team, new_icvs, loc);
4787 
4788  KMP_MB();
4789  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4790 }
4791 
4792 #if KMP_AFFINITY_SUPPORTED
4793 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4794  int first, int last, int newp) {
4795  th->th.th_first_place = first;
4796  th->th.th_last_place = last;
4797  th->th.th_new_place = newp;
4798  if (newp != th->th.th_current_place) {
4799  if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4800  team->t.t_display_affinity = 1;
4801  // Copy topology information associated with the new place
4802  th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4803  th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4804  }
4805 }
4806 
4807 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4808 // It calculates the worker + primary thread's partition based upon the parent
4809 // thread's partition, and binds each worker thread to a place in its partition.
4810 // The primary thread's partition should already include its current binding.
4811 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4812  // Do not partition places for the hidden helper team
4813  if (KMP_HIDDEN_HELPER_TEAM(team))
4814  return;
4815  // Copy the primary thread's place partition to the team struct
4816  kmp_info_t *master_th = team->t.t_threads[0];
4817  KMP_DEBUG_ASSERT(master_th != NULL);
4818  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4819  int first_place = master_th->th.th_first_place;
4820  int last_place = master_th->th.th_last_place;
4821  int masters_place = master_th->th.th_current_place;
4822  int num_masks = __kmp_affinity.num_masks;
4823  team->t.t_first_place = first_place;
4824  team->t.t_last_place = last_place;
4825 
4826  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4827  "bound to place %d partition = [%d,%d]\n",
4828  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4829  team->t.t_id, masters_place, first_place, last_place));
4830 
4831  switch (proc_bind) {
4832 
4833  case proc_bind_default:
4834  // Serial teams might have the proc_bind policy set to proc_bind_default.
4835  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4836  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4837  break;
4838 
4839  case proc_bind_primary: {
4840  int f;
4841  int n_th = team->t.t_nproc;
4842  for (f = 1; f < n_th; f++) {
4843  kmp_info_t *th = team->t.t_threads[f];
4844  KMP_DEBUG_ASSERT(th != NULL);
4845  __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4846 
4847  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4848  "partition = [%d,%d]\n",
4849  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4850  f, masters_place, first_place, last_place));
4851  }
4852  } break;
4853 
4854  case proc_bind_close: {
4855  int f;
4856  int n_th = team->t.t_nproc;
4857  int n_places;
4858  if (first_place <= last_place) {
4859  n_places = last_place - first_place + 1;
4860  } else {
4861  n_places = num_masks - first_place + last_place + 1;
4862  }
4863  if (n_th <= n_places) {
4864  int place = masters_place;
4865  for (f = 1; f < n_th; f++) {
4866  kmp_info_t *th = team->t.t_threads[f];
4867  KMP_DEBUG_ASSERT(th != NULL);
4868 
4869  if (place == last_place) {
4870  place = first_place;
4871  } else if (place == (num_masks - 1)) {
4872  place = 0;
4873  } else {
4874  place++;
4875  }
4876  __kmp_set_thread_place(team, th, first_place, last_place, place);
4877 
4878  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4879  "partition = [%d,%d]\n",
4880  __kmp_gtid_from_thread(team->t.t_threads[f]),
4881  team->t.t_id, f, place, first_place, last_place));
4882  }
4883  } else {
4884  int S, rem, gap, s_count;
4885  S = n_th / n_places;
4886  s_count = 0;
4887  rem = n_th - (S * n_places);
4888  gap = rem > 0 ? n_places / rem : n_places;
4889  int place = masters_place;
4890  int gap_ct = gap;
4891  for (f = 0; f < n_th; f++) {
4892  kmp_info_t *th = team->t.t_threads[f];
4893  KMP_DEBUG_ASSERT(th != NULL);
4894 
4895  __kmp_set_thread_place(team, th, first_place, last_place, place);
4896  s_count++;
4897 
4898  if ((s_count == S) && rem && (gap_ct == gap)) {
4899  // do nothing, add an extra thread to place on next iteration
4900  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4901  // we added an extra thread to this place; move to next place
4902  if (place == last_place) {
4903  place = first_place;
4904  } else if (place == (num_masks - 1)) {
4905  place = 0;
4906  } else {
4907  place++;
4908  }
4909  s_count = 0;
4910  gap_ct = 1;
4911  rem--;
4912  } else if (s_count == S) { // place full; don't add extra
4913  if (place == last_place) {
4914  place = first_place;
4915  } else if (place == (num_masks - 1)) {
4916  place = 0;
4917  } else {
4918  place++;
4919  }
4920  gap_ct++;
4921  s_count = 0;
4922  }
4923 
4924  KA_TRACE(100,
4925  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4926  "partition = [%d,%d]\n",
4927  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4928  th->th.th_new_place, first_place, last_place));
4929  }
4930  KMP_DEBUG_ASSERT(place == masters_place);
4931  }
4932  } break;
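// When there are more threads than places, the close policy above packs
// S = n_th / n_places threads into each place and hands one extra thread to
// 'rem' of the places, spaced 'gap' places apart. The standalone sketch below
// models only that counting logic under simplifying assumptions (the partition
// is taken to be [0, n_places-1] and only per-place counts are recorded);
// close_counts is a hypothetical name, not a runtime routine.
#if 0 // illustrative sketch only; would build as its own small program
#include <cstdio>
#include <vector>

static std::vector<int> close_counts(int n_th, int n_places) {
  std::vector<int> counts(n_places, 0);
  int S = n_th / n_places;
  int rem = n_th - S * n_places;
  int gap = rem > 0 ? n_places / rem : n_places;
  int place = 0, s_count = 0, gap_ct = gap;
  for (int f = 0; f < n_th; ++f) {
    counts[place]++;
    s_count++;
    if (s_count == S && rem && gap_ct == gap) {
      // hold the place: it receives one extra thread on the next iteration
    } else if (s_count == S + 1 && rem && gap_ct == gap) {
      place = (place + 1) % n_places; // extra thread placed; move on
      s_count = 0;
      gap_ct = 1;
      rem--;
    } else if (s_count == S) { // place is full; don't add an extra thread
      place = (place + 1) % n_places;
      gap_ct++;
      s_count = 0;
    }
  }
  return counts;
}

int main() {
  // 10 threads over 4 places: S=2, rem=2, gap=2, giving counts 3 2 3 2.
  for (int c : close_counts(10, 4))
    std::printf("%d ", c);
  std::printf("\n");
  return 0;
}
#endif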
4933 
4934  case proc_bind_spread: {
4935  int f;
4936  int n_th = team->t.t_nproc;
4937  int n_places;
4938  int thidx;
4939  if (first_place <= last_place) {
4940  n_places = last_place - first_place + 1;
4941  } else {
4942  n_places = num_masks - first_place + last_place + 1;
4943  }
4944  if (n_th <= n_places) {
4945  int place = -1;
4946 
4947  if (n_places != num_masks) {
4948  int S = n_places / n_th;
4949  int s_count, rem, gap, gap_ct;
4950 
4951  place = masters_place;
4952  rem = n_places - n_th * S;
4953  gap = rem ? n_th / rem : 1;
4954  gap_ct = gap;
4955  thidx = n_th;
4956  if (update_master_only == 1)
4957  thidx = 1;
4958  for (f = 0; f < thidx; f++) {
4959  kmp_info_t *th = team->t.t_threads[f];
4960  KMP_DEBUG_ASSERT(th != NULL);
4961 
4962  int fplace = place, nplace = place;
4963  s_count = 1;
4964  while (s_count < S) {
4965  if (place == last_place) {
4966  place = first_place;
4967  } else if (place == (num_masks - 1)) {
4968  place = 0;
4969  } else {
4970  place++;
4971  }
4972  s_count++;
4973  }
4974  if (rem && (gap_ct == gap)) {
4975  if (place == last_place) {
4976  place = first_place;
4977  } else if (place == (num_masks - 1)) {
4978  place = 0;
4979  } else {
4980  place++;
4981  }
4982  rem--;
4983  gap_ct = 0;
4984  }
4985  __kmp_set_thread_place(team, th, fplace, place, nplace);
4986  gap_ct++;
4987 
4988  if (place == last_place) {
4989  place = first_place;
4990  } else if (place == (num_masks - 1)) {
4991  place = 0;
4992  } else {
4993  place++;
4994  }
4995 
4996  KA_TRACE(100,
4997  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4998  "partition = [%d,%d], num_masks: %u\n",
4999  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5000  f, th->th.th_new_place, th->th.th_first_place,
5001  th->th.th_last_place, num_masks));
5002  }
5003  } else {
5004  /* Given a uniform space of available computation places, we can create
5005  T partitions of roughly round(P/T) places each and put the threads into
5006  the first place of each partition. */
5007  double current = static_cast<double>(masters_place);
5008  double spacing =
5009  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5010  int first, last;
5011  kmp_info_t *th;
5012 
5013  thidx = n_th + 1;
5014  if (update_master_only == 1)
5015  thidx = 1;
5016  for (f = 0; f < thidx; f++) {
5017  first = static_cast<int>(current);
5018  last = static_cast<int>(current + spacing) - 1;
5019  KMP_DEBUG_ASSERT(last >= first);
5020  if (first >= n_places) {
5021  if (masters_place) {
5022  first -= n_places;
5023  last -= n_places;
5024  if (first == (masters_place + 1)) {
5025  KMP_DEBUG_ASSERT(f == n_th);
5026  first--;
5027  }
5028  if (last == masters_place) {
5029  KMP_DEBUG_ASSERT(f == (n_th - 1));
5030  last--;
5031  }
5032  } else {
5033  KMP_DEBUG_ASSERT(f == n_th);
5034  first = 0;
5035  last = 0;
5036  }
5037  }
5038  if (last >= n_places) {
5039  last = (n_places - 1);
5040  }
5041  place = first;
5042  current += spacing;
5043  if (f < n_th) {
5044  KMP_DEBUG_ASSERT(0 <= first);
5045  KMP_DEBUG_ASSERT(n_places > first);
5046  KMP_DEBUG_ASSERT(0 <= last);
5047  KMP_DEBUG_ASSERT(n_places > last);
5048  KMP_DEBUG_ASSERT(last_place >= first_place);
5049  th = team->t.t_threads[f];
5050  KMP_DEBUG_ASSERT(th);
5051  __kmp_set_thread_place(team, th, first, last, place);
5052  KA_TRACE(100,
5053  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5054  "partition = [%d,%d], spacing = %.4f\n",
5055  __kmp_gtid_from_thread(team->t.t_threads[f]),
5056  team->t.t_id, f, th->th.th_new_place,
5057  th->th.th_first_place, th->th.th_last_place, spacing));
5058  }
5059  }
5060  }
5061  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5062  } else {
5063  int S, rem, gap, s_count;
5064  S = n_th / n_places;
5065  s_count = 0;
5066  rem = n_th - (S * n_places);
5067  gap = rem > 0 ? n_places / rem : n_places;
5068  int place = masters_place;
5069  int gap_ct = gap;
5070  thidx = n_th;
5071  if (update_master_only == 1)
5072  thidx = 1;
5073  for (f = 0; f < thidx; f++) {
5074  kmp_info_t *th = team->t.t_threads[f];
5075  KMP_DEBUG_ASSERT(th != NULL);
5076 
5077  __kmp_set_thread_place(team, th, place, place, place);
5078  s_count++;
5079 
5080  if ((s_count == S) && rem && (gap_ct == gap)) {
5081  // do nothing, add an extra thread to place on next iteration
5082  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5083  // we added an extra thread to this place; move on to next place
5084  if (place == last_place) {
5085  place = first_place;
5086  } else if (place == (num_masks - 1)) {
5087  place = 0;
5088  } else {
5089  place++;
5090  }
5091  s_count = 0;
5092  gap_ct = 1;
5093  rem--;
5094  } else if (s_count == S) { // place is full; don't add extra thread
5095  if (place == last_place) {
5096  place = first_place;
5097  } else if (place == (num_masks - 1)) {
5098  place = 0;
5099  } else {
5100  place++;
5101  }
5102  gap_ct++;
5103  s_count = 0;
5104  }
5105 
5106  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5107  "partition = [%d,%d]\n",
5108  __kmp_gtid_from_thread(team->t.t_threads[f]),
5109  team->t.t_id, f, th->th.th_new_place,
5110  th->th.th_first_place, th->th.th_last_place));
5111  }
5112  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5113  }
5114  } break;
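// For the spread policy with n_th <= n_places and a partition covering every
// mask, the code above slices the place space into roughly equal
// sub-partitions using a floating-point spacing of (n_places + 1) / n_th. The
// sketch below models only that slicing, assuming the master sits at place 0
// and ignoring the wrap-around corrections; spread_slices is a hypothetical
// name.
#if 0 // illustrative sketch only; would build as its own small program
#include <cstdio>

static void spread_slices(int n_th, int n_places) {
  double current = 0.0; // assume the master thread is bound to place 0
  double spacing = (double)(n_places + 1) / (double)n_th;
  for (int f = 0; f < n_th; ++f) {
    int first = (int)current;
    int last = (int)(current + spacing) - 1;
    if (last >= n_places)
      last = n_places - 1; // clamp the final slice
    std::printf("thread %d -> places [%d,%d]\n", f, first, last);
    current += spacing;
  }
}

int main() {
  spread_slices(3, 8); // slices [0,2], [3,5], [6,7]
  return 0;
}
#endif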
5115 
5116  default:
5117  break;
5118  }
5119 
5120  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5121 }
5122 
5123 #endif // KMP_AFFINITY_SUPPORTED
5124 
5125 /* allocate a new team data structure to use. take one off of the free pool if
5126  available */
5127 kmp_team_t *
5128 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5129 #if OMPT_SUPPORT
5130  ompt_data_t ompt_parallel_data,
5131 #endif
5132  kmp_proc_bind_t new_proc_bind,
5133  kmp_internal_control_t *new_icvs,
5134  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5135  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5136  int f;
5137  kmp_team_t *team;
5138  int use_hot_team = !root->r.r_active;
5139  int level = 0;
5140  int do_place_partition = 1;
5141 
5142  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5143  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5144  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5145  KMP_MB();
5146 
5147 #if KMP_NESTED_HOT_TEAMS
5148  kmp_hot_team_ptr_t *hot_teams;
5149  if (master) {
5150  team = master->th.th_team;
5151  level = team->t.t_active_level;
5152  if (master->th.th_teams_microtask) { // in teams construct?
5153  if (master->th.th_teams_size.nteams > 1 &&
5154  ( // #teams > 1
5155  team->t.t_pkfn ==
5156  (microtask_t)__kmp_teams_master || // inner fork of the teams
5157  master->th.th_teams_level <
5158  team->t.t_level)) { // or nested parallel inside the teams
5159  ++level; // do not increment if #teams==1 or for the outer fork of the
5160  // teams construct; increment otherwise
5161  }
5162  // Do not perform the place partition for the inner fork of a teams construct;
5163  // wait until a nested parallel region is encountered inside the teams construct
5164  if ((master->th.th_teams_size.nteams == 1 &&
5165  master->th.th_teams_level >= team->t.t_level) ||
5166  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5167  do_place_partition = 0;
5168  }
5169  hot_teams = master->th.th_hot_teams;
5170  if (level < __kmp_hot_teams_max_level && hot_teams &&
5171  hot_teams[level].hot_team) {
5172  // hot team has already been allocated for given level
5173  use_hot_team = 1;
5174  } else {
5175  use_hot_team = 0;
5176  }
5177  } else {
5178  // check we won't access uninitialized hot_teams, just in case
5179  KMP_DEBUG_ASSERT(new_nproc == 1);
5180  }
5181 #endif
5182  // Optimization to use a "hot" team
5183  if (use_hot_team && new_nproc > 1) {
5184  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5185 #if KMP_NESTED_HOT_TEAMS
5186  team = hot_teams[level].hot_team;
5187 #else
5188  team = root->r.r_hot_team;
5189 #endif
5190 #if KMP_DEBUG
5191  if (__kmp_tasking_mode != tskm_immediate_exec) {
5192  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5193  "task_team[1] = %p before reinit\n",
5194  team->t.t_task_team[0], team->t.t_task_team[1]));
5195  }
5196 #endif
5197 
5198  if (team->t.t_nproc != new_nproc &&
5199  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5200  // Distributed barrier may need a resize
5201  int old_nthr = team->t.t_nproc;
5202  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5203  }
5204 
5205  // If not doing the place partition, then reset the team's proc bind
5206  // to indicate that partitioning of all threads still needs to take place
5207  if (do_place_partition == 0)
5208  team->t.t_proc_bind = proc_bind_default;
5209  // Has the number of threads changed?
5210  /* Let's assume the most common case is that the number of threads is
5211  unchanged, and put that case first. */
5212  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5213  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5214  // This case can mean that omp_set_num_threads() was called and the hot
5215  // team size was already reduced, so we check the special flag
5216  if (team->t.t_size_changed == -1) {
5217  team->t.t_size_changed = 1;
5218  } else {
5219  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5220  }
5221 
5222  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5223  kmp_r_sched_t new_sched = new_icvs->sched;
5224  // set primary thread's schedule as new run-time schedule
5225  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5226 
5227  __kmp_reinitialize_team(team, new_icvs,
5228  root->r.r_uber_thread->th.th_ident);
5229 
5230  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5231  team->t.t_threads[0], team));
5232  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5233 
5234 #if KMP_AFFINITY_SUPPORTED
5235  if ((team->t.t_size_changed == 0) &&
5236  (team->t.t_proc_bind == new_proc_bind)) {
5237  if (new_proc_bind == proc_bind_spread) {
5238  if (do_place_partition) {
5239  // add flag to update only master for spread
5240  __kmp_partition_places(team, 1);
5241  }
5242  }
5243  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5244  "proc_bind = %d, partition = [%d,%d]\n",
5245  team->t.t_id, new_proc_bind, team->t.t_first_place,
5246  team->t.t_last_place));
5247  } else {
5248  if (do_place_partition) {
5249  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5250  __kmp_partition_places(team);
5251  }
5252  }
5253 #else
5254  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5255 #endif /* KMP_AFFINITY_SUPPORTED */
5256  } else if (team->t.t_nproc > new_nproc) {
5257  KA_TRACE(20,
5258  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5259  new_nproc));
5260 
5261  team->t.t_size_changed = 1;
5262  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5263  // Barrier size already reduced earlier in this function
5264  // Activate team threads via th_used_in_team
5265  __kmp_add_threads_to_team(team, new_nproc);
5266  }
5267 #if KMP_NESTED_HOT_TEAMS
5268  if (__kmp_hot_teams_mode == 0) {
5269  // AC: saved number of threads should correspond to team's value in this
5270  // mode, can be bigger in mode 1, when hot team has threads in reserve
5271  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5272  hot_teams[level].hot_team_nth = new_nproc;
5273 #endif // KMP_NESTED_HOT_TEAMS
5274  /* release the extra threads we don't need any more */
5275  for (f = new_nproc; f < team->t.t_nproc; f++) {
5276  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5277  if (__kmp_tasking_mode != tskm_immediate_exec) {
5278  // When decreasing team size, threads no longer in the team should
5279  // unref task team.
5280  team->t.t_threads[f]->th.th_task_team = NULL;
5281  }
5282  __kmp_free_thread(team->t.t_threads[f]);
5283  team->t.t_threads[f] = NULL;
5284  }
5285 #if KMP_NESTED_HOT_TEAMS
5286  } // (__kmp_hot_teams_mode == 0)
5287  else {
5288  // When keeping extra threads in team, switch threads to wait on own
5289  // b_go flag
5290  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5291  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5293  for (int b = 0; b < bs_last_barrier; ++b) {
5294  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5295  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5296  }
5297  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5298  }
5299  }
5300  }
5301 #endif // KMP_NESTED_HOT_TEAMS
5302  team->t.t_nproc = new_nproc;
5303  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5304  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5305  __kmp_reinitialize_team(team, new_icvs,
5306  root->r.r_uber_thread->th.th_ident);
5307 
5308  // Update remaining threads
5309  for (f = 0; f < new_nproc; ++f) {
5310  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5311  }
5312 
5313  // restore the current task state of the primary thread: should be the
5314  // implicit task
5315  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5316  team->t.t_threads[0], team));
5317 
5318  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5319 
5320 #ifdef KMP_DEBUG
5321  for (f = 0; f < team->t.t_nproc; f++) {
5322  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5323  team->t.t_threads[f]->th.th_team_nproc ==
5324  team->t.t_nproc);
5325  }
5326 #endif
5327 
5328  if (do_place_partition) {
5329  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5330 #if KMP_AFFINITY_SUPPORTED
5331  __kmp_partition_places(team);
5332 #endif
5333  }
5334  } else { // team->t.t_nproc < new_nproc
5335 
5336  KA_TRACE(20,
5337  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5338  new_nproc));
5339  int old_nproc = team->t.t_nproc; // save old value and use to update only
5340  team->t.t_size_changed = 1;
5341 
5342 #if KMP_NESTED_HOT_TEAMS
5343  int avail_threads = hot_teams[level].hot_team_nth;
5344  if (new_nproc < avail_threads)
5345  avail_threads = new_nproc;
5346  kmp_info_t **other_threads = team->t.t_threads;
5347  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5348  // Adjust barrier data of reserved threads (if any) of the team
5349  // Other data will be set in __kmp_initialize_info() below.
5350  int b;
5351  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5352  for (b = 0; b < bs_last_barrier; ++b) {
5353  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5354  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5355 #if USE_DEBUGGER
5356  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5357 #endif
5358  }
5359  }
5360  if (hot_teams[level].hot_team_nth >= new_nproc) {
5361  // We have all needed threads in reserve, so there is no need to allocate any.
5362  // This is only possible in mode 1; mode 0 cannot have reserved threads.
5363  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5364  team->t.t_nproc = new_nproc; // just get reserved threads involved
5365  } else {
5366  // We may have some threads in reserve, but not enough;
5367  // get reserved threads involved if any.
5368  team->t.t_nproc = hot_teams[level].hot_team_nth;
5369  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5370 #endif // KMP_NESTED_HOT_TEAMS
5371  if (team->t.t_max_nproc < new_nproc) {
5372  /* reallocate larger arrays */
5373  __kmp_reallocate_team_arrays(team, new_nproc);
5374  __kmp_reinitialize_team(team, new_icvs, NULL);
5375  }
5376 
5377 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5378  /* Temporarily set the full mask for the primary thread before creating the
5379  workers. The reason is that workers inherit the affinity from the
5380  primary thread, so if a lot of workers are created quickly on a single
5381  core, they don't get a chance to set their own affinity for
5382  a long time. */
5383  kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5384 #endif
5385 
5386  /* allocate new threads for the hot team */
5387  for (f = team->t.t_nproc; f < new_nproc; f++) {
5388  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5389  KMP_DEBUG_ASSERT(new_worker);
5390  team->t.t_threads[f] = new_worker;
5391 
5392  KA_TRACE(20,
5393  ("__kmp_allocate_team: team %d init T#%d arrived: "
5394  "join=%llu, plain=%llu\n",
5395  team->t.t_id, __kmp_gtid_from_tid(f, team),
5396  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5397  team->t.t_bar[bs_plain_barrier].b_arrived));
5398 
5399  { // Initialize barrier data for new threads.
5400  int b;
5401  kmp_balign_t *balign = new_worker->th.th_bar;
5402  for (b = 0; b < bs_last_barrier; ++b) {
5403  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5404  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5405  KMP_BARRIER_PARENT_FLAG);
5406 #if USE_DEBUGGER
5407  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5408 #endif
5409  }
5410  }
5411  }
5412 
5413 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5414  /* Restore initial primary thread's affinity mask */
5415  new_temp_affinity.restore();
5416 #endif
5417 #if KMP_NESTED_HOT_TEAMS
5418  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5419 #endif // KMP_NESTED_HOT_TEAMS
5420  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5421  // Barrier size already increased earlier in this function
5422  // Activate team threads via th_used_in_team
5423  __kmp_add_threads_to_team(team, new_nproc);
5424  }
5425  /* make sure everyone is synchronized */
5426  // new threads are initialized below
5427  __kmp_initialize_team(team, new_nproc, new_icvs,
5428  root->r.r_uber_thread->th.th_ident);
5429 
5430  /* reinitialize the threads */
5431  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5432  for (f = 0; f < team->t.t_nproc; ++f)
5433  __kmp_initialize_info(team->t.t_threads[f], team, f,
5434  __kmp_gtid_from_tid(f, team));
5435 
5436  // set th_task_state for new threads in hot team with older thread's state
5437  kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5438  for (f = old_nproc; f < team->t.t_nproc; ++f)
5439  team->t.t_threads[f]->th.th_task_state = old_state;
5440 
5441 #ifdef KMP_DEBUG
5442  for (f = 0; f < team->t.t_nproc; ++f) {
5443  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5444  team->t.t_threads[f]->th.th_team_nproc ==
5445  team->t.t_nproc);
5446  }
5447 #endif
5448 
5449  if (do_place_partition) {
5450  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5451 #if KMP_AFFINITY_SUPPORTED
5452  __kmp_partition_places(team);
5453 #endif
5454  }
5455  } // Check changes in number of threads
5456 
5457  kmp_info_t *master = team->t.t_threads[0];
5458  if (master->th.th_teams_microtask) {
5459  for (f = 1; f < new_nproc; ++f) {
5460  // propagate teams construct specific info to workers
5461  kmp_info_t *thr = team->t.t_threads[f];
5462  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5463  thr->th.th_teams_level = master->th.th_teams_level;
5464  thr->th.th_teams_size = master->th.th_teams_size;
5465  }
5466  }
5467 #if KMP_NESTED_HOT_TEAMS
5468  if (level) {
5469  // Sync barrier state for nested hot teams, not needed for outermost hot
5470  // team.
5471  for (f = 1; f < new_nproc; ++f) {
5472  kmp_info_t *thr = team->t.t_threads[f];
5473  int b;
5474  kmp_balign_t *balign = thr->th.th_bar;
5475  for (b = 0; b < bs_last_barrier; ++b) {
5476  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5477  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5478 #if USE_DEBUGGER
5479  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5480 #endif
5481  }
5482  }
5483  }
5484 #endif // KMP_NESTED_HOT_TEAMS
5485 
5486  /* reallocate space for arguments if necessary */
5487  __kmp_alloc_argv_entries(argc, team, TRUE);
5488  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5489  // The hot team re-uses the previous task team,
5490  // if untouched during the previous release->gather phase.
5491 
5492  KF_TRACE(10, (" hot_team = %p\n", team));
5493 
5494 #if KMP_DEBUG
5495  if (__kmp_tasking_mode != tskm_immediate_exec) {
5496  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5497  "task_team[1] = %p after reinit\n",
5498  team->t.t_task_team[0], team->t.t_task_team[1]));
5499  }
5500 #endif
5501 
5502 #if OMPT_SUPPORT
5503  __ompt_team_assign_id(team, ompt_parallel_data);
5504 #endif
5505 
5506  KMP_MB();
5507 
5508  return team;
5509  }
5510 
5511  /* next, let's try to take one from the team pool */
5512  KMP_MB();
5513  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5514  /* TODO: consider resizing undersized teams instead of reaping them, now
5515  that we have a resizing mechanism */
5516  if (team->t.t_max_nproc >= max_nproc) {
5517  /* take this team from the team pool */
5518  __kmp_team_pool = team->t.t_next_pool;
5519 
5520  if (max_nproc > 1 &&
5521  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5522  if (!team->t.b) { // Allocate barrier structure
5523  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5524  }
5525  }
5526 
5527  /* setup the team for fresh use */
5528  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5529 
5530  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5531  "task_team[1] %p to NULL\n",
5532  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5533  team->t.t_task_team[0] = NULL;
5534  team->t.t_task_team[1] = NULL;
5535 
5536  /* reallocate space for arguments if necessary */
5537  __kmp_alloc_argv_entries(argc, team, TRUE);
5538  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5539 
5540  KA_TRACE(
5541  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5542  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5543  { // Initialize barrier data.
5544  int b;
5545  for (b = 0; b < bs_last_barrier; ++b) {
5546  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5547 #if USE_DEBUGGER
5548  team->t.t_bar[b].b_master_arrived = 0;
5549  team->t.t_bar[b].b_team_arrived = 0;
5550 #endif
5551  }
5552  }
5553 
5554  team->t.t_proc_bind = new_proc_bind;
5555 
5556  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5557  team->t.t_id));
5558 
5559 #if OMPT_SUPPORT
5560  __ompt_team_assign_id(team, ompt_parallel_data);
5561 #endif
5562 
5563  KMP_MB();
5564 
5565  return team;
5566  }
5567 
5568  /* reap team if it is too small, then loop back and check the next one */
5569  // Not sure if this is wise, but it will be redone during the hot-teams
5570  // rewrite.
5571  /* TODO: Use technique to find the right size hot-team, don't reap them */
5572  team = __kmp_reap_team(team);
5573  __kmp_team_pool = team;
5574  }
5575 
5576  /* nothing available in the pool, no matter, make a new team! */
5577  KMP_MB();
5578  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5579 
5580  /* and set it up */
5581  team->t.t_max_nproc = max_nproc;
5582  if (max_nproc > 1 &&
5583  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5584  // Allocate barrier structure
5585  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5586  }
5587 
5588  /* NOTE: for some reason, allocating one big buffer and dividing it up seems
5589  to really hurt performance on the P4, so let's not use that approach */
5590  __kmp_allocate_team_arrays(team, max_nproc);
5591 
5592  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5593  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5594 
5595  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5596  "%p to NULL\n",
5597  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5598  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5599  // memory, no need to duplicate
5600  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5601  // memory, no need to duplicate
5602 
5603  if (__kmp_storage_map) {
5604  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5605  }
5606 
5607  /* allocate space for arguments */
5608  __kmp_alloc_argv_entries(argc, team, FALSE);
5609  team->t.t_argc = argc;
5610 
5611  KA_TRACE(20,
5612  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5613  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5614  { // Initialize barrier data.
5615  int b;
5616  for (b = 0; b < bs_last_barrier; ++b) {
5617  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5618 #if USE_DEBUGGER
5619  team->t.t_bar[b].b_master_arrived = 0;
5620  team->t.t_bar[b].b_team_arrived = 0;
5621 #endif
5622  }
5623  }
5624 
5625  team->t.t_proc_bind = new_proc_bind;
5626 
5627 #if OMPT_SUPPORT
5628  __ompt_team_assign_id(team, ompt_parallel_data);
5629  team->t.ompt_serialized_team_info = NULL;
5630 #endif
5631 
5632  KMP_MB();
5633 
5634  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5635  team->t.t_id));
5636 
5637  return team;
5638 }
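// __kmp_allocate_team above tries three sources in order: the hot team (reused
// and resized in place), the team pool, and finally a fresh allocation. The
// sketch below condenses just that decision order, ignoring nested hot teams
// and the distributed-barrier details; team_source and pick_team are
// hypothetical names, not runtime entities.
#if 0 // illustrative sketch only; would build as its own small program
#include <cstdio>

enum class team_source { hot_team, team_pool, fresh_allocation };

static team_source pick_team(bool root_active, int new_nproc, int max_nproc,
                             int pooled_team_max_nproc /* 0 if pool empty */) {
  // Hot-team reuse: only when the root is not already inside an active
  // parallel region and more than one thread is requested.
  if (!root_active && new_nproc > 1)
    return team_source::hot_team; // resized/reinitialized in place
  // Otherwise take a pooled team whose arrays are big enough for the request;
  // undersized pool teams are reaped rather than resized.
  if (pooled_team_max_nproc >= max_nproc)
    return team_source::team_pool;
  return team_source::fresh_allocation; // allocate and initialize a new team
}

int main() {
  std::printf("%d %d %d\n",
              (int)pick_team(/*root_active=*/false, 4, 4, 0), // 0: hot team
              (int)pick_team(/*root_active=*/true, 4, 4, 8),  // 1: team pool
              (int)pick_team(/*root_active=*/true, 4, 4, 0)); // 2: fresh
  return 0;
}
#endif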
5639 
5640 /* TODO implement hot-teams at all levels */
5641 /* TODO implement lazy thread release on demand (disband request) */
5642 
5643 /* Free the team: return it to the team pool and release all the threads
5644  * associated with it */
5645 void __kmp_free_team(kmp_root_t *root,
5646  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5647  int f;
5648  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5649  team->t.t_id));
5650 
5651  /* verify state */
5652  KMP_DEBUG_ASSERT(root);
5653  KMP_DEBUG_ASSERT(team);
5654  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5655  KMP_DEBUG_ASSERT(team->t.t_threads);
5656 
5657  int use_hot_team = team == root->r.r_hot_team;
5658 #if KMP_NESTED_HOT_TEAMS
5659  int level;
5660  if (master) {
5661  level = team->t.t_active_level - 1;
5662  if (master->th.th_teams_microtask) { // in teams construct?
5663  if (master->th.th_teams_size.nteams > 1) {
5664  ++level; // level was not increased in teams construct for
5665  // team_of_masters
5666  }
5667  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5668  master->th.th_teams_level == team->t.t_level) {
5669  ++level; // level was not increased in teams construct for
5670  // team_of_workers before the parallel
5671  } // team->t.t_level will be increased inside parallel
5672  }
5673 #if KMP_DEBUG
5674  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5675 #endif
5676  if (level < __kmp_hot_teams_max_level) {
5677  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5678  use_hot_team = 1;
5679  }
5680  }
5681 #endif // KMP_NESTED_HOT_TEAMS
5682 
5683  /* team is done working */
5684  TCW_SYNC_PTR(team->t.t_pkfn,
5685  NULL); // Important for Debugging Support Library.
5686 #if KMP_OS_WINDOWS
5687  team->t.t_copyin_counter = 0; // init counter for possible reuse
5688 #endif
5689  // Do not reset pointer to parent team to NULL for hot teams.
5690 
5691  /* if we are non-hot team, release our threads */
5692  if (!use_hot_team) {
5693  if (__kmp_tasking_mode != tskm_immediate_exec) {
5694  // Wait for threads to reach reapable state
5695  for (f = 1; f < team->t.t_nproc; ++f) {
5696  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5697  kmp_info_t *th = team->t.t_threads[f];
5698  volatile kmp_uint32 *state = &th->th.th_reap_state;
5699  while (*state != KMP_SAFE_TO_REAP) {
5700 #if KMP_OS_WINDOWS
5701  // On Windows a thread can be killed at any time, check this
5702  DWORD ecode;
5703  if (!__kmp_is_thread_alive(th, &ecode)) {
5704  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5705  break;
5706  }
5707 #endif
5708  // first check if thread is sleeping
5709  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5710  if (fl.is_sleeping())
5711  fl.resume(__kmp_gtid_from_thread(th));
5712  KMP_CPU_PAUSE();
5713  }
5714  }
5715 
5716  // Delete task teams
5717  int tt_idx;
5718  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5719  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5720  if (task_team != NULL) {
5721  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5722  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5723  team->t.t_threads[f]->th.th_task_team = NULL;
5724  }
5725  KA_TRACE(
5726  20,
5727  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5728  __kmp_get_gtid(), task_team, team->t.t_id));
5729 #if KMP_NESTED_HOT_TEAMS
5730  __kmp_free_task_team(master, task_team);
5731 #endif
5732  team->t.t_task_team[tt_idx] = NULL;
5733  }
5734  }
5735  }
5736 
5737  // Reset pointer to parent team only for non-hot teams.
5738  team->t.t_parent = NULL;
5739  team->t.t_level = 0;
5740  team->t.t_active_level = 0;
5741 
5742  /* free the worker threads */
5743  for (f = 1; f < team->t.t_nproc; ++f) {
5744  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5745  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5746  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5747  1, 2);
5748  }
5749  __kmp_free_thread(team->t.t_threads[f]);
5750  }
5751 
5752  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5753  if (team->t.b) {
5754  // wake up thread at old location
5755  team->t.b->go_release();
5756  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5757  for (f = 1; f < team->t.t_nproc; ++f) {
5758  if (team->t.b->sleep[f].sleep) {
5759  __kmp_atomic_resume_64(
5760  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5761  (kmp_atomic_flag_64<> *)NULL);
5762  }
5763  }
5764  }
5765  // Wait for threads to be removed from team
5766  for (int f = 1; f < team->t.t_nproc; ++f) {
5767  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5768  KMP_CPU_PAUSE();
5769  }
5770  }
5771  }
5772 
5773  for (f = 1; f < team->t.t_nproc; ++f) {
5774  team->t.t_threads[f] = NULL;
5775  }
5776 
5777  if (team->t.t_max_nproc > 1 &&
5778  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5779  distributedBarrier::deallocate(team->t.b);
5780  team->t.b = NULL;
5781  }
5782  /* put the team back in the team pool */
5783  /* TODO limit size of team pool, call reap_team if pool too large */
5784  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5785  __kmp_team_pool = (volatile kmp_team_t *)team;
5786  } else { // Check if team was created for primary threads in teams construct
5787  // See if first worker is a CG root
5788  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5789  team->t.t_threads[1]->th.th_cg_roots);
5790  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5791  // Clean up the CG root nodes on workers so that this team can be re-used
5792  for (f = 1; f < team->t.t_nproc; ++f) {
5793  kmp_info_t *thr = team->t.t_threads[f];
5794  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5795  thr->th.th_cg_roots->cg_root == thr);
5796  // Pop current CG root off list
5797  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5798  thr->th.th_cg_roots = tmp->up;
5799  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5800  " up to node %p. cg_nthreads was %d\n",
5801  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5802  int i = tmp->cg_nthreads--;
5803  if (i == 1) {
5804  __kmp_free(tmp); // free CG if we are the last thread in it
5805  }
5806  // Restore current task's thread_limit from CG root
5807  if (thr->th.th_cg_roots)
5808  thr->th.th_current_task->td_icvs.thread_limit =
5809  thr->th.th_cg_roots->cg_thread_limit;
5810  }
5811  }
5812  }
5813 
5814  KMP_MB();
5815 }
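// The contention-group bookkeeping above treats th_cg_roots as a small
// intrusive stack with a thread count per node: popping decrements
// cg_nthreads, the node is freed when the last thread leaves, and the
// effective thread-limit is restored from the node underneath. The sketch
// below models only that pop operation; cg_node and pop_cg_root are
// hypothetical names, not runtime entities.
#if 0 // illustrative sketch only; would build as its own small program
#include <cstdio>

struct cg_node {       // hypothetical stand-in for kmp_cg_root_t
  cg_node *up;         // next (outer) contention group
  int cg_nthreads;     // threads still referencing this node
  int cg_thread_limit; // thread-limit to restore when popping back to it
};

// Pop the current CG root off a thread's list, free it when the last thread
// leaves, and return the thread-limit of the enclosing group (or the current
// limit if there is none).
static int pop_cg_root(cg_node *&head, int current_limit) {
  cg_node *tmp = head;
  head = tmp->up;
  if (--tmp->cg_nthreads == 0)
    delete tmp; // last thread in this contention group
  return head ? head->cg_thread_limit : current_limit;
}

int main() {
  cg_node *outer = new cg_node{nullptr, 1, 8};
  cg_node *inner = new cg_node{outer, 1, 2};
  cg_node *head = inner;
  int limit = pop_cg_root(head, 2);
  std::printf("restored thread-limit: %d\n", limit); // prints 8
  delete outer;
  return 0;
}
#endif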
5816 
5817 /* reap the team. destroy it, reclaim all its resources and free its memory */
5818 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5819  kmp_team_t *next_pool = team->t.t_next_pool;
5820 
5821  KMP_DEBUG_ASSERT(team);
5822  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5823  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5824  KMP_DEBUG_ASSERT(team->t.t_threads);
5825  KMP_DEBUG_ASSERT(team->t.t_argv);
5826 
5827  /* TODO clean the threads that are a part of this? */
5828 
5829  /* free stuff */
5830  __kmp_free_team_arrays(team);
5831  if (team->t.t_argv != &team->t.t_inline_argv[0])
5832  __kmp_free((void *)team->t.t_argv);
5833  __kmp_free(team);
5834 
5835  KMP_MB();
5836  return next_pool;
5837 }
5838 
5839 // Free the thread. Don't reap it, just place it on the pool of available
5840 // threads.
5841 //
5842 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5843 // binding for the affinity mechanism to be useful.
5844 //
5845 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5846 // However, we want to avoid the potential performance problem of always
5847 // scanning through the list to find the correct point at which to insert
5848 // the thread (potentially N**2 behavior). To do this we keep track of the
5849 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5850 // With single-level parallelism, threads will always be added to the tail
5851 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5852 // parallelism, all bets are off and we may need to scan through the entire
5853 // free list.
5854 //
5855 // This change also has a potentially large performance benefit for some
5856 // applications. Previously, as threads were freed from the hot team, they
5857 // would be placed back on the free list in inverse order. If the hot team
5858 // grew back to its original size, the freed threads would be placed back on
5859 // the hot team in reverse order, which could cause bad cache locality in
5860 // programs where the size of the hot team regularly grew and shrank. (A
5861 // simplified sketch of the sorted pool insertion appears after the function.)
5862 //
5863 // Now, for single-level parallelism, the OMP tid is always == gtid.
5864 void __kmp_free_thread(kmp_info_t *this_th) {
5865  int gtid;
5866  kmp_info_t **scan;
5867 
5868  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5869  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5870 
5871  KMP_DEBUG_ASSERT(this_th);
5872 
5873  // When moving the thread to the pool, switch it to wait on its own b_go flag
5874  // and clear its team pointer (NULL team).
5875  int b;
5876  kmp_balign_t *balign = this_th->th.th_bar;
5877  for (b = 0; b < bs_last_barrier; ++b) {
5878  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5879  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5880  balign[b].bb.team = NULL;
5881  balign[b].bb.leaf_kids = 0;
5882  }
5883  this_th->th.th_task_state = 0;
5884  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5885 
5886  /* put thread back on the free pool */
5887  TCW_PTR(this_th->th.th_team, NULL);
5888  TCW_PTR(this_th->th.th_root, NULL);
5889  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5890 
5891  while (this_th->th.th_cg_roots) {
5892  this_th->th.th_cg_roots->cg_nthreads--;
5893  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5894  " %p of thread %p to %d\n",
5895  this_th, this_th->th.th_cg_roots,
5896  this_th->th.th_cg_roots->cg_root,
5897  this_th->th.th_cg_roots->cg_nthreads));
5898  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5899  if (tmp->cg_root == this_th) { // Thread is a cg_root
5900  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5901  KA_TRACE(
5902  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5903  this_th->th.th_cg_roots = tmp->up;
5904  __kmp_free(tmp);
5905  } else { // Worker thread
5906  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5907  __kmp_free(tmp);
5908  }
5909  this_th->th.th_cg_roots = NULL;
5910  break;
5911  }
5912  }
5913 
5914  /* If the implicit task assigned to this thread can be used by other threads,
5915  * multiple threads may share the data and try to free the task in
5916  * __kmp_reap_thread at exit. This duplicate use of the task data is more
5917  * likely when the hot team is disabled, but it can occur even when
5918  * the hot team is enabled */
5919  __kmp_free_implicit_task(this_th);
5920  this_th->th.th_current_task = NULL;
5921 
5922  // If the __kmp_thread_pool_insert_pt is already past the new insert
5923  // point, then we need to re-scan the entire list.
5924  gtid = this_th->th.th_info.ds.ds_gtid;
5925  if (__kmp_thread_pool_insert_pt != NULL) {
5926  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5927  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5928  __kmp_thread_pool_insert_pt = NULL;
5929  }
5930  }
5931 
5932  // Scan down the list to find the place to insert the thread.
5933  // scan is the address of a link in the list, possibly the address of
5934  // __kmp_thread_pool itself.
5935  //
5936  // In the absence of nested parallelism, the for loop will have 0 iterations.
5937  if (__kmp_thread_pool_insert_pt != NULL) {
5938  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5939  } else {
5940  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5941  }
5942  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5943  scan = &((*scan)->th.th_next_pool))
5944  ;
5945 
5946  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5947  // to its address.
5948  TCW_PTR(this_th->th.th_next_pool, *scan);
5949  __kmp_thread_pool_insert_pt = *scan = this_th;
5950  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5951  (this_th->th.th_info.ds.ds_gtid <
5952  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5953  TCW_4(this_th->th.th_in_pool, TRUE);
5954  __kmp_suspend_initialize_thread(this_th);
5955  __kmp_lock_suspend_mx(this_th);
5956  if (this_th->th.th_active == TRUE) {
5957  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5958  this_th->th.th_active_in_pool = TRUE;
5959  }
5960 #if KMP_DEBUG
5961  else {
5962  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5963  }
5964 #endif
5965  __kmp_unlock_suspend_mx(this_th);
5966 
5967  TCW_4(__kmp_nth, __kmp_nth - 1);
5968 
5969 #ifdef KMP_ADJUST_BLOCKTIME
5970  /* Adjust blocktime back to user setting or default if necessary */
5971  /* Middle initialization might never have occurred */
5972  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5973  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5974  if (__kmp_nth <= __kmp_avail_proc) {
5975  __kmp_zero_bt = FALSE;
5976  }
5977  }
5978 #endif /* KMP_ADJUST_BLOCKTIME */
5979 
5980  KMP_MB();
5981 }
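// A standalone model of the sorted free-list insertion described before this
// function: the pool is a singly linked list kept sorted by gtid, and a cached
// insert point lets the common single-level case append in O(1) instead of
// rescanning from the head. Illustrative only; pool_node and insert_sorted are
// hypothetical names, not runtime entities.
#if 0 // illustrative sketch only; would build as its own small program
#include <cstdio>

struct pool_node { // hypothetical stand-in for a pooled kmp_info_t
  int gtid;
  pool_node *next;
};

static pool_node *pool_head = nullptr;      // models __kmp_thread_pool
static pool_node *pool_insert_pt = nullptr; // models __kmp_thread_pool_insert_pt

// Insert 'node' keeping the list sorted by gtid. Start scanning from the
// cached insert point when it is still before the new node; otherwise rescan
// from the head (the nested-parallelism case).
static void insert_sorted(pool_node *node) {
  if (pool_insert_pt && pool_insert_pt->gtid > node->gtid)
    pool_insert_pt = nullptr; // cached point is past the new position
  pool_node **scan = pool_insert_pt ? &pool_insert_pt->next : &pool_head;
  while (*scan && (*scan)->gtid < node->gtid)
    scan = &(*scan)->next;
  node->next = *scan;
  *scan = node;
  pool_insert_pt = node; // remember where the last insertion happened
}

int main() {
  pool_node a{3, nullptr}, b{5, nullptr}, c{4, nullptr};
  insert_sorted(&a);
  insert_sorted(&b); // appended right after the cached point, no full scan
  insert_sorted(&c); // cached point (gtid 5) is past gtid 4: rescan from head
  for (pool_node *p = pool_head; p; p = p->next)
    std::printf("%d ", p->gtid); // prints: 3 4 5
  std::printf("\n");
  return 0;
}
#endif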
5982 
5983 /* ------------------------------------------------------------------------ */
5984 
5985 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5986 #if OMP_PROFILING_SUPPORT
5987  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5988  // TODO: add a configuration option for time granularity
5989  if (ProfileTraceFile)
5990  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5991 #endif
5992 
5993  int gtid = this_thr->th.th_info.ds.ds_gtid;
5994  /* void *stack_data;*/
5995  kmp_team_t **volatile pteam;
5996 
5997  KMP_MB();
5998  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5999 
6000  if (__kmp_env_consistency_check) {
6001  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6002  }
6003 
6004 #if OMPD_SUPPORT
6005  if (ompd_state & OMPD_ENABLE_BP)
6006  ompd_bp_thread_begin();
6007 #endif
6008 
6009 #if OMPT_SUPPORT
6010  ompt_data_t *thread_data = nullptr;
6011  if (ompt_enabled.enabled) {
6012  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6013  *thread_data = ompt_data_none;
6014 
6015  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6016  this_thr->th.ompt_thread_info.wait_id = 0;
6017  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6018  this_thr->th.ompt_thread_info.parallel_flags = 0;
6019  if (ompt_enabled.ompt_callback_thread_begin) {
6020  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6021  ompt_thread_worker, thread_data);
6022  }
6023  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6024  }
6025 #endif
6026 
6027  /* This is the place where threads wait for work */
6028  while (!TCR_4(__kmp_global.g.g_done)) {
6029  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6030  KMP_MB();
6031 
6032  /* wait for work to do */
6033  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6034 
6035  /* No tid yet since not part of a team */
6036  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6037 
6038 #if OMPT_SUPPORT
6039  if (ompt_enabled.enabled) {
6040  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6041  }
6042 #endif
6043 
6044  pteam = &this_thr->th.th_team;
6045 
6046  /* have we been allocated? */
6047  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6048  /* we were just woken up, so run our new task */
6049  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6050  int rc;
6051  KA_TRACE(20,
6052  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6053  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6054  (*pteam)->t.t_pkfn));
6055 
6056  updateHWFPControl(*pteam);
6057 
6058 #if OMPT_SUPPORT
6059  if (ompt_enabled.enabled) {
6060  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6061  }
6062 #endif
6063 
6064  rc = (*pteam)->t.t_invoke(gtid);
6065  KMP_ASSERT(rc);
6066 
6067  KMP_MB();
6068  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6069  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6070  (*pteam)->t.t_pkfn));
6071  }
6072 #if OMPT_SUPPORT
6073  if (ompt_enabled.enabled) {
6074  /* no frame set while outside task */
6075  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6076 
6077  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6078  }
6079 #endif
6080  /* join barrier after parallel region */
6081  __kmp_join_barrier(gtid);
6082  }
6083  }
6084 
6085 #if OMPD_SUPPORT
6086  if (ompd_state & OMPD_ENABLE_BP)
6087  ompd_bp_thread_end();
6088 #endif
6089 
6090 #if OMPT_SUPPORT
6091  if (ompt_enabled.ompt_callback_thread_end) {
6092  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6093  }
6094 #endif
6095 
6096  this_thr->th.th_task_team = NULL;
6097  /* run the destructors for the threadprivate data for this thread */
6098  __kmp_common_destroy_gtid(gtid);
6099 
6100  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6101  KMP_MB();
6102 
6103 #if OMP_PROFILING_SUPPORT
6104  llvm::timeTraceProfilerFinishThread();
6105 #endif
6106  return this_thr;
6107 }
6108 
6109 /* ------------------------------------------------------------------------ */
6110 
6111 void __kmp_internal_end_dest(void *specific_gtid) {
6112  // Make sure no significant bits are lost
6113  int gtid;
6114  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6115 
6116  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6117  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
6118  * this is because 0 is reserved for the nothing-stored case */
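  /* Example of the encoding: a stored TLS value of 1 decodes to gtid 0, a
     value of N decodes to gtid N-1, and a stored 0 means no gtid was set. */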
6119 
6120  __kmp_internal_end_thread(gtid);
6121 }
6122 
6123 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6124 
6125 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6126  __kmp_internal_end_atexit();
6127 }
6128 
6129 #endif
6130 
6131 /* [Windows] josh: when the atexit handler is called, there may still be more
6132  than one thread alive */
6133 void __kmp_internal_end_atexit(void) {
6134  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6135  /* [Windows]
6136  josh: ideally, we want to completely shut down the library in this atexit
6137  handler, but stat code that depends on thread specific data for gtid fails
6138  because that data becomes unavailable at some point during the shutdown, so
6139  we call __kmp_internal_end_thread instead. We should eventually remove the
6140  dependency on __kmp_get_specific_gtid in the stat code and use
6141  __kmp_internal_end_library to cleanly shut down the library.
6142 
6143  // TODO: Can some of this comment about GVS be removed?
6144  I suspect that the offending stat code is executed when the calling thread
6145  tries to clean up a dead root thread's data structures, resulting in GVS
6146  code trying to close the GVS structures for that thread, but since the stat
6147  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6148  the calling thread is cleaning up itself instead of another thread, it gets
6149  confused. This happens because allowing a thread to unregister and clean up
6150  another thread is a recent modification for addressing an issue.
6151  Based on the current design (20050722), a thread may end up
6152  trying to unregister another thread only if thread death does not trigger
6153  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6154  thread specific data destructor function to detect thread death. For
6155  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6156  is nothing. Thus, the workaround is applicable only to the stat code in
6157  the Windows static library. */
6158  __kmp_internal_end_library(-1);
6159 #if KMP_OS_WINDOWS
6160  __kmp_close_console();
6161 #endif
6162 }
6163 
6164 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6165  // It is assumed __kmp_forkjoin_lock is acquired.
6166 
6167  int gtid;
6168 
6169  KMP_DEBUG_ASSERT(thread != NULL);
6170 
6171  gtid = thread->th.th_info.ds.ds_gtid;
6172 
6173  if (!is_root) {
6174  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6175  /* Assume the threads are at the fork barrier here */
6176  KA_TRACE(
6177  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6178  gtid));
6179  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6180  while (
6181  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6182  KMP_CPU_PAUSE();
6183  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6184  } else {
6185  /* Need release fence here to prevent seg faults for tree forkjoin
6186  barrier (GEH) */
6187  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6188  thread);
6189  __kmp_release_64(&flag);
6190  }
6191  }
6192 
6193  // Terminate OS thread.
6194  __kmp_reap_worker(thread);
6195 
6196  // The thread was killed asynchronously. If it was actively
6197  // spinning in the thread pool, decrement the global count.
6198  //
6199  // There is a small timing hole here - if the worker thread was just waking
6200  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6201  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6202  // the global counter might not get updated.
6203  //
6204  // Currently, this can only happen as the library is unloaded,
6205  // so there are no harmful side effects.
6206  if (thread->th.th_active_in_pool) {
6207  thread->th.th_active_in_pool = FALSE;
6208  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6209  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6210  }
6211  }
6212 
6213  __kmp_free_implicit_task(thread);
6214 
6215 // Free the fast memory for tasking
6216 #if USE_FAST_MEMORY
6217  __kmp_free_fast_memory(thread);
6218 #endif /* USE_FAST_MEMORY */
6219 
6220  __kmp_suspend_uninitialize_thread(thread);
6221 
6222  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6223  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6224 
6225  --__kmp_all_nth;
6226  // __kmp_nth was decremented when the thread was added to the pool.
6227 
6228 #ifdef KMP_ADJUST_BLOCKTIME
6229  /* Adjust blocktime back to user setting or default if necessary */
6230  /* Middle initialization might never have occurred */
6231  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6232  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6233  if (__kmp_nth <= __kmp_avail_proc) {
6234  __kmp_zero_bt = FALSE;
6235  }
6236  }
6237 #endif /* KMP_ADJUST_BLOCKTIME */
6238 
6239  /* free the memory being used */
6240  if (__kmp_env_consistency_check) {
6241  if (thread->th.th_cons) {
6242  __kmp_free_cons_stack(thread->th.th_cons);
6243  thread->th.th_cons = NULL;
6244  }
6245  }
6246 
6247  if (thread->th.th_pri_common != NULL) {
6248  __kmp_free(thread->th.th_pri_common);
6249  thread->th.th_pri_common = NULL;
6250  }
6251 
6252  if (thread->th.th_task_state_memo_stack != NULL) {
6253  __kmp_free(thread->th.th_task_state_memo_stack);
6254  thread->th.th_task_state_memo_stack = NULL;
6255  }
6256 
6257 #if KMP_USE_BGET
6258  if (thread->th.th_local.bget_data != NULL) {
6259  __kmp_finalize_bget(thread);
6260  }
6261 #endif
6262 
6263 #if KMP_AFFINITY_SUPPORTED
6264  if (thread->th.th_affin_mask != NULL) {
6265  KMP_CPU_FREE(thread->th.th_affin_mask);
6266  thread->th.th_affin_mask = NULL;
6267  }
6268 #endif /* KMP_AFFINITY_SUPPORTED */
6269 
6270 #if KMP_USE_HIER_SCHED
6271  if (thread->th.th_hier_bar_data != NULL) {
6272  __kmp_free(thread->th.th_hier_bar_data);
6273  thread->th.th_hier_bar_data = NULL;
6274  }
6275 #endif
6276 
6277  __kmp_reap_team(thread->th.th_serial_team);
6278  thread->th.th_serial_team = NULL;
6279  __kmp_free(thread);
6280 
6281  KMP_MB();
6282 
6283 } // __kmp_reap_thread
6284 
6285 static void __kmp_itthash_clean(kmp_info_t *th) {
6286 #if USE_ITT_NOTIFY
6287  if (__kmp_itt_region_domains.count > 0) {
6288  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6289  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6290  while (bucket) {
6291  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6292  __kmp_thread_free(th, bucket);
6293  bucket = next;
6294  }
6295  }
6296  }
6297  if (__kmp_itt_barrier_domains.count > 0) {
6298  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6299  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6300  while (bucket) {
6301  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6302  __kmp_thread_free(th, bucket);
6303  bucket = next;
6304  }
6305  }
6306  }
6307 #endif
6308 }
6309 
6310 static void __kmp_internal_end(void) {
6311  int i;
6312 
6313  /* First, unregister the library */
6314  __kmp_unregister_library();
6315 
6316 #if KMP_OS_WINDOWS
6317  /* In Win static library, we can't tell when a root actually dies, so we
6318  reclaim the data structures for any root threads that have died but not
6319  unregistered themselves, in order to shut down cleanly.
6320  In Win dynamic library we also can't tell when a thread dies. */
6321  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6322 // dead roots
6323 #endif
6324 
6325  for (i = 0; i < __kmp_threads_capacity; i++)
6326  if (__kmp_root[i])
6327  if (__kmp_root[i]->r.r_active)
6328  break;
6329  KMP_MB(); /* Flush all pending memory write invalidates. */
6330  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6331 
6332  if (i < __kmp_threads_capacity) {
6333 #if KMP_USE_MONITOR
6334  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6335  KMP_MB(); /* Flush all pending memory write invalidates. */
6336 
6337  // Need to check that the monitor was initialized before reaping it. If we are
6338  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6339  // __kmp_monitor will appear to contain valid data, but it is only valid in
6340  // the parent process, not the child.
6341  // New behavior (201008): instead of keying off of the flag
6342  // __kmp_init_parallel, the monitor thread creation is keyed off
6343  // of the new flag __kmp_init_monitor.
6344  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6345  if (TCR_4(__kmp_init_monitor)) {
6346  __kmp_reap_monitor(&__kmp_monitor);
6347  TCW_4(__kmp_init_monitor, 0);
6348  }
6349  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6350  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6351 #endif // KMP_USE_MONITOR
6352  } else {
6353 /* TODO move this to cleanup code */
6354 #ifdef KMP_DEBUG
6355  /* make sure that everything has properly ended */
6356  for (i = 0; i < __kmp_threads_capacity; i++) {
6357  if (__kmp_root[i]) {
6358  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6359  // there can be uber threads alive here
6360  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6361  }
6362  }
6363 #endif
6364 
6365  KMP_MB();
6366 
6367  // Reap the worker threads.
6368  // This is valid for now, but be careful if threads are reaped sooner.
6369  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6370  // Get the next thread from the pool.
6371  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6372  __kmp_thread_pool = thread->th.th_next_pool;
6373  // Reap it.
6374  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6375  thread->th.th_next_pool = NULL;
6376  thread->th.th_in_pool = FALSE;
6377  __kmp_reap_thread(thread, 0);
6378  }
6379  __kmp_thread_pool_insert_pt = NULL;
6380 
6381  // Reap teams.
6382  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6383  // Get the next team from the pool.
6384  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6385  __kmp_team_pool = team->t.t_next_pool;
6386  // Reap it.
6387  team->t.t_next_pool = NULL;
6388  __kmp_reap_team(team);
6389  }
6390 
6391  __kmp_reap_task_teams();
6392 
6393 #if KMP_OS_UNIX
6394  // Threads that are not reaped should not access any resources since they
6395  // are going to be deallocated soon, so the shutdown sequence should wait
6396  // until all threads either exit the final spin-waiting loop or begin
6397  // sleeping after the given blocktime.
6398  for (i = 0; i < __kmp_threads_capacity; i++) {
6399  kmp_info_t *thr = __kmp_threads[i];
6400  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6401  KMP_CPU_PAUSE();
6402  }
6403 #endif
6404 
6405  for (i = 0; i < __kmp_threads_capacity; ++i) {
6406  // TBD: Add some checking...
6407  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6408  }
6409 
6410  /* Make sure all threadprivate destructors get run by joining with all
6411  worker threads before resetting this flag */
6412  TCW_SYNC_4(__kmp_init_common, FALSE);
6413 
6414  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6415  KMP_MB();
6416 
6417 #if KMP_USE_MONITOR
6418  // See note above: One of the possible fixes for CQ138434 / CQ140126
6419  //
6420  // FIXME: push both code fragments down and CSE them?
6421  // push them into __kmp_cleanup() ?
6422  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6423  if (TCR_4(__kmp_init_monitor)) {
6424  __kmp_reap_monitor(&__kmp_monitor);
6425  TCW_4(__kmp_init_monitor, 0);
6426  }
6427  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6428  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6429 #endif
6430  } /* else !__kmp_global.t_active */
6431  TCW_4(__kmp_init_gtid, FALSE);
6432  KMP_MB(); /* Flush all pending memory write invalidates. */
6433 
6434  __kmp_cleanup();
6435 #if OMPT_SUPPORT
6436  ompt_fini();
6437 #endif
6438 }
6439 
6440 void __kmp_internal_end_library(int gtid_req) {
6441  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6442  /* this shouldn't be a race condition because __kmp_internal_end() is the
6443  only place to clear __kmp_serial_init */
6444  /* we'll check this later too, after we get the lock */
6445  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6446  // redundant, because the next check will work in any case.
6447  if (__kmp_global.g.g_abort) {
6448  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6449  /* TODO abort? */
6450  return;
6451  }
6452  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6453  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6454  return;
6455  }
6456 
6457  // If hidden helper team has been initialized, we need to deinit it
6458  if (TCR_4(__kmp_init_hidden_helper) &&
6459  !TCR_4(__kmp_hidden_helper_team_done)) {
6460  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6461  // First release the main thread to let it continue its work
6462  __kmp_hidden_helper_main_thread_release();
6463  // Wait until the hidden helper team has been destroyed
6464  __kmp_hidden_helper_threads_deinitz_wait();
6465  }
6466 
6467  KMP_MB(); /* Flush all pending memory write invalidates. */
6468  /* find out who we are and what we should do */
6469  {
6470  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6471  KA_TRACE(
6472  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6473  if (gtid == KMP_GTID_SHUTDOWN) {
6474  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6475  "already shutdown\n"));
6476  return;
6477  } else if (gtid == KMP_GTID_MONITOR) {
6478  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6479  "registered, or system shutdown\n"));
6480  return;
6481  } else if (gtid == KMP_GTID_DNE) {
6482  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6483  "shutdown\n"));
6484  /* we don't know who we are, but we may still shutdown the library */
6485  } else if (KMP_UBER_GTID(gtid)) {
6486  /* unregister ourselves as an uber thread. gtid is no longer valid */
6487  if (__kmp_root[gtid]->r.r_active) {
6488  __kmp_global.g.g_abort = -1;
6489  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6490  __kmp_unregister_library();
6491  KA_TRACE(10,
6492  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6493  gtid));
6494  return;
6495  } else {
6496  __kmp_itthash_clean(__kmp_threads[gtid]);
6497  KA_TRACE(
6498  10,
6499  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6500  __kmp_unregister_root_current_thread(gtid);
6501  }
6502  } else {
6503 /* worker threads may call this function through the atexit handler, if they
6504  * call exit() */
6505 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6506  TODO: do a thorough shutdown instead */
6507 #ifdef DUMP_DEBUG_ON_EXIT
6508  if (__kmp_debug_buf)
6509  __kmp_dump_debug_buffer();
6510 #endif
6511  // Unregister the library here as well: when shared-memory registration is
6512  // used on Linux, skipping this would leave stale files in /dev/shm.
6513  // Clean up the shared memory file before exiting.
6514  __kmp_unregister_library();
6515  return;
6516  }
6517  }
6518  /* synchronize the termination process */
6519  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6520 
6521  /* have we already finished */
6522  if (__kmp_global.g.g_abort) {
6523  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6524  /* TODO abort? */
6525  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6526  return;
6527  }
6528  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6529  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6530  return;
6531  }
6532 
6533  /* We need this lock to enforce mutual exclusion between this reading of
6534  __kmp_threads_capacity and the writing by __kmp_register_root.
6535  Alternatively, we can use a counter of roots that is atomically updated by
6536  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6537  __kmp_internal_end_*. */
6538  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6539 
6540  /* now we can safely conduct the actual termination */
6541  __kmp_internal_end();
6542 
6543  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6544  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6545 
6546  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6547 
6548 #ifdef DUMP_DEBUG_ON_EXIT
6549  if (__kmp_debug_buf)
6550  __kmp_dump_debug_buffer();
6551 #endif
6552 
6553 #if KMP_OS_WINDOWS
6554  __kmp_close_console();
6555 #endif
6556 
6557  __kmp_fini_allocator();
6558 
6559 } // __kmp_internal_end_library
6560 
6561 void __kmp_internal_end_thread(int gtid_req) {
6562  int i;
6563 
6564  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6565  /* this shouldn't be a race condition because __kmp_internal_end() is the
6566  * only place to clear __kmp_serial_init */
6567  /* we'll check this later too, after we get the lock */
6568  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6569  // redundant, because the next check will work in any case.
6570  if (__kmp_global.g.g_abort) {
6571  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6572  /* TODO abort? */
6573  return;
6574  }
6575  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6576  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6577  return;
6578  }
6579 
6580  // If hidden helper team has been initialized, we need to deinit it
6581  if (TCR_4(__kmp_init_hidden_helper) &&
6582  !TCR_4(__kmp_hidden_helper_team_done)) {
6583  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6584  // First release the main thread to let it continue its work
6585  __kmp_hidden_helper_main_thread_release();
6586  // Wait until the hidden helper team has been destroyed
6587  __kmp_hidden_helper_threads_deinitz_wait();
6588  }
6589 
6590  KMP_MB(); /* Flush all pending memory write invalidates. */
6591 
6592  /* find out who we are and what we should do */
6593  {
6594  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6595  KA_TRACE(10,
6596  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6597  if (gtid == KMP_GTID_SHUTDOWN) {
6598  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6599  "already shutdown\n"));
6600  return;
6601  } else if (gtid == KMP_GTID_MONITOR) {
6602  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6603  "registered, or system shutdown\n"));
6604  return;
6605  } else if (gtid == KMP_GTID_DNE) {
6606  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6607  "shutdown\n"));
6608  return;
6609  /* we don't know who we are */
6610  } else if (KMP_UBER_GTID(gtid)) {
6611  /* unregister ourselves as an uber thread. gtid is no longer valid */
6612  if (__kmp_root[gtid]->r.r_active) {
6613  __kmp_global.g.g_abort = -1;
6614  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6615  KA_TRACE(10,
6616  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6617  gtid));
6618  return;
6619  } else {
6620  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6621  gtid));
6622  __kmp_unregister_root_current_thread(gtid);
6623  }
6624  } else {
6625  /* just a worker thread, let's leave */
6626  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6627 
6628  if (gtid >= 0) {
6629  __kmp_threads[gtid]->th.th_task_team = NULL;
6630  }
6631 
6632  KA_TRACE(10,
6633  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6634  gtid));
6635  return;
6636  }
6637  }
6638 #if KMP_DYNAMIC_LIB
6639  if (__kmp_pause_status != kmp_hard_paused)
6640  // AC: let's not shut down the dynamic library at the exit of an uber thread;
6641  // it is better to shut down later, in the library destructor.
6642  {
6643  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6644  return;
6645  }
6646 #endif
6647  /* synchronize the termination process */
6648  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6649 
6650  /* have we already finished */
6651  if (__kmp_global.g.g_abort) {
6652  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6653  /* TODO abort? */
6654  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6655  return;
6656  }
6657  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6658  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6659  return;
6660  }
6661 
6662  /* We need this lock to enforce mutual exclusion between this reading of
6663  __kmp_threads_capacity and the writing by __kmp_register_root.
6664  Alternatively, we can use a counter of roots that is atomically updated by
6665  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6666  __kmp_internal_end_*. */
6667 
6668  /* should we finish the run-time? are all siblings done? */
6669  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6670 
6671  for (i = 0; i < __kmp_threads_capacity; ++i) {
6672  if (KMP_UBER_GTID(i)) {
6673  KA_TRACE(
6674  10,
6675  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6676  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6677  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6678  return;
6679  }
6680  }
6681 
6682  /* now we can safely conduct the actual termination */
6683 
6684  __kmp_internal_end();
6685 
6686  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6687  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6688 
6689  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6690 
6691 #ifdef DUMP_DEBUG_ON_EXIT
6692  if (__kmp_debug_buf)
6693  __kmp_dump_debug_buffer();
6694 #endif
6695 } // __kmp_internal_end_thread
6696 
6697 // -----------------------------------------------------------------------------
6698 // Library registration stuff.
6699 
6700 static long __kmp_registration_flag = 0;
6701 // Random value used to indicate library initialization.
6702 static char *__kmp_registration_str = NULL;
6703 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6704 
6705 static inline char *__kmp_reg_status_name() {
6706 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6707  each thread. If registration and unregistration happen in different threads
6708  (omp_misc_other_root_exit.cpp test case), the registered_lib_env environment
6709  variable cannot be found, because its name will contain a different pid. */
6710 // macOS* complains about name being too long with additional getuid()
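// The resulting name looks roughly like this (pid/uid values are hypothetical):
//   Unix* (non-macOS) dynamic builds: "__KMP_REGISTERED_LIB_12345_1000"
//   all other configurations:         "__KMP_REGISTERED_LIB_12345"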
6711 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6712  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6713  (int)getuid());
6714 #else
6715  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6716 #endif
6717 } // __kmp_reg_status_name
6718 
6719 #if defined(KMP_USE_SHM)
6720 bool __kmp_shm_available = false;
6721 bool __kmp_tmp_available = false;
6722 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6723 char *temp_reg_status_file_name = nullptr;
6724 #endif
6725 
6726 void __kmp_register_library_startup(void) {
6727 
6728  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6729  int done = 0;
6730  union {
6731  double dtime;
6732  long ltime;
6733  } time;
6734 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6735  __kmp_initialize_system_tick();
6736 #endif
6737  __kmp_read_system_time(&time.dtime);
6738  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6739  __kmp_registration_str =
6740  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6741  __kmp_registration_flag, KMP_LIBRARY_FILE);
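  // The flag mixes the 0xCAFE0000 magic with the low 16 bits of the current
  // time, and the string encodes "<flag address>-<flag value in hex>-<library
  // file name>", e.g. (hypothetical) "0x7f1234567890-cafe1234-libomp.so".
  // The '-'-separated fields are parsed below when another registered copy of
  // the runtime is found, to decide whether that copy is still alive.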
6742 
6743  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6744  __kmp_registration_str));
6745 
6746  while (!done) {
6747 
6748  char *value = NULL; // Actual value of the environment variable.
6749 
6750 #if defined(KMP_USE_SHM)
6751  char *shm_name = nullptr;
6752  char *data1 = nullptr;
6753  __kmp_shm_available = __kmp_detect_shm();
6754  if (__kmp_shm_available) {
6755  int fd1 = -1;
6756  shm_name = __kmp_str_format("/%s", name);
6757  int shm_preexist = 0;
6758  fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6759  if ((fd1 == -1) && (errno == EEXIST)) {
6760  // file didn't open because it already exists.
6761  // try opening existing file
6762  fd1 = shm_open(shm_name, O_RDWR, 0666);
6763  if (fd1 == -1) { // file didn't open
6764  KMP_WARNING(FunctionError, "Can't open SHM");
6765  __kmp_shm_available = false;
6766  } else { // able to open existing file
6767  shm_preexist = 1;
6768  }
6769  }
6770  if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6771  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6772  KMP_WARNING(FunctionError, "Can't set size of SHM");
6773  __kmp_shm_available = false;
6774  }
6775  }
6776  if (__kmp_shm_available) { // SHM exists, now map it
6777  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6778  fd1, 0);
6779  if (data1 == MAP_FAILED) { // failed to map shared memory
6780  KMP_WARNING(FunctionError, "Can't map SHM");
6781  __kmp_shm_available = false;
6782  }
6783  }
6784  if (__kmp_shm_available) { // SHM mapped
6785  if (shm_preexist == 0) { // set data to SHM, set value
6786  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6787  }
6788  // Read value from either what we just wrote or existing file.
6789  value = __kmp_str_format("%s", data1); // read value from SHM
6790  munmap(data1, SHM_SIZE);
6791  }
6792  if (fd1 != -1)
6793  close(fd1);
6794  }
6795  if (!__kmp_shm_available)
6796  __kmp_tmp_available = __kmp_detect_tmp();
6797  if (!__kmp_shm_available && __kmp_tmp_available) {
6798  // SHM failed to work due to an error other than that the file already
6799  // exists. Try to create a temp file under /tmp.
6800  // If /tmp isn't accessible, fall back to using environment variable.
6801  // TODO: /tmp might not always be the temporary directory. For now we will
6802  // not consider TMPDIR.
6803  int fd1 = -1;
6804  temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6805  int tmp_preexist = 0;
6806  fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6807  if ((fd1 == -1) && (errno == EEXIST)) {
6808  // file didn't open because it already exists.
6809  // try opening existing file
6810  fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
6811  if (fd1 == -1) { // file didn't open
6812  KMP_WARNING(FunctionError, "Can't open TEMP");
6813  __kmp_tmp_available = false;
6814  } else {
6815  tmp_preexist = 1;
6816  }
6817  }
6818  if (__kmp_tmp_available && tmp_preexist == 0) {
6819  // we created the /tmp file; now set its size
6820  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6821  KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6822  __kmp_tmp_available = false;
6823  }
6824  }
6825  if (__kmp_tmp_available) {
6826  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6827  fd1, 0);
6828  if (data1 == MAP_FAILED) { // failed to map /tmp
6829  KMP_WARNING(FunctionError, "Can't map /tmp");
6830  __kmp_tmp_available = false;
6831  }
6832  }
6833  if (__kmp_tmp_available) {
6834  if (tmp_preexist == 0) { // set data to TMP, set value
6835  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6836  }
6837  // Read value from either what we just wrote or existing file.
6838  value = __kmp_str_format("%s", data1); // read value from /tmp file
6839  munmap(data1, SHM_SIZE);
6840  }
6841  if (fd1 != -1)
6842  close(fd1);
6843  }
6844  if (!__kmp_shm_available && !__kmp_tmp_available) {
6845  // no /dev/shm and no /tmp -- fall back to environment variable
6846  // Set environment variable, but do not overwrite if it exists.
6847  __kmp_env_set(name, __kmp_registration_str, 0);
6848  // read value to see if it got set
6849  value = __kmp_env_get(name);
6850  }
6851 #else // Windows and unix with static library
6852  // Set environment variable, but do not overwrite if it exists.
6853  __kmp_env_set(name, __kmp_registration_str, 0);
6854  // read value to see if it got set
6855  value = __kmp_env_get(name);
6856 #endif
6857 
6858  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6859  done = 1; // Ok, environment variable set successfully, exit the loop.
6860  } else {
6861  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6862  // Check whether it is alive or dead.
6863  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6864  char *tail = value;
6865  char *flag_addr_str = NULL;
6866  char *flag_val_str = NULL;
6867  char const *file_name = NULL;
6868  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6869  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6870  file_name = tail;
6871  if (tail != NULL) {
6872  unsigned long *flag_addr = 0;
6873  unsigned long flag_val = 0;
6874  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6875  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6876  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6877  // First, check whether environment-encoded address is mapped into
6878  // addr space.
6879  // If so, dereference it to see if it still has the right value.
6880  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6881  neighbor = 1;
6882  } else {
6883  // If not, then we know the other copy of the library is no longer
6884  // running.
6885  neighbor = 2;
6886  }
6887  }
6888  }
6889  switch (neighbor) {
6890  case 0: // Cannot parse environment variable -- neighbor status unknown.
6891  // Assume it is the incompatible format of a future version of the
6892  // library. Assume the other library is alive.
6893  // WARN( ... ); // TODO: Issue a warning.
6894  file_name = "unknown library";
6895  KMP_FALLTHROUGH();
6896  // Attention! Falling through to the next case. That's intentional.
6897  case 1: { // Neighbor is alive.
6898  // Check it is allowed.
6899  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6900  if (!__kmp_str_match_true(duplicate_ok)) {
6901  // That's not allowed. Issue fatal error.
6902  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6903  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6904  }
6905  KMP_INTERNAL_FREE(duplicate_ok);
6906  __kmp_duplicate_library_ok = 1;
6907  done = 1; // Exit the loop.
6908  } break;
6909  case 2: { // Neighbor is dead.
6910 
6911 #if defined(KMP_USE_SHM)
6912  if (__kmp_shm_available) { // close shared memory.
6913  shm_unlink(shm_name); // this removes file in /dev/shm
6914  } else if (__kmp_tmp_available) {
6915  unlink(temp_reg_status_file_name); // this removes the temp file
6916  } else {
6917  // Clear the variable and try to register library again.
6918  __kmp_env_unset(name);
6919  }
6920 #else
6921  // Clear the variable and try to register library again.
6922  __kmp_env_unset(name);
6923 #endif
6924  } break;
6925  default: {
6926  KMP_DEBUG_ASSERT(0);
6927  } break;
6928  }
6929  }
6930  KMP_INTERNAL_FREE((void *)value);
6931 #if defined(KMP_USE_SHM)
6932  if (shm_name)
6933  KMP_INTERNAL_FREE((void *)shm_name);
6934 #endif
6935  } // while
6936  KMP_INTERNAL_FREE((void *)name);
6937 
6938 } // func __kmp_register_library_startup
6939 
6940 void __kmp_unregister_library(void) {
6941 
6942  char *name = __kmp_reg_status_name();
6943  char *value = NULL;
6944 
6945 #if defined(KMP_USE_SHM)
6946  char *shm_name = nullptr;
6947  int fd1;
6948  if (__kmp_shm_available) {
6949  shm_name = __kmp_str_format("/%s", name);
6950  fd1 = shm_open(shm_name, O_RDONLY, 0666);
6951  if (fd1 != -1) { // File opened successfully
6952  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6953  if (data1 != MAP_FAILED) {
6954  value = __kmp_str_format("%s", data1); // read value from SHM
6955  munmap(data1, SHM_SIZE);
6956  }
6957  close(fd1);
6958  }
6959  } else if (__kmp_tmp_available) { // try /tmp
6960  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6961  if (fd1 != -1) { // File opened successfully
6962  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6963  if (data1 != MAP_FAILED) {
6964  value = __kmp_str_format("%s", data1); // read value from /tmp
6965  munmap(data1, SHM_SIZE);
6966  }
6967  close(fd1);
6968  }
6969  } else { // fall back to environment variable
6970  value = __kmp_env_get(name);
6971  }
6972 #else
6973  value = __kmp_env_get(name);
6974 #endif
6975 
6976  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6977  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6978  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6979 // Ok, this is our variable. Delete it.
6980 #if defined(KMP_USE_SHM)
6981  if (__kmp_shm_available) {
6982  shm_unlink(shm_name); // this removes file in /dev/shm
6983  } else if (__kmp_tmp_available) {
6984  unlink(temp_reg_status_file_name); // this removes the temp file
6985  } else {
6986  __kmp_env_unset(name);
6987  }
6988 #else
6989  __kmp_env_unset(name);
6990 #endif
6991  }
6992 
6993 #if defined(KMP_USE_SHM)
6994  if (shm_name)
6995  KMP_INTERNAL_FREE(shm_name);
6996  if (temp_reg_status_file_name)
6997  KMP_INTERNAL_FREE(temp_reg_status_file_name);
6998 #endif
6999 
7000  KMP_INTERNAL_FREE(__kmp_registration_str);
7001  KMP_INTERNAL_FREE(value);
7002  KMP_INTERNAL_FREE(name);
7003 
7004  __kmp_registration_flag = 0;
7005  __kmp_registration_str = NULL;
7006 
7007 } // __kmp_unregister_library
7008 
7009 // End of Library registration stuff.
7010 // -----------------------------------------------------------------------------
7011 
7012 #if KMP_MIC_SUPPORTED
7013 
7014 static void __kmp_check_mic_type() {
7015  kmp_cpuid_t cpuid_state = {0};
7016  kmp_cpuid_t *cs_p = &cpuid_state;
7017  __kmp_x86_cpuid(1, 0, cs_p);
7018  // We don't support mic1 at the moment
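  // Rough decode of CPUID leaf 1 EAX: mask 0xff0 keeps the family/model
  // fields, and 0xB10 corresponds to family 0x0B, model 1 (KNC); mask 0xf0ff0
  // also keeps the extended-model field, and 0x50670 corresponds to family 6,
  // model 0x57 (KNL).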
7019  if ((cs_p->eax & 0xff0) == 0xB10) {
7020  __kmp_mic_type = mic2;
7021  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7022  __kmp_mic_type = mic3;
7023  } else {
7024  __kmp_mic_type = non_mic;
7025  }
7026 }
7027 
7028 #endif /* KMP_MIC_SUPPORTED */
7029 
7030 #if KMP_HAVE_UMWAIT
7031 static void __kmp_user_level_mwait_init() {
7032  struct kmp_cpuid buf;
7033  __kmp_x86_cpuid(7, 0, &buf);
7034  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
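  // CPUID.(EAX=07H,ECX=0):ECX bit 5 is the WAITPKG feature flag, which
  // advertises the umwait/umonitor/tpause instructions checked for here.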
7035  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7036  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7037  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7038  __kmp_umwait_enabled));
7039 }
7040 #elif KMP_HAVE_MWAIT
7041 #ifndef AT_INTELPHIUSERMWAIT
7042 // Spurious, non-existent value that should always fail to return anything.
7043  // Will be replaced with the correct value once it is known.
7044 #define AT_INTELPHIUSERMWAIT 10000
7045 #endif
7046 // The getauxval() function is available in RHEL7 and SLES12. If a system with
7047 // an earlier OS is used to build the RTL, we'll use the following internal
7048 // function when the entry is not found.
7049 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7050 unsigned long getauxval(unsigned long) { return 0; }
7051 
7052 static void __kmp_user_level_mwait_init() {
7053  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7054  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7055  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7056  // KMP_USER_LEVEL_MWAIT was set to TRUE.
7057  if (__kmp_mic_type == mic3) {
7058  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7059  if ((res & 0x1) || __kmp_user_level_mwait) {
7060  __kmp_mwait_enabled = TRUE;
7061  if (__kmp_user_level_mwait) {
7062  KMP_INFORM(EnvMwaitWarn);
7063  }
7064  } else {
7065  __kmp_mwait_enabled = FALSE;
7066  }
7067  }
7068  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7069  "__kmp_mwait_enabled = %d\n",
7070  __kmp_mic_type, __kmp_mwait_enabled));
7071 }
7072 #endif /* KMP_HAVE_UMWAIT */
7073 
7074 static void __kmp_do_serial_initialize(void) {
7075  int i, gtid;
7076  size_t size;
7077 
7078  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7079 
7080  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7081  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7082  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7083  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7084  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7085 
7086 #if OMPT_SUPPORT
7087  ompt_pre_init();
7088 #endif
7089 #if OMPD_SUPPORT
7090  __kmp_env_dump();
7091  ompd_init();
7092 #endif
7093 
7094  __kmp_validate_locks();
7095 
7096 #if ENABLE_LIBOMPTARGET
7097  /* Initialize functions from libomptarget */
7098  __kmp_init_omptarget();
7099 #endif
7100 
7101  /* Initialize internal memory allocator */
7102  __kmp_init_allocator();
7103 
7104  /* Register the library startup via an environment variable or via mapped
7105  shared memory file and check to see whether another copy of the library is
7106  already registered. Since a forked child process is often terminated, we
7107  postpone the registration until middle initialization in the child. */
7108  if (__kmp_need_register_serial)
7109  __kmp_register_library_startup();
7110 
7111  /* TODO reinitialization of library */
7112  if (TCR_4(__kmp_global.g.g_done)) {
7113  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7114  }
7115 
7116  __kmp_global.g.g_abort = 0;
7117  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7118 
7119 /* initialize the locks */
7120 #if KMP_USE_ADAPTIVE_LOCKS
7121 #if KMP_DEBUG_ADAPTIVE_LOCKS
7122  __kmp_init_speculative_stats();
7123 #endif
7124 #endif
7125 #if KMP_STATS_ENABLED
7126  __kmp_stats_init();
7127 #endif
7128  __kmp_init_lock(&__kmp_global_lock);
7129  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7130  __kmp_init_lock(&__kmp_debug_lock);
7131  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7132  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7133  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7134  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7135  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7136  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7137  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7138  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7139  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7140  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7141  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7142  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7143  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7144  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7145  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7146 #if KMP_USE_MONITOR
7147  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7148 #endif
7149  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7150 
7151  /* conduct initialization and initial setup of configuration */
7152 
7153  __kmp_runtime_initialize();
7154 
7155 #if KMP_MIC_SUPPORTED
7156  __kmp_check_mic_type();
7157 #endif
7158 
7159 // Some global variable initialization moved here from kmp_env_initialize()
7160 #ifdef KMP_DEBUG
7161  kmp_diag = 0;
7162 #endif
7163  __kmp_abort_delay = 0;
7164 
7165  // From __kmp_init_dflt_team_nth()
7166  /* assume the entire machine will be used */
7167  __kmp_dflt_team_nth_ub = __kmp_xproc;
7168  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7169  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7170  }
7171  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7172  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7173  }
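  // In effect, __kmp_dflt_team_nth_ub is __kmp_xproc clamped to the range
  // [KMP_MIN_NTH, __kmp_sys_max_nth].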
7174  __kmp_max_nth = __kmp_sys_max_nth;
7175  __kmp_cg_max_nth = __kmp_sys_max_nth;
7176  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7177  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7178  __kmp_teams_max_nth = __kmp_sys_max_nth;
7179  }
7180 
7181  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7182  // part
7183  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7184 #if KMP_USE_MONITOR
7185  __kmp_monitor_wakeups =
7186  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7187  __kmp_bt_intervals =
7188  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7189 #endif
7190  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7191  __kmp_library = library_throughput;
7192  // From KMP_SCHEDULE initialization
7193  __kmp_static = kmp_sch_static_balanced;
7194 // AC: do not use analytical here, because it is non-monotonic
7195 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7196 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7197 // need to repeat assignment
7198 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7199 // bit control and barrier method control parts
7200 #if KMP_FAST_REDUCTION_BARRIER
7201 #define kmp_reduction_barrier_gather_bb ((int)1)
7202 #define kmp_reduction_barrier_release_bb ((int)1)
7203 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7204 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7205 #endif // KMP_FAST_REDUCTION_BARRIER
7206  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7207  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7208  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7209  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7210  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7211 #if KMP_FAST_REDUCTION_BARRIER
7212  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7213  // lin_64 ): hyper,1
7214  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7215  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7216  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7217  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7218  }
7219 #endif // KMP_FAST_REDUCTION_BARRIER
7220  }
7221 #if KMP_FAST_REDUCTION_BARRIER
7222 #undef kmp_reduction_barrier_release_pat
7223 #undef kmp_reduction_barrier_gather_pat
7224 #undef kmp_reduction_barrier_release_bb
7225 #undef kmp_reduction_barrier_gather_bb
7226 #endif // KMP_FAST_REDUCTION_BARRIER
7227 #if KMP_MIC_SUPPORTED
7228  if (__kmp_mic_type == mic2) { // KNC
7229  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7230  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7231  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7232  1; // forkjoin release
7233  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7234  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7235  }
7236 #if KMP_FAST_REDUCTION_BARRIER
7237  if (__kmp_mic_type == mic2) { // KNC
7238  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7239  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7240  }
7241 #endif // KMP_FAST_REDUCTION_BARRIER
7242 #endif // KMP_MIC_SUPPORTED
7243 
7244 // From KMP_CHECKS initialization
7245 #ifdef KMP_DEBUG
7246  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7247 #else
7248  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7249 #endif
7250 
7251  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7252  __kmp_foreign_tp = TRUE;
7253 
7254  __kmp_global.g.g_dynamic = FALSE;
7255  __kmp_global.g.g_dynamic_mode = dynamic_default;
7256 
7257  __kmp_init_nesting_mode();
7258 
7259  __kmp_env_initialize(NULL);
7260 
7261 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7262  __kmp_user_level_mwait_init();
7263 #endif
7264 // Print all messages in message catalog for testing purposes.
7265 #ifdef KMP_DEBUG
7266  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7267  if (__kmp_str_match_true(val)) {
7268  kmp_str_buf_t buffer;
7269  __kmp_str_buf_init(&buffer);
7270  __kmp_i18n_dump_catalog(&buffer);
7271  __kmp_printf("%s", buffer.str);
7272  __kmp_str_buf_free(&buffer);
7273  }
7274  __kmp_env_free(&val);
7275 #endif
7276 
7277  __kmp_threads_capacity =
7278  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7279  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7280  __kmp_tp_capacity = __kmp_default_tp_capacity(
7281  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7282 
7283  // If the library is shut down properly, both pools must be NULL. Just in
7284  // case, set them to NULL -- some memory may leak, but subsequent code will
7285  // work even if pools are not freed.
7286  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7287  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7288  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7289  __kmp_thread_pool = NULL;
7290  __kmp_thread_pool_insert_pt = NULL;
7291  __kmp_team_pool = NULL;
7292 
7293  /* Allocate all of the variable sized records */
7294  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7295  * expandable */
7296  /* Since allocation is cache-aligned, just add extra padding at the end */
7297  size =
7298  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7299  CACHE_LINE;
7300  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7301  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7302  sizeof(kmp_info_t *) * __kmp_threads_capacity);
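  // Both arrays live in this single cache-aligned allocation: __kmp_root
  // starts right after the __kmp_threads_capacity thread pointers, and the
  // extra CACHE_LINE bytes pad the tail of the block.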
7303 
7304  /* init thread counts */
7305  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7306  0); // Asserts fail if the library is reinitializing and
7307  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7308  __kmp_all_nth = 0;
7309  __kmp_nth = 0;
7310 
7311  /* setup the uber master thread and hierarchy */
7312  gtid = __kmp_register_root(TRUE);
7313  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7314  KMP_ASSERT(KMP_UBER_GTID(gtid));
7315  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7316 
7317  KMP_MB(); /* Flush all pending memory write invalidates. */
7318 
7319  __kmp_common_initialize();
7320 
7321 #if KMP_OS_UNIX
7322  /* invoke the child fork handler */
7323  __kmp_register_atfork();
7324 #endif
7325 
7326 #if !KMP_DYNAMIC_LIB || \
7327  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7328  {
7329  /* Invoke the exit handler when the program finishes, only for static
7330  library and macOS* dynamic. For other dynamic libraries, we already
7331  have _fini and DllMain. */
7332  int rc = atexit(__kmp_internal_end_atexit);
7333  if (rc != 0) {
7334  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7335  __kmp_msg_null);
7336  }
7337  }
7338 #endif
7339 
7340 #if KMP_HANDLE_SIGNALS
7341 #if KMP_OS_UNIX
7342  /* NOTE: make sure that this is called before the user installs their own
7343  signal handlers so that the user handlers are called first. This way they
7344  can return false, not call our handler, avoid terminating the library, and
7345  continue execution where they left off. */
7346  __kmp_install_signals(FALSE);
7347 #endif /* KMP_OS_UNIX */
7348 #if KMP_OS_WINDOWS
7349  __kmp_install_signals(TRUE);
7350 #endif /* KMP_OS_WINDOWS */
7351 #endif
7352 
7353  /* we have finished the serial initialization */
7354  __kmp_init_counter++;
7355 
7356  __kmp_init_serial = TRUE;
7357 
7358  if (__kmp_version) {
7359  __kmp_print_version_1();
7360  }
7361 
7362  if (__kmp_settings) {
7363  __kmp_env_print();
7364  }
7365 
7366  if (__kmp_display_env || __kmp_display_env_verbose) {
7367  __kmp_env_print_2();
7368  }
7369 
7370 #if OMPT_SUPPORT
7371  ompt_post_init();
7372 #endif
7373 
7374  KMP_MB();
7375 
7376  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7377 }
7378 
7379 void __kmp_serial_initialize(void) {
7380  if (__kmp_init_serial) {
7381  return;
7382  }
7383  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7384  if (__kmp_init_serial) {
7385  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7386  return;
7387  }
7388  __kmp_do_serial_initialize();
7389  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7390 }
7391 
7392 static void __kmp_do_middle_initialize(void) {
7393  int i, j;
7394  int prev_dflt_team_nth;
7395 
7396  if (!__kmp_init_serial) {
7397  __kmp_do_serial_initialize();
7398  }
7399 
7400  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7401 
7402  if (UNLIKELY(!__kmp_need_register_serial)) {
7403  // We are in a forked child process. The registration was skipped during
7404  // serial initialization in __kmp_atfork_child handler. Do it here.
7405  __kmp_register_library_startup();
7406  }
7407 
7408  // Save the previous value for the __kmp_dflt_team_nth so that
7409  // we can avoid some reinitialization if it hasn't changed.
7410  prev_dflt_team_nth = __kmp_dflt_team_nth;
7411 
7412 #if KMP_AFFINITY_SUPPORTED
7413  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7414  // number of cores on the machine.
7415  __kmp_affinity_initialize(__kmp_affinity);
7416 
7417 #endif /* KMP_AFFINITY_SUPPORTED */
7418 
7419  KMP_ASSERT(__kmp_xproc > 0);
7420  if (__kmp_avail_proc == 0) {
7421  __kmp_avail_proc = __kmp_xproc;
7422  }
7423 
7424  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7425  // correct them now
7426  j = 0;
7427  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7428  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7429  __kmp_avail_proc;
7430  j++;
7431  }
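  // Illustrative example (assuming 8 available procs): with
  // OMP_NUM_THREADS=",,2,3" the two leading empty levels become 8, and
  // __kmp_dflt_team_nth / __kmp_dflt_team_nth_ub are set to 8 as well; the
  // explicit 2 and 3 entries are left untouched.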
7432 
7433  if (__kmp_dflt_team_nth == 0) {
7434 #ifdef KMP_DFLT_NTH_CORES
7435  // Default #threads = #cores
7436  __kmp_dflt_team_nth = __kmp_ncores;
7437  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7438  "__kmp_ncores (%d)\n",
7439  __kmp_dflt_team_nth));
7440 #else
7441  // Default #threads = #available OS procs
7442  __kmp_dflt_team_nth = __kmp_avail_proc;
7443  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7444  "__kmp_avail_proc(%d)\n",
7445  __kmp_dflt_team_nth));
7446 #endif /* KMP_DFLT_NTH_CORES */
7447  }
7448 
7449  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7450  __kmp_dflt_team_nth = KMP_MIN_NTH;
7451  }
7452  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7453  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7454  }
7455 
7456  if (__kmp_nesting_mode > 0)
7457  __kmp_set_nesting_mode_threads();
7458 
7459  // There's no harm in continuing if the following check fails,
7460  // but it indicates an error in the previous logic.
7461  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7462 
7463  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7464  // Run through the __kmp_threads array and set the num threads icv for each
7465  // root thread that is currently registered with the RTL (which has not
7466  // already explicitly set its nthreads-var with a call to
7467  // omp_set_num_threads()).
7468  for (i = 0; i < __kmp_threads_capacity; i++) {
7469  kmp_info_t *thread = __kmp_threads[i];
7470  if (thread == NULL)
7471  continue;
7472  if (thread->th.th_current_task->td_icvs.nproc != 0)
7473  continue;
7474 
7475  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7476  }
7477  }
7478  KA_TRACE(
7479  20,
7480  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7481  __kmp_dflt_team_nth));
7482 
7483 #ifdef KMP_ADJUST_BLOCKTIME
7484  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7485  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7486  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7487  if (__kmp_nth > __kmp_avail_proc) {
7488  __kmp_zero_bt = TRUE;
7489  }
7490  }
7491 #endif /* KMP_ADJUST_BLOCKTIME */
7492 
7493  /* we have finished middle initialization */
7494  TCW_SYNC_4(__kmp_init_middle, TRUE);
7495 
7496  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7497 }
7498 
7499 void __kmp_middle_initialize(void) {
7500  if (__kmp_init_middle) {
7501  return;
7502  }
7503  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7504  if (__kmp_init_middle) {
7505  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7506  return;
7507  }
7508  __kmp_do_middle_initialize();
7509  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7510 }
7511 
7512 void __kmp_parallel_initialize(void) {
7513  int gtid = __kmp_entry_gtid(); // this might be a new root
7514 
7515  /* synchronize parallel initialization (for sibling) */
7516  if (TCR_4(__kmp_init_parallel))
7517  return;
7518  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7519  if (TCR_4(__kmp_init_parallel)) {
7520  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7521  return;
7522  }
7523 
7524  /* TODO reinitialization after we have already shut down */
7525  if (TCR_4(__kmp_global.g.g_done)) {
7526  KA_TRACE(
7527  10,
7528  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7529  __kmp_infinite_loop();
7530  }
7531 
7532  /* jc: The lock __kmp_initz_lock is already held, so calling
7533  __kmp_serial_initialize would cause a deadlock. So we call
7534  __kmp_do_serial_initialize directly. */
7535  if (!__kmp_init_middle) {
7536  __kmp_do_middle_initialize();
7537  }
7538  __kmp_assign_root_init_mask();
7539  __kmp_resume_if_hard_paused();
7540 
7541  /* begin initialization */
7542  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7543  KMP_ASSERT(KMP_UBER_GTID(gtid));
7544 
7545 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7546  // Save the FP control regs.
7547  // Worker threads will set theirs to these values at thread startup.
7548  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7549  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7550  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
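  // KMP_X86_MXCSR_MASK is assumed to drop the MXCSR exception-status bits so
  // workers inherit only the saved control/rounding settings, not sticky
  // status flags.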
7551 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7552 
7553 #if KMP_OS_UNIX
7554 #if KMP_HANDLE_SIGNALS
7555  /* must be after __kmp_serial_initialize */
7556  __kmp_install_signals(TRUE);
7557 #endif
7558 #endif
7559 
7560  __kmp_suspend_initialize();
7561 
7562 #if defined(USE_LOAD_BALANCE)
7563  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7564  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7565  }
7566 #else
7567  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7568  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7569  }
7570 #endif
7571 
7572  if (__kmp_version) {
7573  __kmp_print_version_2();
7574  }
7575 
7576  /* we have finished parallel initialization */
7577  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7578 
7579  KMP_MB();
7580  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7581 
7582  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7583 }
7584 
7585 void __kmp_hidden_helper_initialize() {
7586  if (TCR_4(__kmp_init_hidden_helper))
7587  return;
7588 
7589  // __kmp_parallel_initialize is required before we initialize hidden helper
7590  if (!TCR_4(__kmp_init_parallel))
7591  __kmp_parallel_initialize();
7592 
7593  // Double check. Note that this double check should not be placed before
7594  // __kmp_parallel_initialize as it will cause a deadlock.
7595  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7596  if (TCR_4(__kmp_init_hidden_helper)) {
7597  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7598  return;
7599  }
7600 
7601 #if KMP_AFFINITY_SUPPORTED
7602  // Initialize hidden helper affinity settings.
7603  // The above __kmp_parallel_initialize() will initialize
7604  // regular affinity (and topology) if not already done.
7605  if (!__kmp_hh_affinity.flags.initialized)
7606  __kmp_affinity_initialize(__kmp_hh_affinity);
7607 #endif
7608 
7609  // Set the count of hidden helper tasks to be executed to zero
7610  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7611 
7612  // Set the global variable indicating that we're initializing hidden helper
7613  // team/threads
7614  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7615 
7616  // Platform independent initialization
7617  __kmp_do_initialize_hidden_helper_threads();
7618 
7619  // Wait here for the initialization of the hidden helper teams to finish
7620  __kmp_hidden_helper_threads_initz_wait();
7621 
7622  // We have finished hidden helper initialization
7623  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7624 
7625  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7626 }
7627 
7628 /* ------------------------------------------------------------------------ */
7629 
7630 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7631  kmp_team_t *team) {
7632  kmp_disp_t *dispatch;
7633 
7634  KMP_MB();
7635 
7636  /* none of the threads have encountered any constructs, yet. */
7637  this_thr->th.th_local.this_construct = 0;
7638 #if KMP_CACHE_MANAGE
7639  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7640 #endif /* KMP_CACHE_MANAGE */
7641  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7642  KMP_DEBUG_ASSERT(dispatch);
7643  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7644  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7645  // this_thr->th.th_info.ds.ds_tid ] );
7646 
7647  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7648  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7649  if (__kmp_env_consistency_check)
7650  __kmp_push_parallel(gtid, team->t.t_ident);
7651 
7652  KMP_MB(); /* Flush all pending memory write invalidates. */
7653 }
7654 
7655 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7656  kmp_team_t *team) {
7657  if (__kmp_env_consistency_check)
7658  __kmp_pop_parallel(gtid, team->t.t_ident);
7659 
7660  __kmp_finish_implicit_task(this_thr);
7661 }
7662 
7663 int __kmp_invoke_task_func(int gtid) {
7664  int rc;
7665  int tid = __kmp_tid_from_gtid(gtid);
7666  kmp_info_t *this_thr = __kmp_threads[gtid];
7667  kmp_team_t *team = this_thr->th.th_team;
7668 
7669  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7670 #if USE_ITT_BUILD
7671  if (__itt_stack_caller_create_ptr) {
7672  // inform ittnotify about entering user's code
7673  if (team->t.t_stack_id != NULL) {
7674  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7675  } else {
7676  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7677  __kmp_itt_stack_callee_enter(
7678  (__itt_caller)team->t.t_parent->t.t_stack_id);
7679  }
7680  }
7681 #endif /* USE_ITT_BUILD */
7682 #if INCLUDE_SSC_MARKS
7683  SSC_MARK_INVOKING();
7684 #endif
7685 
7686 #if OMPT_SUPPORT
7687  void *dummy;
7688  void **exit_frame_p;
7689  ompt_data_t *my_task_data;
7690  ompt_data_t *my_parallel_data;
7691  int ompt_team_size;
7692 
7693  if (ompt_enabled.enabled) {
7694  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7695  .ompt_task_info.frame.exit_frame.ptr);
7696  } else {
7697  exit_frame_p = &dummy;
7698  }
7699 
7700  my_task_data =
7701  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7702  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7703  if (ompt_enabled.ompt_callback_implicit_task) {
7704  ompt_team_size = team->t.t_nproc;
7705  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7706  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7707  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7708  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7709  }
7710 #endif
7711 
7712 #if KMP_STATS_ENABLED
7713  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7714  if (previous_state == stats_state_e::TEAMS_REGION) {
7715  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7716  } else {
7717  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7718  }
7719  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7720 #endif
7721 
7722  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7723  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7724 #if OMPT_SUPPORT
7725  ,
7726  exit_frame_p
7727 #endif
7728  );
7729 #if OMPT_SUPPORT
7730  *exit_frame_p = NULL;
7731  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7732 #endif
7733 
7734 #if KMP_STATS_ENABLED
7735  if (previous_state == stats_state_e::TEAMS_REGION) {
7736  KMP_SET_THREAD_STATE(previous_state);
7737  }
7738  KMP_POP_PARTITIONED_TIMER();
7739 #endif
7740 
7741 #if USE_ITT_BUILD
7742  if (__itt_stack_caller_create_ptr) {
7743  // inform ittnotify about leaving user's code
7744  if (team->t.t_stack_id != NULL) {
7745  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7746  } else {
7747  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7748  __kmp_itt_stack_callee_leave(
7749  (__itt_caller)team->t.t_parent->t.t_stack_id);
7750  }
7751  }
7752 #endif /* USE_ITT_BUILD */
7753  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7754 
7755  return rc;
7756 }
7757 
7758 void __kmp_teams_master(int gtid) {
7759  // This routine is called by all primary threads in teams construct
7760  kmp_info_t *thr = __kmp_threads[gtid];
7761  kmp_team_t *team = thr->th.th_team;
7762  ident_t *loc = team->t.t_ident;
7763  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7764  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7765  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7766  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7767  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7768 
7769  // This thread is a new CG root. Set up the proper variables.
7770  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7771  tmp->cg_root = thr; // Make thr the CG root
7772  // Init to thread limit stored when league primary threads were forked
7773  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7774  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7775  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7776  " cg_nthreads to 1\n",
7777  thr, tmp));
7778  tmp->up = thr->th.th_cg_roots;
7779  thr->th.th_cg_roots = tmp;
7780 
7781 // Launch the league of teams now, but do not let workers execute
7782 // (they hang on fork barrier until next parallel)
7783 #if INCLUDE_SSC_MARKS
7784  SSC_MARK_FORKING();
7785 #endif
7786  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7787  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7788  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7789 #if INCLUDE_SSC_MARKS
7790  SSC_MARK_JOINING();
7791 #endif
7792  // If the team size was reduced from the limit, set it to the new size
7793  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7794  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7795  // AC: last parameter "1" eliminates the join barrier, which won't work because
7796  // worker threads are in a fork barrier waiting for more parallel regions
7797  __kmp_join_call(loc, gtid
7798 #if OMPT_SUPPORT
7799  ,
7800  fork_context_intel
7801 #endif
7802  ,
7803  1);
7804 }
7805 
7806 int __kmp_invoke_teams_master(int gtid) {
7807  kmp_info_t *this_thr = __kmp_threads[gtid];
7808  kmp_team_t *team = this_thr->th.th_team;
7809 #if KMP_DEBUG
7810  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7811  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7812  (void *)__kmp_teams_master);
7813 #endif
7814  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7815 #if OMPT_SUPPORT
7816  int tid = __kmp_tid_from_gtid(gtid);
7817  ompt_data_t *task_data =
7818  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7819  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7820  if (ompt_enabled.ompt_callback_implicit_task) {
7821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7822  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7823  ompt_task_initial);
7824  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7825  }
7826 #endif
7827  __kmp_teams_master(gtid);
7828 #if OMPT_SUPPORT
7829  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7830 #endif
7831  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7832  return 1;
7833 }
7834 
7835 /* This sets the requested number of threads for the next parallel region
7836  encountered by this team. Since this should be enclosed in the fork/join
7837  critical section, it should avoid race conditions with asymmetrical nested
7838  parallelism. */
7839 
7840 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7841  kmp_info_t *thr = __kmp_threads[gtid];
7842 
7843  if (num_threads > 0)
7844  thr->th.th_set_nproc = num_threads;
7845 }
7846 
7847 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7848  int num_threads) {
7849  KMP_DEBUG_ASSERT(thr);
7850  // Remember the number of threads for inner parallel regions
7851  if (!TCR_4(__kmp_init_middle))
7852  __kmp_middle_initialize(); // get internal globals calculated
7853  __kmp_assign_root_init_mask();
7854  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7855  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7856 
7857  if (num_threads == 0) {
7858  if (__kmp_teams_thread_limit > 0) {
7859  num_threads = __kmp_teams_thread_limit;
7860  } else {
7861  num_threads = __kmp_avail_proc / num_teams;
7862  }
7863  // adjust num_threads w/o warning as it is not a user setting
7864  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7865  // no thread_limit clause specified - do not change thread-limit-var ICV
7866  if (num_threads > __kmp_dflt_team_nth) {
7867  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7868  }
7869  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7870  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7871  } // prevent team size from exceeding thread-limit-var
7872  if (num_teams * num_threads > __kmp_teams_max_nth) {
7873  num_threads = __kmp_teams_max_nth / num_teams;
7874  }
7875  if (num_threads == 0) {
7876  num_threads = 1;
7877  }
7878  } else {
7879  if (num_threads < 0) {
7880  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7881  __kmp_msg_null);
7882  num_threads = 1;
7883  }
7884  // This thread will be the primary thread of the league of primary threads
7885  // Store new thread limit; old limit is saved in th_cg_roots list
7886  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7887  // num_threads = min(num_threads, nthreads-var)
7888  if (num_threads > __kmp_dflt_team_nth) {
7889  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7890  }
7891  if (num_teams * num_threads > __kmp_teams_max_nth) {
7892  int new_threads = __kmp_teams_max_nth / num_teams;
7893  if (new_threads == 0) {
7894  new_threads = 1;
7895  }
7896  if (new_threads != num_threads) {
7897  if (!__kmp_reserve_warn) { // user asked for too many threads
7898  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7899  __kmp_msg(kmp_ms_warning,
7900  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7901  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7902  }
7903  }
7904  num_threads = new_threads;
7905  }
7906  }
7907  thr->th.th_teams_size.nth = num_threads;
7908 }
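// Worked example for the clamping above (hypothetical values, not taken from
// the source): assume __kmp_teams_max_nth == 256 and __kmp_dflt_team_nth == 16.
// A request of num_teams == 64 with an explicit num_threads == 8 satisfies the
// nthreads-var bound (8 <= 16) but violates 64 * 8 <= 256, so num_threads is
// reduced to 256 / 64 == 4 and a one-time CantFormThrTeam warning is issued.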
7909 
7910 /* this sets the requested number of teams for the teams region and/or
7911  the number of threads for the next parallel region encountered */
7912 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7913  int num_threads) {
7914  kmp_info_t *thr = __kmp_threads[gtid];
7915  if (num_teams < 0) {
7916  // The OpenMP specification requires requested values to be positive,
7917  // but callers can pass any value, so we'd better check.
7918  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7919  __kmp_msg_null);
7920  num_teams = 1;
7921  }
7922  if (num_teams == 0) {
7923  if (__kmp_nteams > 0) {
7924  num_teams = __kmp_nteams;
7925  } else {
7926  num_teams = 1; // default number of teams is 1.
7927  }
7928  }
7929  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7930  if (!__kmp_reserve_warn) {
7931  __kmp_reserve_warn = 1;
7932  __kmp_msg(kmp_ms_warning,
7933  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7934  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7935  }
7936  num_teams = __kmp_teams_max_nth;
7937  }
7938  // Set number of teams (number of threads in the outer "parallel" of the
7939  // teams)
7940  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7941 
7942  __kmp_push_thread_limit(thr, num_teams, num_threads);
7943 }
7944 
7945 /* This sets the requested number of teams for the teams region and/or
7946  the number of threads for the next parallel region encountered */
7947 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7948  int num_teams_ub, int num_threads) {
7949  kmp_info_t *thr = __kmp_threads[gtid];
7950  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7951  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7952  KMP_DEBUG_ASSERT(num_threads >= 0);
7953 
7954  if (num_teams_lb > num_teams_ub) {
7955  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7956  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7957  }
7958 
7959  int num_teams = 1; // default number of teams is 1.
7960 
7961  if (num_teams_lb == 0 && num_teams_ub > 0)
7962  num_teams_lb = num_teams_ub;
7963 
7964  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7965  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7966  if (num_teams > __kmp_teams_max_nth) {
7967  if (!__kmp_reserve_warn) {
7968  __kmp_reserve_warn = 1;
7969  __kmp_msg(kmp_ms_warning,
7970  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7971  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7972  }
7973  num_teams = __kmp_teams_max_nth;
7974  }
7975  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7976  num_teams = num_teams_ub;
7977  } else { // num_teams_lb <= num_teams <= num_teams_ub
7978  if (num_threads <= 0) {
7979  if (num_teams_ub > __kmp_teams_max_nth) {
7980  num_teams = num_teams_lb;
7981  } else {
7982  num_teams = num_teams_ub;
7983  }
7984  } else {
7985  num_teams = (num_threads > __kmp_teams_max_nth)
7986  ? num_teams
7987  : __kmp_teams_max_nth / num_threads;
7988  if (num_teams < num_teams_lb) {
7989  num_teams = num_teams_lb;
7990  } else if (num_teams > num_teams_ub) {
7991  num_teams = num_teams_ub;
7992  }
7993  }
7994  }
7995  // Set number of teams (number of threads in the outer "parallel" of the
7996  // teams)
7997  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7998 
7999  __kmp_push_thread_limit(thr, num_teams, num_threads);
8000 }
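// Worked example for the bounds handling above (hypothetical values, not taken
// from the source): with __kmp_teams_max_nth == 256, a combination such as
// num_teams(8:32) thread_limit(16) reaches the num_threads > 0 branch;
// 16 <= 256, so num_teams = 256 / 16 == 16, which already lies within [8, 32]
// and is kept; __kmp_push_thread_limit() then finalizes the per-team size.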
8001 
8002 // Set the proc_bind var to use in the following parallel region.
8003 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8004  kmp_info_t *thr = __kmp_threads[gtid];
8005  thr->th.th_set_proc_bind = proc_bind;
8006 }
8007 
8008 /* Launch the worker threads into the microtask. */
8009 
8010 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8011  kmp_info_t *this_thr = __kmp_threads[gtid];
8012 
8013 #ifdef KMP_DEBUG
8014  int f;
8015 #endif /* KMP_DEBUG */
8016 
8017  KMP_DEBUG_ASSERT(team);
8018  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8019  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8020  KMP_MB(); /* Flush all pending memory write invalidates. */
8021 
8022  team->t.t_construct = 0; /* no single directives seen yet */
8023  team->t.t_ordered.dt.t_value =
8024  0; /* thread 0 enters the ordered section first */
8025 
8026  /* Reset the identifiers on the dispatch buffer */
8027  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8028  if (team->t.t_max_nproc > 1) {
8029  int i;
8030  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8031  team->t.t_disp_buffer[i].buffer_index = i;
8032  team->t.t_disp_buffer[i].doacross_buf_idx = i;
8033  }
8034  } else {
8035  team->t.t_disp_buffer[0].buffer_index = 0;
8036  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8037  }
8038 
8039  KMP_MB(); /* Flush all pending memory write invalidates. */
8040  KMP_ASSERT(this_thr->th.th_team == team);
8041 
8042 #ifdef KMP_DEBUG
8043  for (f = 0; f < team->t.t_nproc; f++) {
8044  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8045  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8046  }
8047 #endif /* KMP_DEBUG */
8048 
8049  /* release the worker threads so they may begin working */
8050  __kmp_fork_barrier(gtid, 0);
8051 }
8052 
8053 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8054  kmp_info_t *this_thr = __kmp_threads[gtid];
8055 
8056  KMP_DEBUG_ASSERT(team);
8057  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8058  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8059  KMP_MB(); /* Flush all pending memory write invalidates. */
8060 
8061  /* Join barrier after fork */
8062 
8063 #ifdef KMP_DEBUG
8064  if (__kmp_threads[gtid] &&
8065  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8066  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8067  __kmp_threads[gtid]);
8068  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8069  "team->t.t_nproc=%d\n",
8070  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8071  team->t.t_nproc);
8072  __kmp_print_structure();
8073  }
8074  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8075  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8076 #endif /* KMP_DEBUG */
8077 
8078  __kmp_join_barrier(gtid); /* wait for everyone */
8079 #if OMPT_SUPPORT
8080  if (ompt_enabled.enabled &&
8081  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8082  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8083  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8084  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8085 #if OMPT_OPTIONAL
8086  void *codeptr = NULL;
8087  if (KMP_MASTER_TID(ds_tid) &&
8088  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8089  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8090  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8091 
8092  if (ompt_enabled.ompt_callback_sync_region_wait) {
8093  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8094  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8095  codeptr);
8096  }
8097  if (ompt_enabled.ompt_callback_sync_region) {
8098  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8099  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8100  codeptr);
8101  }
8102 #endif
8103  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8104  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8105  ompt_scope_end, NULL, task_data, 0, ds_tid,
8106  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8107  }
8108  }
8109 #endif
8110 
8111  KMP_MB(); /* Flush all pending memory write invalidates. */
8112  KMP_ASSERT(this_thr->th.th_team == team);
8113 }
8114 
8115 /* ------------------------------------------------------------------------ */
8116 
8117 #ifdef USE_LOAD_BALANCE
8118 
8119 // Return the number of worker threads actively spinning in the hot team,
8120 // if we are at the outermost level of parallelism. Otherwise, return 0.
8121 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8122  int i;
8123  int retval;
8124  kmp_team_t *hot_team;
8125 
8126  if (root->r.r_active) {
8127  return 0;
8128  }
8129  hot_team = root->r.r_hot_team;
8130  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8131  return hot_team->t.t_nproc - 1; // Don't count primary thread
8132  }
8133 
8134  // Skip the primary thread - it is accounted for elsewhere.
8135  retval = 0;
8136  for (i = 1; i < hot_team->t.t_nproc; i++) {
8137  if (hot_team->t.t_threads[i]->th.th_active) {
8138  retval++;
8139  }
8140  }
8141  return retval;
8142 }
8143 
8144 // Perform an automatic adjustment to the number of
8145 // threads used by the next parallel region.
8146 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8147  int retval;
8148  int pool_active;
8149  int hot_team_active;
8150  int team_curr_active;
8151  int system_active;
8152 
8153  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8154  set_nproc));
8155  KMP_DEBUG_ASSERT(root);
8156  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8157  ->th.th_current_task->td_icvs.dynamic == TRUE);
8158  KMP_DEBUG_ASSERT(set_nproc > 1);
8159 
8160  if (set_nproc == 1) {
8161  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8162  return 1;
8163  }
8164 
8165  // Threads that are active in the thread pool, active in the hot team for this
8166  // particular root (if we are at the outer par level), and the currently
8167  // executing thread (to become the primary thread) are available to add to the
8168  // new team, but are currently contributing to the system load, and must be
8169  // accounted for.
8170  pool_active = __kmp_thread_pool_active_nth;
8171  hot_team_active = __kmp_active_hot_team_nproc(root);
8172  team_curr_active = pool_active + hot_team_active + 1;
8173 
8174  // Check the system load.
8175  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8176  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8177  "hot team active = %d\n",
8178  system_active, pool_active, hot_team_active));
8179 
8180  if (system_active < 0) {
8181  // There was an error reading the necessary info from /proc, so use the
8182  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8183  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8184  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8185  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8186 
8187  // Make this call behave like the thread limit algorithm.
8188  retval = __kmp_avail_proc - __kmp_nth +
8189  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8190  if (retval > set_nproc) {
8191  retval = set_nproc;
8192  }
8193  if (retval < KMP_MIN_NTH) {
8194  retval = KMP_MIN_NTH;
8195  }
8196 
8197  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8198  retval));
8199  return retval;
8200  }
8201 
8202  // There is a slight delay in the load balance algorithm in detecting new
8203  // running procs. The real system load at this instant should be at least as
8204 // large as the number of active OMP threads available to add to the team.
8205  if (system_active < team_curr_active) {
8206  system_active = team_curr_active;
8207  }
8208  retval = __kmp_avail_proc - system_active + team_curr_active;
8209  if (retval > set_nproc) {
8210  retval = set_nproc;
8211  }
8212  if (retval < KMP_MIN_NTH) {
8213  retval = KMP_MIN_NTH;
8214  }
8215 
8216  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8217  return retval;
8218 } // __kmp_load_balance_nproc()
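// Worked example for the calculation above (hypothetical values, not taken
// from the source): with __kmp_avail_proc == 16, set_nproc == 16, one active
// pool thread, three active hot-team workers and the calling thread
// (team_curr_active == 5), a measured system_active of 9 gives
// retval = 16 - 9 + 5 == 12, so the next parallel region is formed with 12
// threads instead of the requested 16 (after the set_nproc/KMP_MIN_NTH clamps).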
8219 
8220 #endif /* USE_LOAD_BALANCE */
8221 
8222 /* ------------------------------------------------------------------------ */
8223 
8224 /* NOTE: this is called with the __kmp_init_lock held */
8225 void __kmp_cleanup(void) {
8226  int f;
8227 
8228  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8229 
8230  if (TCR_4(__kmp_init_parallel)) {
8231 #if KMP_HANDLE_SIGNALS
8232  __kmp_remove_signals();
8233 #endif
8234  TCW_4(__kmp_init_parallel, FALSE);
8235  }
8236 
8237  if (TCR_4(__kmp_init_middle)) {
8238 #if KMP_AFFINITY_SUPPORTED
8239  __kmp_affinity_uninitialize();
8240 #endif /* KMP_AFFINITY_SUPPORTED */
8241  __kmp_cleanup_hierarchy();
8242  TCW_4(__kmp_init_middle, FALSE);
8243  }
8244 
8245  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8246 
8247  if (__kmp_init_serial) {
8248  __kmp_runtime_destroy();
8249  __kmp_init_serial = FALSE;
8250  }
8251 
8252  __kmp_cleanup_threadprivate_caches();
8253 
8254  for (f = 0; f < __kmp_threads_capacity; f++) {
8255  if (__kmp_root[f] != NULL) {
8256  __kmp_free(__kmp_root[f]);
8257  __kmp_root[f] = NULL;
8258  }
8259  }
8260  __kmp_free(__kmp_threads);
8261  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8262  // there is no need to free __kmp_root separately.
8263  __kmp_threads = NULL;
8264  __kmp_root = NULL;
8265  __kmp_threads_capacity = 0;
8266 
8267  // Free old __kmp_threads arrays if they exist.
8268  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8269  while (ptr) {
8270  kmp_old_threads_list_t *next = ptr->next;
8271  __kmp_free(ptr->threads);
8272  __kmp_free(ptr);
8273  ptr = next;
8274  }
8275 
8276 #if KMP_USE_DYNAMIC_LOCK
8277  __kmp_cleanup_indirect_user_locks();
8278 #else
8279  __kmp_cleanup_user_locks();
8280 #endif
8281 #if OMPD_SUPPORT
8282  if (ompd_state) {
8283  __kmp_free(ompd_env_block);
8284  ompd_env_block = NULL;
8285  ompd_env_block_size = 0;
8286  }
8287 #endif
8288 
8289 #if KMP_AFFINITY_SUPPORTED
8290  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8291  __kmp_cpuinfo_file = NULL;
8292 #endif /* KMP_AFFINITY_SUPPORTED */
8293 
8294 #if KMP_USE_ADAPTIVE_LOCKS
8295 #if KMP_DEBUG_ADAPTIVE_LOCKS
8296  __kmp_print_speculative_stats();
8297 #endif
8298 #endif
8299  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8300  __kmp_nested_nth.nth = NULL;
8301  __kmp_nested_nth.size = 0;
8302  __kmp_nested_nth.used = 0;
8303  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8304  __kmp_nested_proc_bind.bind_types = NULL;
8305  __kmp_nested_proc_bind.size = 0;
8306  __kmp_nested_proc_bind.used = 0;
8307  if (__kmp_affinity_format) {
8308  KMP_INTERNAL_FREE(__kmp_affinity_format);
8309  __kmp_affinity_format = NULL;
8310  }
8311 
8312  __kmp_i18n_catclose();
8313 
8314 #if KMP_USE_HIER_SCHED
8315  __kmp_hier_scheds.deallocate();
8316 #endif
8317 
8318 #if KMP_STATS_ENABLED
8319  __kmp_stats_fini();
8320 #endif
8321 
8322  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8323 }
8324 
8325 /* ------------------------------------------------------------------------ */
8326 
8327 int __kmp_ignore_mppbeg(void) {
8328  char *env;
8329 
8330  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8331  if (__kmp_str_match_false(env))
8332  return FALSE;
8333  }
8334  // By default __kmpc_begin() is no-op.
8335  return TRUE;
8336 }
8337 
8338 int __kmp_ignore_mppend(void) {
8339  char *env;
8340 
8341  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8342  if (__kmp_str_match_false(env))
8343  return FALSE;
8344  }
8345  // By default __kmpc_end() is no-op.
8346  return TRUE;
8347 }
8348 
8349 void __kmp_internal_begin(void) {
8350  int gtid;
8351  kmp_root_t *root;
8352 
8353  /* this is a very important step as it will register new sibling threads
8354  and assign these new uber threads a new gtid */
8355  gtid = __kmp_entry_gtid();
8356  root = __kmp_threads[gtid]->th.th_root;
8357  KMP_ASSERT(KMP_UBER_GTID(gtid));
8358 
8359  if (root->r.r_begin)
8360  return;
8361  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8362  if (root->r.r_begin) {
8363  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8364  return;
8365  }
8366 
8367  root->r.r_begin = TRUE;
8368 
8369  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8370 }
8371 
8372 /* ------------------------------------------------------------------------ */
8373 
8374 void __kmp_user_set_library(enum library_type arg) {
8375  int gtid;
8376  kmp_root_t *root;
8377  kmp_info_t *thread;
8378 
8379  /* first, make sure we are initialized so we can get our gtid */
8380 
8381  gtid = __kmp_entry_gtid();
8382  thread = __kmp_threads[gtid];
8383 
8384  root = thread->th.th_root;
8385 
8386  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8387  library_serial));
8388  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8389  thread */
8390  KMP_WARNING(SetLibraryIncorrectCall);
8391  return;
8392  }
8393 
8394  switch (arg) {
8395  case library_serial:
8396  thread->th.th_set_nproc = 0;
8397  set__nproc(thread, 1);
8398  break;
8399  case library_turnaround:
8400  thread->th.th_set_nproc = 0;
8401  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8402  : __kmp_dflt_team_nth_ub);
8403  break;
8404  case library_throughput:
8405  thread->th.th_set_nproc = 0;
8406  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8407  : __kmp_dflt_team_nth_ub);
8408  break;
8409  default:
8410  KMP_FATAL(UnknownLibraryType, arg);
8411  }
8412 
8413  __kmp_aux_set_library(arg);
8414 }
8415 
8416 void __kmp_aux_set_stacksize(size_t arg) {
8417  if (!__kmp_init_serial)
8418  __kmp_serial_initialize();
8419 
8420 #if KMP_OS_DARWIN
8421  if (arg & (0x1000 - 1)) {
8422  arg &= ~(0x1000 - 1);
8423  if (arg + 0x1000) /* check for overflow if we round up */
8424  arg += 0x1000;
8425  }
8426 #endif
8427  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8428 
8429  /* only change the default stacksize before the first parallel region */
8430  if (!TCR_4(__kmp_init_parallel)) {
8431  size_t value = arg; /* argument is in bytes */
8432 
8433  if (value < __kmp_sys_min_stksize)
8434  value = __kmp_sys_min_stksize;
8435  else if (value > KMP_MAX_STKSIZE)
8436  value = KMP_MAX_STKSIZE;
8437 
8438  __kmp_stksize = value;
8439 
8440  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8441  }
8442 
8443  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8444 }
8445 
8446 /* set the behaviour of the runtime library */
8447 /* TODO this can cause some odd behaviour with sibling parallelism... */
8448 void __kmp_aux_set_library(enum library_type arg) {
8449  __kmp_library = arg;
8450 
8451  switch (__kmp_library) {
8452  case library_serial: {
8453  KMP_INFORM(LibraryIsSerial);
8454  } break;
8455  case library_turnaround:
8456  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8457  __kmp_use_yield = 2; // only yield when oversubscribed
8458  break;
8459  case library_throughput:
8460  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8461  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8462  break;
8463  default:
8464  KMP_FATAL(UnknownLibraryType, arg);
8465  }
8466 }
8467 
8468 /* Getting team information common for all team API */
8469 // Returns NULL if not in teams construct
8470 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8471  kmp_info_t *thr = __kmp_entry_thread();
8472  teams_serialized = 0;
8473  if (thr->th.th_teams_microtask) {
8474  kmp_team_t *team = thr->th.th_team;
8475  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8476  int ii = team->t.t_level;
8477  teams_serialized = team->t.t_serialized;
8478  int level = tlevel + 1;
8479  KMP_DEBUG_ASSERT(ii >= tlevel);
8480  while (ii > level) {
8481  for (teams_serialized = team->t.t_serialized;
8482  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8483  }
8484  if (team->t.t_serialized && (!teams_serialized)) {
8485  team = team->t.t_parent;
8486  continue;
8487  }
8488  if (ii > level) {
8489  team = team->t.t_parent;
8490  ii--;
8491  }
8492  }
8493  return team;
8494  }
8495  return NULL;
8496 }
8497 
8498 int __kmp_aux_get_team_num() {
8499  int serialized;
8500  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8501  if (team) {
8502  if (serialized > 1) {
8503  return 0; // teams region is serialized ( 1 team of 1 thread ).
8504  } else {
8505  return team->t.t_master_tid;
8506  }
8507  }
8508  return 0;
8509 }
8510 
8511 int __kmp_aux_get_num_teams() {
8512  int serialized;
8513  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8514  if (team) {
8515  if (serialized > 1) {
8516  return 1;
8517  } else {
8518  return team->t.t_parent->t.t_nproc;
8519  }
8520  }
8521  return 1;
8522 }
8523 
8524 /* ------------------------------------------------------------------------ */
8525 
8526 /*
8527  * Affinity Format Parser
8528  *
8529  * Field is in form of: %[[[0].]size]type
8530  * % and type are required (%% means print a literal '%')
8531  * type is either single char or long name surrounded by {},
8532  * e.g., N or {num_threads}
8533  * 0 => leading zeros
8534  * . => right justified when size is specified
8535  * by default output is left justified
8536  * size is the *minimum* field length
8537  * All other characters are printed as is
8538  *
8539  * Available field types:
8540  * L {nesting_level} - omp_get_level()
8541  * n {thread_num} - omp_get_thread_num()
8542  * H {host} - name of host machine
8543  * P {process_id} - process id (integer)
8544  * i {native_thread_id} - native thread identifier (integer)
8545  * N {num_threads} - omp_get_num_threads()
8546  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8547  * A {thread_affinity} - comma separated list of integers or integer ranges
8548  * (values of affinity mask)
8549  *
8550  * Implementation-specific field types can be added
8551  * If a type is unknown, print "undefined"
8552  */
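// Illustrative example (hypothetical format string and output): with
//   OMP_AFFINITY_FORMAT="host=%H pid=%P tid=%0.4n aff=%A"
// thread 3 of a team bound to cores 0-3 on a host named "node01" might print
//   host=node01 pid=12345 tid=0003 aff=0-3
// where the process id and the affinity list depend on the actual system.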
8553 
8554 // Structure holding the short name, long name, and corresponding data type
8555 // for snprintf. A table of these will represent the entire valid keyword
8556 // field types.
8557 typedef struct kmp_affinity_format_field_t {
8558  char short_name; // from spec e.g., L -> thread level
8559  const char *long_name; // from spec thread_level -> thread level
8560  char field_format; // data type for snprintf (typically 'd' or 's'
8561  // for integer or string)
8562 } kmp_affinity_format_field_t;
8563 
8564 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8565 #if KMP_AFFINITY_SUPPORTED
8566  {'A', "thread_affinity", 's'},
8567 #endif
8568  {'t', "team_num", 'd'},
8569  {'T', "num_teams", 'd'},
8570  {'L', "nesting_level", 'd'},
8571  {'n', "thread_num", 'd'},
8572  {'N', "num_threads", 'd'},
8573  {'a', "ancestor_tnum", 'd'},
8574  {'H', "host", 's'},
8575  {'P', "process_id", 'd'},
8576  {'i', "native_thread_id", 'd'}};
8577 
8578 // Return the number of characters it takes to hold field
8579 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8580  const char **ptr,
8581  kmp_str_buf_t *field_buffer) {
8582  int rc, format_index, field_value;
8583  const char *width_left, *width_right;
8584  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8585  static const int FORMAT_SIZE = 20;
8586  char format[FORMAT_SIZE] = {0};
8587  char absolute_short_name = 0;
8588 
8589  KMP_DEBUG_ASSERT(gtid >= 0);
8590  KMP_DEBUG_ASSERT(th);
8591  KMP_DEBUG_ASSERT(**ptr == '%');
8592  KMP_DEBUG_ASSERT(field_buffer);
8593 
8594  __kmp_str_buf_clear(field_buffer);
8595 
8596  // Skip the initial %
8597  (*ptr)++;
8598 
8599  // Check for %% first
8600  if (**ptr == '%') {
8601  __kmp_str_buf_cat(field_buffer, "%", 1);
8602  (*ptr)++; // skip over the second %
8603  return 1;
8604  }
8605 
8606  // Parse field modifiers if they are present
8607  pad_zeros = false;
8608  if (**ptr == '0') {
8609  pad_zeros = true;
8610  (*ptr)++; // skip over 0
8611  }
8612  right_justify = false;
8613  if (**ptr == '.') {
8614  right_justify = true;
8615  (*ptr)++; // skip over .
8616  }
8617  // Parse width of field: [width_left, width_right)
8618  width_left = width_right = NULL;
8619  if (**ptr >= '0' && **ptr <= '9') {
8620  width_left = *ptr;
8621  SKIP_DIGITS(*ptr);
8622  width_right = *ptr;
8623  }
8624 
8625  // Create the format for KMP_SNPRINTF based on flags parsed above
8626  format_index = 0;
8627  format[format_index++] = '%';
8628  if (!right_justify)
8629  format[format_index++] = '-';
8630  if (pad_zeros)
8631  format[format_index++] = '0';
8632  if (width_left && width_right) {
8633  int i = 0;
8634  // Only allow 8 digit number widths.
8635  // This also prevents overflowing the format variable
8636  while (i < 8 && width_left < width_right) {
8637  format[format_index++] = *width_left;
8638  width_left++;
8639  i++;
8640  }
8641  }
8642 
8643  // Parse a name (long or short)
8644  // Canonicalize the name into absolute_short_name
8645  found_valid_name = false;
8646  parse_long_name = (**ptr == '{');
8647  if (parse_long_name)
8648  (*ptr)++; // skip initial left brace
8649  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8650  sizeof(__kmp_affinity_format_table[0]);
8651  ++i) {
8652  char short_name = __kmp_affinity_format_table[i].short_name;
8653  const char *long_name = __kmp_affinity_format_table[i].long_name;
8654  char field_format = __kmp_affinity_format_table[i].field_format;
8655  if (parse_long_name) {
8656  size_t length = KMP_STRLEN(long_name);
8657  if (strncmp(*ptr, long_name, length) == 0) {
8658  found_valid_name = true;
8659  (*ptr) += length; // skip the long name
8660  }
8661  } else if (**ptr == short_name) {
8662  found_valid_name = true;
8663  (*ptr)++; // skip the short name
8664  }
8665  if (found_valid_name) {
8666  format[format_index++] = field_format;
8667  format[format_index++] = '\0';
8668  absolute_short_name = short_name;
8669  break;
8670  }
8671  }
8672  if (parse_long_name) {
8673  if (**ptr != '}') {
8674  absolute_short_name = 0;
8675  } else {
8676  (*ptr)++; // skip over the right brace
8677  }
8678  }
8679 
8680  // Attempt to fill the buffer with the requested
8681  // value using snprintf within __kmp_str_buf_print()
8682  switch (absolute_short_name) {
8683  case 't':
8684  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8685  break;
8686  case 'T':
8687  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8688  break;
8689  case 'L':
8690  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8691  break;
8692  case 'n':
8693  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8694  break;
8695  case 'H': {
8696  static const int BUFFER_SIZE = 256;
8697  char buf[BUFFER_SIZE];
8698  __kmp_expand_host_name(buf, BUFFER_SIZE);
8699  rc = __kmp_str_buf_print(field_buffer, format, buf);
8700  } break;
8701  case 'P':
8702  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8703  break;
8704  case 'i':
8705  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8706  break;
8707  case 'N':
8708  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8709  break;
8710  case 'a':
8711  field_value =
8712  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8713  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8714  break;
8715 #if KMP_AFFINITY_SUPPORTED
8716  case 'A': {
8717  kmp_str_buf_t buf;
8718  __kmp_str_buf_init(&buf);
8719  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8720  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8721  __kmp_str_buf_free(&buf);
8722  } break;
8723 #endif
8724  default:
8725  // According to the spec, if an implementation does not have info for a field
8726  // type, then "undefined" is printed
8727  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8728  // Skip the field
8729  if (parse_long_name) {
8730  SKIP_TOKEN(*ptr);
8731  if (**ptr == '}')
8732  (*ptr)++;
8733  } else {
8734  (*ptr)++;
8735  }
8736  }
8737 
8738  KMP_ASSERT(format_index <= FORMAT_SIZE);
8739  return rc;
8740 }
8741 
8742 /*
8743  * Return number of characters needed to hold the affinity string
8744  * (not including null byte character)
8745  * The resultant string is printed to buffer, which the caller can then
8746  * handle afterwards
8747  */
8748 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8749  kmp_str_buf_t *buffer) {
8750  const char *parse_ptr;
8751  size_t retval;
8752  const kmp_info_t *th;
8753  kmp_str_buf_t field;
8754 
8755  KMP_DEBUG_ASSERT(buffer);
8756  KMP_DEBUG_ASSERT(gtid >= 0);
8757 
8758  __kmp_str_buf_init(&field);
8759  __kmp_str_buf_clear(buffer);
8760 
8761  th = __kmp_threads[gtid];
8762  retval = 0;
8763 
8764  // If format is NULL or zero-length string, then we use
8765  // affinity-format-var ICV
8766  parse_ptr = format;
8767  if (parse_ptr == NULL || *parse_ptr == '\0') {
8768  parse_ptr = __kmp_affinity_format;
8769  }
8770  KMP_DEBUG_ASSERT(parse_ptr);
8771 
8772  while (*parse_ptr != '\0') {
8773  // Parse a field
8774  if (*parse_ptr == '%') {
8775  // Put field in the buffer
8776  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8777  __kmp_str_buf_catbuf(buffer, &field);
8778  retval += rc;
8779  } else {
8780  // Put literal character in buffer
8781  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8782  retval++;
8783  parse_ptr++;
8784  }
8785  }
8786  __kmp_str_buf_free(&field);
8787  return retval;
8788 }
8789 
8790 // Displays the affinity string to stdout
8791 void __kmp_aux_display_affinity(int gtid, const char *format) {
8792  kmp_str_buf_t buf;
8793  __kmp_str_buf_init(&buf);
8794  __kmp_aux_capture_affinity(gtid, format, &buf);
8795  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8796  __kmp_str_buf_free(&buf);
8797 }
8798 
8799 /* ------------------------------------------------------------------------ */
8800 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8801  int blocktime = arg; /* argument is in microseconds */
8802 #if KMP_USE_MONITOR
8803  int bt_intervals;
8804 #endif
8805  kmp_int8 bt_set;
8806 
8807  __kmp_save_internal_controls(thread);
8808 
8809  /* Normalize and set blocktime for the teams */
8810  if (blocktime < KMP_MIN_BLOCKTIME)
8811  blocktime = KMP_MIN_BLOCKTIME;
8812  else if (blocktime > KMP_MAX_BLOCKTIME)
8813  blocktime = KMP_MAX_BLOCKTIME;
8814 
8815  set__blocktime_team(thread->th.th_team, tid, blocktime);
8816  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8817 
8818 #if KMP_USE_MONITOR
8819  /* Calculate and set blocktime intervals for the teams */
8820  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8821 
8822  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8823  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8824 #endif
8825 
8826  /* Set whether blocktime has been set to "TRUE" */
8827  bt_set = TRUE;
8828 
8829  set__bt_set_team(thread->th.th_team, tid, bt_set);
8830  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8831 #if KMP_USE_MONITOR
8832  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8833  "bt_intervals=%d, monitor_updates=%d\n",
8834  __kmp_gtid_from_tid(tid, thread->th.th_team),
8835  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8836  __kmp_monitor_wakeups));
8837 #else
8838  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8839  __kmp_gtid_from_tid(tid, thread->th.th_team),
8840  thread->th.th_team->t.t_id, tid, blocktime));
8841 #endif
8842 }
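// Usage note (descriptive; routing described here is an assumption about the
// KMP entry points): a call such as kmp_set_blocktime() funnels into this
// routine for the calling thread; the requested value is clamped into
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] and recorded for both the thread's
// slot in its current team and its serial team, so only the caller's
// subsequent wait/sleep behavior changes, not that of other threads.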
8843 
8844 void __kmp_aux_set_defaults(char const *str, size_t len) {
8845  if (!__kmp_init_serial) {
8846  __kmp_serial_initialize();
8847  }
8848  __kmp_env_initialize(str);
8849 
8850  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8851  __kmp_env_print();
8852  }
8853 } // __kmp_aux_set_defaults
8854 
8855 /* ------------------------------------------------------------------------ */
8856 /* internal fast reduction routines */
8857 
8858 PACKED_REDUCTION_METHOD_T
8859 __kmp_determine_reduction_method(
8860  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8861  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8862  kmp_critical_name *lck) {
8863 
8864  // Default reduction method: critical construct ( lck != NULL, like in current
8865  // PAROPT )
8866  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8867  // can be selected by RTL
8868  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8869  // can be selected by RTL
8870  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8871 // among those generated by PAROPT.
8872 
8873  PACKED_REDUCTION_METHOD_T retval;
8874 
8875  int team_size;
8876 
8877  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8878 
8879 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8880  (loc && \
8881  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8882 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8883 
8884  retval = critical_reduce_block;
8885 
8886  // another way of getting the team size (with 1 dynamic dereference) is slower
8887  team_size = __kmp_get_team_num_threads(global_tid);
8888  if (team_size == 1) {
8889 
8890  retval = empty_reduce_block;
8891 
8892  } else {
8893 
8894  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8895 
8896 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8897  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
8898 
8899 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8900  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8901 
8902  int teamsize_cutoff = 4;
8903 
8904 #if KMP_MIC_SUPPORTED
8905  if (__kmp_mic_type != non_mic) {
8906  teamsize_cutoff = 8;
8907  }
8908 #endif
8909  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8910  if (tree_available) {
8911  if (team_size <= teamsize_cutoff) {
8912  if (atomic_available) {
8913  retval = atomic_reduce_block;
8914  }
8915  } else {
8916  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8917  }
8918  } else if (atomic_available) {
8919  retval = atomic_reduce_block;
8920  }
8921 #else
8922 #error "Unknown or unsupported OS"
8923 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8924  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8925 
8926 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8927 
8928 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8929 
8930  // basic tuning
8931 
8932  if (atomic_available) {
8933  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8934  retval = atomic_reduce_block;
8935  }
8936  } // otherwise: use critical section
8937 
8938 #elif KMP_OS_DARWIN
8939 
8940  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8941  if (atomic_available && (num_vars <= 3)) {
8942  retval = atomic_reduce_block;
8943  } else if (tree_available) {
8944  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8945  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8946  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8947  }
8948  } // otherwise: use critical section
8949 
8950 #else
8951 #error "Unknown or unsupported OS"
8952 #endif
8953 
8954 #else
8955 #error "Unknown or unsupported architecture"
8956 #endif
8957  }
8958 
8959  // KMP_FORCE_REDUCTION
8960 
8961  // If the team is serialized (team_size == 1), ignore the forced reduction
8962  // method and stay with the unsynchronized method (empty_reduce_block)
8963  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8964  team_size != 1) {
8965 
8966  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8967 
8968  int atomic_available, tree_available;
8969 
8970  switch ((forced_retval = __kmp_force_reduction_method)) {
8971  case critical_reduce_block:
8972  KMP_ASSERT(lck); // lck should be != 0
8973  break;
8974 
8975  case atomic_reduce_block:
8976  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8977  if (!atomic_available) {
8978  KMP_WARNING(RedMethodNotSupported, "atomic");
8979  forced_retval = critical_reduce_block;
8980  }
8981  break;
8982 
8983  case tree_reduce_block:
8984  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8985  if (!tree_available) {
8986  KMP_WARNING(RedMethodNotSupported, "tree");
8987  forced_retval = critical_reduce_block;
8988  } else {
8989 #if KMP_FAST_REDUCTION_BARRIER
8990  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8991 #endif
8992  }
8993  break;
8994 
8995  default:
8996  KMP_ASSERT(0); // "unsupported method specified"
8997  }
8998 
8999  retval = forced_retval;
9000  }
9001 
9002  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9003 
9004 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9005 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9006 
9007  return (retval);
9008 }
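// Illustrative example (hypothetical configuration, not taken from the
// source): on an x86_64 Linux host where MIC tuning is not in effect
// (teamsize_cutoff == 4), a reduction in a team of 16 threads for which the
// compiler emitted both the atomic and the tree code paths selects
// TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER above, while the same reduction in
// a team of 4 threads falls back to atomic_reduce_block.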
9009 // this function is for testing set/get/determine reduce method
9010 kmp_int32 __kmp_get_reduce_method(void) {
9011  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9012 }
9013 
9014 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9015 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9016 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9017 
9018 // Hard pause shuts down the runtime completely. Resume happens naturally when
9019 // OpenMP is used subsequently.
9020 void __kmp_hard_pause() {
9021  __kmp_pause_status = kmp_hard_paused;
9022  __kmp_internal_end_thread(-1);
9023 }
9024 
9025 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9026 void __kmp_resume_if_soft_paused() {
9027  if (__kmp_pause_status == kmp_soft_paused) {
9028  __kmp_pause_status = kmp_not_paused;
9029 
9030  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9031  kmp_info_t *thread = __kmp_threads[gtid];
9032  if (thread) { // Wake it if sleeping
9033  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9034  thread);
9035  if (fl.is_sleeping())
9036  fl.resume(gtid);
9037  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9038  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9039  } else { // thread holds the lock and may sleep soon
9040  do { // until either the thread sleeps, or we can get the lock
9041  if (fl.is_sleeping()) {
9042  fl.resume(gtid);
9043  break;
9044  } else if (__kmp_try_suspend_mx(thread)) {
9045  __kmp_unlock_suspend_mx(thread);
9046  break;
9047  }
9048  } while (1);
9049  }
9050  }
9051  }
9052  }
9053 }
9054 
9055 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9056 // TODO: add warning messages
9057 int __kmp_pause_resource(kmp_pause_status_t level) {
9058  if (level == kmp_not_paused) { // requesting resume
9059  if (__kmp_pause_status == kmp_not_paused) {
9060  // error message about runtime not being paused, so can't resume
9061  return 1;
9062  } else {
9063  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9064  __kmp_pause_status == kmp_hard_paused);
9065  __kmp_pause_status = kmp_not_paused;
9066  return 0;
9067  }
9068  } else if (level == kmp_soft_paused) { // requesting soft pause
9069  if (__kmp_pause_status != kmp_not_paused) {
9070  // error message about already being paused
9071  return 1;
9072  } else {
9073  __kmp_soft_pause();
9074  return 0;
9075  }
9076  } else if (level == kmp_hard_paused) { // requesting hard pause
9077  if (__kmp_pause_status != kmp_not_paused) {
9078  // error message about already being paused
9079  return 1;
9080  } else {
9081  __kmp_hard_pause();
9082  return 0;
9083  }
9084  } else {
9085  // error message about invalid level
9086  return 1;
9087  }
9088 }
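// Usage note (illustrative; the call chain is an assumption): a user-level
// omp_pause_resource_all(omp_pause_soft) request reaches this routine as
// __kmp_pause_resource(kmp_soft_paused) and returns 0 when the runtime was not
// already paused; a second pause request while paused returns 1, and a resume
// request (kmp_not_paused) only succeeds if the runtime is currently paused,
// matching the checks above.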
9089 
9090 void __kmp_omp_display_env(int verbose) {
9091  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9092  if (__kmp_init_serial == 0)
9093  __kmp_do_serial_initialize();
9094  __kmp_display_env_impl(!verbose, verbose);
9095  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9096 }
9097 
9098 // The team size is changing, so distributed barrier must be modified
9099 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9100  int new_nthreads) {
9101  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9102  bp_dist_bar);
9103  kmp_info_t **other_threads = team->t.t_threads;
9104 
9105  // We want all the workers to stop waiting on the barrier while we adjust the
9106  // size of the team.
9107  for (int f = 1; f < old_nthreads; ++f) {
9108  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9109  // Ignore threads that are already inactive or not present in the team
9110  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9111  // teams construct causes thread_limit to get passed in, and some of
9112  // those could be inactive; just ignore them
9113  continue;
9114  }
9115  // If thread is transitioning still to in_use state, wait for it
9116  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9117  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9118  KMP_CPU_PAUSE();
9119  }
9120  // The thread should be in_use now
9121  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9122  // Transition to unused state
9123  team->t.t_threads[f]->th.th_used_in_team.store(2);
9124  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9125  }
9126  // Release all the workers
9127  team->t.b->go_release();
9128 
9129  KMP_MFENCE();
9130 
9131  // Workers should see transition status 2 and move to 0; but may need to be
9132  // woken up first
9133  int count = old_nthreads - 1;
9134  while (count > 0) {
9135  count = old_nthreads - 1;
9136  for (int f = 1; f < old_nthreads; ++f) {
9137  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9138  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9139  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9140  void *, other_threads[f]->th.th_sleep_loc);
9141  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9142  }
9143  } else {
9144  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9145  count--;
9146  }
9147  }
9148  }
9149  // Now update the barrier size
9150  team->t.b->update_num_threads(new_nthreads);
9151  team->t.b->go_reset();
9152 }
9153 
9154 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9155  // Add the threads back to the team
9156  KMP_DEBUG_ASSERT(team);
9157  // Threads were paused and pointed at th_used_in_team temporarily during a
9158  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9159  // the thread that it should transition itself back into the team. Then, if
9160  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9161  // to wake it up.
9162  for (int f = 1; f < new_nthreads; ++f) {
9163  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9164  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9165  3);
9166  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9167  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9168  (kmp_flag_32<false, false> *)NULL);
9169  }
9170  }
9171  // The threads should be transitioning to the team; when they are done, they
9172  // should have set th_used_in_team to 1. This loop forces the primary thread to
9173  // wait until all threads have moved into the team and are waiting in the barrier.
9174  int count = new_nthreads - 1;
9175  while (count > 0) {
9176  count = new_nthreads - 1;
9177  for (int f = 1; f < new_nthreads; ++f) {
9178  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9179  count--;
9180  }
9181  }
9182  }
9183 }
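// Note on th_used_in_team as used by the two routines above: it acts as a
// small per-worker state machine for distributed-barrier resizing. 1 means the
// worker is in the team, 2 means it has been asked to leave (the worker moves
// itself to 0), and 3 means it has been asked to rejoin (the worker moves
// itself back to 1).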
9184 
9185 // Globals and functions for hidden helper task
9186 kmp_info_t **__kmp_hidden_helper_threads;
9187 kmp_info_t *__kmp_hidden_helper_main_thread;
9188 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9189 #if KMP_OS_LINUX
9190 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9191 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9192 #else
9193 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9194 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9195 #endif
9196 
9197 namespace {
9198 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9199 
9200 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9201  // This is an explicit synchronization of all hidden helper threads, in case
9202  // a regular thread pushes a hidden helper task to a hidden helper thread
9203  // that has not yet been awakened since the main thread released the helpers
9204  // after creating the team.
9205  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9206  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9207  __kmp_hidden_helper_threads_num)
9208  ;
9209 
9210  // If main thread, then wait for signal
9211  if (__kmpc_master(nullptr, *gtid)) {
9212  // First, unset the initial state and release the initial thread
9213  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9214  __kmp_hidden_helper_initz_release();
9215  __kmp_hidden_helper_main_thread_wait();
9216  // Now wake up all worker threads
9217  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9218  __kmp_hidden_helper_worker_thread_signal();
9219  }
9220  }
9221 }
9222 } // namespace
9223 
9224 void __kmp_hidden_helper_threads_initz_routine() {
9225  // Create a new root for hidden helper team/threads
9226  const int gtid = __kmp_register_root(TRUE);
9227  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9228  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9229  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9230  __kmp_hidden_helper_threads_num;
9231 
9232  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9233 
9234  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9235 
9236  // Set the initialization flag to FALSE
9237  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9238 
9239  __kmp_hidden_helper_threads_deinitz_release();
9240 }
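/* A sketch of the hidden helper startup sequence, as read from the code above:
     1. __kmp_hidden_helper_threads_initz_routine() registers a new root,
        requests __kmp_hidden_helper_threads_num threads, and forks the team
        with __kmp_hidden_helper_wrapper_fn as the microtask.
     2. Every helper increments __kmp_hit_hidden_helper_threads_num and spins
        until all helpers have checked in (see the comment in the wrapper).
     3. The helper that wins __kmpc_master() releases the initializing thread,
        waits for the wakeup signal, and then signals the worker helpers.
     4. After the fork call returns, the routine clears
        __kmp_init_hidden_helper and releases any thread waiting on
        deinitialization. */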
9241 
9242 /* Nesting Mode:
9243  Set via KMP_NESTING_MODE, which takes an integer.
9244  Note: we skip duplicate topology levels, and skip levels with only
9245  one entity.
9246  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9247  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9248  in the topology, and initializes the number of threads at each of those
9249  levels to the number of entities at each level, respectively, below the
9250  entity at the parent level.
9251  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9252  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9253  the user to turn nesting on explicitly. This is an even more experimental
9254  option within an already experimental feature, and may change or go away in
9255  the future.
9256 */
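/* Illustrative example (hypothetical machine with 2 sockets, 8 cores per
   socket and 2 hardware threads per core; the numbers are assumptions used
   only for illustration):
     KMP_NESTING_MODE=1 -> three nesting levels with 2, 8 and 2 threads
                           (socket, core, hw-thread), and nesting enabled to
                           that depth.
     KMP_NESTING_MODE=3 -> the same per-level thread counts are prepared, but
                           max-active-levels-var stays 1, so nesting has no
                           effect until the user enables it, e.g. with
                           omp_set_max_active_levels(). */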
9257 
9258 // Allocate space to store nesting levels
9259 void __kmp_init_nesting_mode() {
9260  int levels = KMP_HW_LAST;
9261  __kmp_nesting_mode_nlevels = levels;
9262  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9263  for (int i = 0; i < levels; ++i)
9264  __kmp_nesting_nth_level[i] = 0;
9265  if (__kmp_nested_nth.size < levels) {
9266  __kmp_nested_nth.nth =
9267  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9268  __kmp_nested_nth.size = levels;
9269  }
9270 }
9271 
9272 // Set # threads for the top levels of nesting; must be called after the topology is set
9273 void __kmp_set_nesting_mode_threads() {
9274  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9275 
9276  if (__kmp_nesting_mode == 1)
9277  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9278  else if (__kmp_nesting_mode > 1)
9279  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9280 
9281  if (__kmp_topology) { // use topology info
9282  int loc, hw_level;
9283  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9284  loc < __kmp_nesting_mode_nlevels;
9285  loc++, hw_level++) {
9286  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9287  if (__kmp_nesting_nth_level[loc] == 1)
9288  loc--;
9289  }
9290  // Make sure all cores are used
9291  if (__kmp_nesting_mode > 1 && loc > 1) {
9292  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9293  int num_cores = __kmp_topology->get_count(core_level);
9294  int upper_levels = 1;
9295  for (int level = 0; level < loc - 1; ++level)
9296  upper_levels *= __kmp_nesting_nth_level[level];
9297  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9298  __kmp_nesting_nth_level[loc - 1] =
9299  num_cores / __kmp_nesting_nth_level[loc - 2];
9300  }
9301  __kmp_nesting_mode_nlevels = loc;
9302  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9303  } else { // no topology info available; provide a reasonable guess
9304  if (__kmp_avail_proc >= 4) {
9305  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9306  __kmp_nesting_nth_level[1] = 2;
9307  __kmp_nesting_mode_nlevels = 2;
9308  } else {
9309  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9310  __kmp_nesting_mode_nlevels = 1;
9311  }
9312  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9313  }
9314  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9315  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9316  }
9317  set__nproc(thread, __kmp_nesting_nth_level[0]);
9318  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9319  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9320  if (get__max_active_levels(thread) > 1) {
9321  // if max levels was set, set nesting mode levels to same
9322  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9323  }
9324  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9325  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9326 }
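/* Worked example for the topology path above (hypothetical machine with one
   socket, 4 cores and 2 hardware threads per core, KMP_NESTING_MODE=1):
   get_ratio() yields 1, 4, 2 per level; the socket level has only one entity,
   so it is skipped, giving __kmp_nesting_nth_level = {4, 2} and
   __kmp_nesting_mode_nlevels = 2. The encountering thread's nproc is set to 4
   for the outermost level, and since the mode is 1, max-active-levels is
   raised to 2. */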
9327 
9328 // Empty symbols to export (see exports_so.txt) when feature is disabled
9329 extern "C" {
9330 #if !KMP_STATS_ENABLED
9331 void __kmp_reset_stats() {}
9332 #endif
9333 #if !USE_DEBUGGER
9334 int __kmp_omp_debug_struct_info = FALSE;
9335 int __kmp_debugging = FALSE;
9336 #endif
9337 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9338 void __kmp_itt_fini_ittlib() {}
9339 void __kmp_itt_init_ittlib() {}
9340 #endif
9341 }
9342 
9343 // end of file