1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #if ENABLE_LIBOMPTARGET
25 static void (*tgt_target_nowait_query)(void **);
26 
27 void __kmp_init_target_task() {
28  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
29 }
30 #endif
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34  kmp_info_t *this_thr);
35 static void __kmp_alloc_task_deque(kmp_info_t *thread,
36  kmp_thread_data_t *thread_data);
37 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38  kmp_task_team_t *task_team);
39 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40 #if OMPX_TASKGRAPH
41 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42 int __kmp_taskloop_task(int gtid, void *ptask);
43 #endif
44 
45 #ifdef BUILD_TIED_TASK_STACK
46 
47 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
48 // from top to bottom
49 //
50 // gtid: global thread identifier for thread containing stack
51 // thread_data: thread data for task team thread containing stack
52 // threshold: value above which the trace statement triggers
53 // location: string identifying call site of this function (for trace)
54 static void __kmp_trace_task_stack(kmp_int32 gtid,
55  kmp_thread_data_t *thread_data,
56  int threshold, char *location) {
57  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
58  kmp_taskdata_t **stack_top = task_stack->ts_top;
59  kmp_int32 entries = task_stack->ts_entries;
60  kmp_taskdata_t *tied_task;
61 
62  KA_TRACE(
63  threshold,
64  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
65  "first_block = %p, stack_top = %p \n",
66  location, gtid, entries, task_stack->ts_first_block, stack_top));
67 
68  KMP_DEBUG_ASSERT(stack_top != NULL);
69  KMP_DEBUG_ASSERT(entries > 0);
70 
71  while (entries != 0) {
72  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
73  // fix up ts_top if we need to pop from previous block
74  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
75  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
76 
77  stack_block = stack_block->sb_prev;
78  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
79  }
80 
81  // finish bookkeeping
82  stack_top--;
83  entries--;
84 
85  tied_task = *stack_top;
86 
87  KMP_DEBUG_ASSERT(tied_task != NULL);
88  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
89 
90  KA_TRACE(threshold,
91  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
92  "stack_top=%p, tied_task=%p\n",
93  location, gtid, entries, stack_top, tied_task));
94  }
95  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
96 
97  KA_TRACE(threshold,
98  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
99  location, gtid));
100 }
101 
102 // __kmp_init_task_stack: initialize the task stack for the first time
103 // after a thread_data structure is created.
104 // It should not be necessary to do this again (assuming the stack works).
105 //
106 // gtid: global thread identifier of calling thread
107 // thread_data: thread data for task team thread containing stack
108 static void __kmp_init_task_stack(kmp_int32 gtid,
109  kmp_thread_data_t *thread_data) {
110  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
111  kmp_stack_block_t *first_block;
112 
113  // set up the first block of the stack
114  first_block = &task_stack->ts_first_block;
115  task_stack->ts_top = (kmp_taskdata_t **)first_block;
116  memset((void *)first_block, '\0',
117  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
118 
119  // initialize the stack to be empty
120  task_stack->ts_entries = TASK_STACK_EMPTY;
121  first_block->sb_next = NULL;
122  first_block->sb_prev = NULL;
123 }
124 
125 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 // gtid: global thread identifier for calling thread
128 // thread_data: thread info for thread containing stack
129 static void __kmp_free_task_stack(kmp_int32 gtid,
130  kmp_thread_data_t *thread_data) {
131  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
132  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
133 
134  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
135  // free from the second block of the stack
136  while (stack_block != NULL) {
137  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
138 
139  stack_block->sb_next = NULL;
140  stack_block->sb_prev = NULL;
141  if (stack_block != &task_stack->ts_first_block) {
142  __kmp_thread_free(__kmp_threads[gtid], // owning thread, looked up from gtid
143  stack_block); // free the block, if not the first
144  }
145  stack_block = next_block;
146  }
147  // initialize the stack to be empty
148  task_stack->ts_entries = 0;
149  task_stack->ts_top = NULL;
150 }
151 
152 // __kmp_push_task_stack: Push the tied task onto the task stack.
153 // Grow the stack if necessary by allocating another block.
154 //
155 // gtid: global thread identifier for calling thread
156 // thread: thread info for thread containing stack
157 // tied_task: the task to push on the stack
158 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
159  kmp_taskdata_t *tied_task) {
160  // GEH - need to consider what to do if tt_threads_data not allocated yet
161  kmp_thread_data_t *thread_data =
162  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
163  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
164 
165  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
166  return; // Don't push anything on stack if team or team tasks are serialized
167  }
168 
169  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
170  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
171 
172  KA_TRACE(20,
173  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
174  gtid, thread, tied_task));
175  // Store entry
176  *(task_stack->ts_top) = tied_task;
177 
178  // Do bookkeeping for next push
179  task_stack->ts_top++;
180  task_stack->ts_entries++;
181 
182  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
183  // Find beginning of this task block
184  kmp_stack_block_t *stack_block =
185  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
186 
187  // Check if we already have a block
188  if (stack_block->sb_next !=
189  NULL) { // reset ts_top to beginning of next block
190  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
191  } else { // Alloc new block and link it up
192  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
193  thread, sizeof(kmp_stack_block_t));
194 
195  task_stack->ts_top = &new_block->sb_block[0];
196  stack_block->sb_next = new_block;
197  new_block->sb_prev = stack_block;
198  new_block->sb_next = NULL;
199 
200  KA_TRACE(
201  30,
202  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
203  gtid, tied_task, new_block));
204  }
205  }
206  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207  tied_task));
208 }
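
// Worked example of the block bookkeeping above (a sketch, assuming
// TASK_STACK_BLOCK_SIZE is a power of two, e.g. 32, and
// TASK_STACK_INDEX_MASK == TASK_STACK_BLOCK_SIZE - 1):
//   - pushing the 32nd tied task makes ts_entries == 32, so
//     (ts_entries & TASK_STACK_INDEX_MASK) == 0 and ts_top points one past
//     sb_block[31] of the current block;
//   - ts_top - TASK_STACK_BLOCK_SIZE recovers the start of that block, whose
//     sb_next is either reused or freshly allocated;
//   - ts_top is then rebased to &next_block->sb_block[0], so the 33rd push
//     lands in the new block.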
209 
210 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
211 // the task, just check to make sure it matches the ending task passed in.
212 //
213 // gtid: global thread identifier for the calling thread
214 // thread: thread info structure containing stack
215 // tied_task: the task popped off the stack
216 // ending_task: the task that is ending (should match popped task)
217 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
218  kmp_taskdata_t *ending_task) {
219  // GEH - need to consider what to do if tt_threads_data not allocated yet
220  kmp_thread_data_t *thread_data =
221  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
222  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
223  kmp_taskdata_t *tied_task;
224 
225  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
226  // Don't pop anything from stack if team or team tasks are serialized
227  return;
228  }
229 
230  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
231  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
232 
233  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
234  thread));
235 
236  // fix up ts_top if we need to pop from previous block
237  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
238  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
239 
240  stack_block = stack_block->sb_prev;
241  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
242  }
243 
244  // finish bookkeeping
245  task_stack->ts_top--;
246  task_stack->ts_entries--;
247 
248  tied_task = *(task_stack->ts_top);
249 
250  KMP_DEBUG_ASSERT(tied_task != NULL);
251  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
252  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
253 
254  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
255  tied_task));
256  return;
257 }
258 #endif /* BUILD_TIED_TASK_STACK */
259 
260 // returns 1 if new task is allowed to execute, 0 otherwise
261 // checks Task Scheduling constraint (if requested) and
262 // mutexinoutset dependencies if any
263 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
264  const kmp_taskdata_t *tasknew,
265  const kmp_taskdata_t *taskcurr) {
266  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
267  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
268  // only descendant of all deferred tied tasks can be scheduled, checking
269  // the last one is enough, as it in turn is the descendant of all others
270  kmp_taskdata_t *current = taskcurr->td_last_tied;
271  KMP_DEBUG_ASSERT(current != NULL);
272  // check if the task is not suspended on barrier
273  if (current->td_flags.tasktype == TASK_EXPLICIT ||
274  current->td_taskwait_thread > 0) { // <= 0 on barrier
275  kmp_int32 level = current->td_level;
276  kmp_taskdata_t *parent = tasknew->td_parent;
277  while (parent != current && parent->td_level > level) {
278  // check generation up to the level of the current task
279  parent = parent->td_parent;
280  KMP_DEBUG_ASSERT(parent != NULL);
281  }
282  if (parent != current)
283  return false;
284  }
285  }
286  // Check mutexinoutset dependencies, acquire locks
287  kmp_depnode_t *node = tasknew->td_depnode;
288 #if OMPX_TASKGRAPH
289  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
290 #else
291  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
292 #endif
293  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
294  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
295  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
296  continue;
297  // could not get the lock, release previous locks
298  for (int j = i - 1; j >= 0; --j)
299  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
300  return false;
301  }
302  // negative num_locks means all locks acquired successfully
303  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
304  }
305  return true;
306 }
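
// Illustration of the TSC check above (a sketch with made-up task names):
//
//   implicit task (level 0)
//     `-- T1 tied, deferred (level 1)   <- taskcurr->td_last_tied == T1
//           `-- T2 (level 2)
//                 `-- tasknew->td_parent (level 3)
//
// The while loop climbs tasknew's ancestry while td_level > T1->td_level; it
// stops either at T1 itself (tasknew is a descendant of every deferred tied
// task, so scheduling it is allowed) or at some other task at T1's level or
// shallower (not a descendant of T1, so false is returned). For mutexinoutset
// dependences, the locks are acquired in array order; on the first failure all
// previously acquired locks are released and false is returned, and a fully
// successful pass is recorded by negating mtx_num_locks.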
307 
308 // __kmp_realloc_task_deque:
309 // Re-allocates a task deque for a particular thread, copies the content from
310 // the old deque and adjusts the necessary data structures relating to the
311 // deque. This operation must be done with the deque_lock being held
312 static void __kmp_realloc_task_deque(kmp_info_t *thread,
313  kmp_thread_data_t *thread_data) {
314  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
315  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
316  kmp_int32 new_size = 2 * size;
317 
318  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
319  "%d] for thread_data %p\n",
320  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
321 
322  kmp_taskdata_t **new_deque =
323  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
324 
325  int i, j;
326  for (i = thread_data->td.td_deque_head, j = 0; j < size;
327  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
328  new_deque[j] = thread_data->td.td_deque[i];
329 
330  __kmp_free(thread_data->td.td_deque);
331 
332  thread_data->td.td_deque_head = 0;
333  thread_data->td.td_deque_tail = size;
334  thread_data->td.td_deque = new_deque;
335  thread_data->td.td_deque_size = new_size;
336 }
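
// Numeric sketch of the reallocation above (assuming deque sizes are powers
// of two, so TASK_DEQUE_MASK == size - 1, e.g. 256):
//   before: size = 256, ntasks = 256 (full), head == tail == H
//   copy:   new_deque[0..255] receives the old entries starting at index H,
//           wrapping with (i + 1) & 255
//   after:  head = 0, tail = 256, size = 512, mask = 511
// The deque lock must be held by the caller for the duration of the copy.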
337 
338 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
339  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
340  kmp_thread_data_t *thread_data = &l->td;
341  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
342  thread_data->td.td_deque_last_stolen = -1;
343  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
344  "for thread_data %p\n",
345  __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
346  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
347  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
348  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
349  return l;
350 }
351 
352 // The function finds the deque of priority tasks with given priority, or
353 // allocates a new deque and puts it into a sorted (high -> low) list of deques.
354 // Deques of non-default priority tasks are shared between all threads in team,
355 // as opposed to per-thread deques of tasks with default priority.
356 // The function is called under the lock task_team->tt.tt_task_pri_lock.
357 static kmp_thread_data_t *
358 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
359  kmp_thread_data_t *thread_data;
360  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
361  if (lst->priority == pri) {
362  // Found queue of tasks with given priority.
363  thread_data = &lst->td;
364  } else if (lst->priority < pri) {
365  // All current priority queues contain tasks with lower priority.
366  // Allocate new one for given priority tasks.
367  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
368  thread_data = &list->td;
369  list->priority = pri;
370  list->next = lst;
371  task_team->tt.tt_task_pri_list = list;
372  } else { // task_team->tt.tt_task_pri_list->priority > pri
373  kmp_task_pri_t *next_queue = lst->next;
374  while (next_queue && next_queue->priority > pri) {
375  lst = next_queue;
376  next_queue = lst->next;
377  }
378  // lst->priority > pri && (next == NULL || pri >= next->priority)
379  if (next_queue == NULL) {
380  // No queue with pri priority, need to allocate new one.
381  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
382  thread_data = &list->td;
383  list->priority = pri;
384  list->next = NULL;
385  lst->next = list;
386  } else if (next_queue->priority == pri) {
387  // Found queue of tasks with given priority.
388  thread_data = &next_queue->td;
389  } else { // lst->priority > pri > next->priority
390  // insert the newly allocated queue between existing queues
391  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
392  thread_data = &list->td;
393  list->priority = pri;
394  list->next = next_queue;
395  lst->next = list;
396  }
397  }
398  return thread_data;
399 }
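
// Example walk of the sorted (high -> low) list above, with hypothetical
// priorities; suppose the list initially holds deques for 10 -> 4 -> 1:
//   pri = 12 : 10 < 12, so a new deque becomes the new list head
//              (12 -> 10 -> 4 -> 1)
//   pri = 4  : the walk stops at the existing node, which is reused
//   pri = 7  : 10 > 7 and the next node has priority 4 < 7, so a new deque is
//              linked in between (10 -> 7 -> 4 -> 1)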
400 
401 // __kmp_push_priority_task: Add a task to the team's priority task deque
402 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
403  kmp_taskdata_t *taskdata,
404  kmp_task_team_t *task_team,
405  kmp_int32 pri) {
406  kmp_thread_data_t *thread_data = NULL;
407  KA_TRACE(20,
408  ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
409  gtid, taskdata, pri));
410 
411  // Find task queue specific to priority value
412  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
413  if (UNLIKELY(lst == NULL)) {
414  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
415  if (task_team->tt.tt_task_pri_list == NULL) {
416  // List of queues is still empty, allocate one.
417  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
418  thread_data = &list->td;
419  list->priority = pri;
420  list->next = NULL;
421  task_team->tt.tt_task_pri_list = list;
422  } else {
423  // Other thread initialized a queue. Check if it fits and get thread_data.
424  thread_data = __kmp_get_priority_deque_data(task_team, pri);
425  }
426  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
427  } else {
428  if (lst->priority == pri) {
429  // Found queue of tasks with given priority.
430  thread_data = &lst->td;
431  } else {
432  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
433  thread_data = __kmp_get_priority_deque_data(task_team, pri);
434  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
435  }
436  }
437  KMP_DEBUG_ASSERT(thread_data);
438 
439  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
440  // Check if deque is full
441  if (TCR_4(thread_data->td.td_deque_ntasks) >=
442  TASK_DEQUE_SIZE(thread_data->td)) {
443  if (__kmp_enable_task_throttling &&
444  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
445  thread->th.th_current_task)) {
446  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
447  KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
448  "TASK_NOT_PUSHED for task %p\n",
449  gtid, taskdata));
450  return TASK_NOT_PUSHED;
451  } else {
452  // expand deque to push the task which is not allowed to execute
453  __kmp_realloc_task_deque(thread, thread_data);
454  }
455  }
456  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
457  TASK_DEQUE_SIZE(thread_data->td));
458  // Push taskdata.
459  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
460  // Wrap index.
461  thread_data->td.td_deque_tail =
462  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
463  TCW_4(thread_data->td.td_deque_ntasks,
464  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
465  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
466  KMP_FSYNC_RELEASING(taskdata); // releasing child
467  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
468  "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
469  gtid, taskdata, thread_data->td.td_deque_ntasks,
470  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
471  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
472  task_team->tt.tt_num_task_pri++; // atomic inc
473  return TASK_SUCCESSFULLY_PUSHED;
474 }
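
// Note on how tasks reach this path: __kmp_push_task below clamps the user
// priority with KMP_MIN(task->data2.priority, __kmp_max_task_priority) before
// calling here, so with a hypothetical OMP_MAX_TASK_PRIORITY=5 a task created
// with priority(9) is queued under pri == 5. Unlike the per-thread deques,
// these priority deques are shared by the whole team, which is why every push
// takes the deque lock and bumps tt_num_task_pri atomically.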
475 
476 // __kmp_push_task: Add a task to the thread's deque
477 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
478  kmp_info_t *thread = __kmp_threads[gtid];
479  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
480 
481  // If we encounter a hidden helper task, and the current thread is not a
482  // hidden helper thread, we have to give the task to any hidden helper thread
483  // starting from its shadow one.
484  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
485  !KMP_HIDDEN_HELPER_THREAD(gtid))) {
486  kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
487  __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
488  // Signal the hidden helper threads.
489  __kmp_hidden_helper_worker_thread_signal();
490  return TASK_SUCCESSFULLY_PUSHED;
491  }
492 
493  kmp_task_team_t *task_team = thread->th.th_task_team;
494  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
495  kmp_thread_data_t *thread_data;
496 
497  KA_TRACE(20,
498  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
499 
500  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
501  // untied task needs to increment counter so that the task structure is not
502  // freed prematurely
503  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
504  KMP_DEBUG_USE_VAR(counter);
505  KA_TRACE(
506  20,
507  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
508  gtid, counter, taskdata));
509  }
510 
511  // The first check avoids building task_team thread data if serialized
512  if (UNLIKELY(taskdata->td_flags.task_serial)) {
513  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
514  "TASK_NOT_PUSHED for task %p\n",
515  gtid, taskdata));
516  return TASK_NOT_PUSHED;
517  }
518 
519  // Now that serialized tasks have returned, we can assume that we are not in
520  // immediate exec mode
521  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
522  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
523  __kmp_enable_tasking(task_team, thread);
524  }
525  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
526  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
527 
528  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
529  __kmp_max_task_priority > 0) {
530  int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
531  return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
532  }
533 
534  // Find tasking deque specific to encountering thread
535  thread_data = &task_team->tt.tt_threads_data[tid];
536 
537  // No lock needed since only owner can allocate. If the task is hidden_helper,
538  // we don't need it either because we have initialized the deque for hidden
539  // helper thread data.
540  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
541  __kmp_alloc_task_deque(thread, thread_data);
542  }
543 
544  int locked = 0;
545  // Check if deque is full
546  if (TCR_4(thread_data->td.td_deque_ntasks) >=
547  TASK_DEQUE_SIZE(thread_data->td)) {
548  if (__kmp_enable_task_throttling &&
549  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
550  thread->th.th_current_task)) {
551  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
552  "TASK_NOT_PUSHED for task %p\n",
553  gtid, taskdata));
554  return TASK_NOT_PUSHED;
555  } else {
556  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
557  locked = 1;
558  if (TCR_4(thread_data->td.td_deque_ntasks) >=
559  TASK_DEQUE_SIZE(thread_data->td)) {
560  // expand deque to push the task which is not allowed to execute
561  __kmp_realloc_task_deque(thread, thread_data);
562  }
563  }
564  }
565  // Lock the deque for the task push operation
566  if (!locked) {
567  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
568  // Need to recheck as we can get a proxy task from thread outside of OpenMP
569  if (TCR_4(thread_data->td.td_deque_ntasks) >=
570  TASK_DEQUE_SIZE(thread_data->td)) {
571  if (__kmp_enable_task_throttling &&
572  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
573  thread->th.th_current_task)) {
574  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
575  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
576  "returning TASK_NOT_PUSHED for task %p\n",
577  gtid, taskdata));
578  return TASK_NOT_PUSHED;
579  } else {
580  // expand deque to push the task which is not allowed to execute
581  __kmp_realloc_task_deque(thread, thread_data);
582  }
583  }
584  }
585  // Must have room since no thread can add tasks but calling thread
586  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
587  TASK_DEQUE_SIZE(thread_data->td));
588 
589  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
590  taskdata; // Push taskdata
591  // Wrap index.
592  thread_data->td.td_deque_tail =
593  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
594  TCW_4(thread_data->td.td_deque_ntasks,
595  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
596  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
597  KMP_FSYNC_RELEASING(taskdata); // releasing child
598  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
599  "task=%p ntasks=%d head=%u tail=%u\n",
600  gtid, taskdata, thread_data->td.td_deque_ntasks,
601  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
602 
603  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
604 
605  return TASK_SUCCESSFULLY_PUSHED;
606 }
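
// Index arithmetic used by the push above (a sketch, assuming power-of-two
// deque sizes so that TASK_DEQUE_MASK == size - 1): with size 256 and
// tail == 255, the task is stored in slot 255 and the new tail becomes
// (255 + 1) & 255 == 0, i.e. the tail wraps around while head stays put.
// ntasks is the authoritative fill count, which is why it is re-checked after
// the lock is taken (a proxy task may have been pushed from outside OpenMP).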
607 
608 // __kmp_pop_current_task_from_thread: restore the current task of the given
609 // thread to its parent when a team ends
610 //
611 // this_thr: thread structure to set current_task in.
612 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
613  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
614  "this_thread=%p, curtask=%p, "
615  "curtask_parent=%p\n",
616  0, this_thr, this_thr->th.th_current_task,
617  this_thr->th.th_current_task->td_parent));
618 
619  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
620 
621  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
622  "this_thread=%p, curtask=%p, "
623  "curtask_parent=%p\n",
624  0, this_thr, this_thr->th.th_current_task,
625  this_thr->th.th_current_task->td_parent));
626 }
627 
628 // __kmp_push_current_task_to_thread: set up current task in called thread for a
629 // new team
630 //
631 // this_thr: thread structure to set up
632 // team: team for implicit task data
633 // tid: thread within team to set up
634 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
635  int tid) {
636  // the current task of the thread becomes the parent of the newly created
637  // implicit tasks of the new team
638  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
639  "curtask=%p "
640  "parent_task=%p\n",
641  tid, this_thr, this_thr->th.th_current_task,
642  team->t.t_implicit_task_taskdata[tid].td_parent));
643 
644  KMP_DEBUG_ASSERT(this_thr != NULL);
645 
646  if (tid == 0) {
647  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
648  team->t.t_implicit_task_taskdata[0].td_parent =
649  this_thr->th.th_current_task;
650  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
651  }
652  } else {
653  team->t.t_implicit_task_taskdata[tid].td_parent =
654  team->t.t_implicit_task_taskdata[0].td_parent;
655  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
656  }
657 
658  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
659  "curtask=%p "
660  "parent_task=%p\n",
661  tid, this_thr, this_thr->th.th_current_task,
662  team->t.t_implicit_task_taskdata[tid].td_parent));
663 }
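
// Resulting parent links for a new team of, say, 4 threads (sketch):
//   implicit task 0     : td_parent = the encountering thread's previous
//                         current task (saved so it can be restored later)
//   implicit tasks 1..3 : td_parent = the same parent as implicit task 0
// so all implicit tasks of the team share one parent, and
// __kmp_pop_current_task_from_thread undoes the tid == 0 case when the team
// ends.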
664 
665 // __kmp_task_start: bookkeeping for a task starting execution
666 //
667 // GTID: global thread id of calling thread
668 // task: task starting execution
669 // current_task: task suspending
670 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
671  kmp_taskdata_t *current_task) {
672  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
673  kmp_info_t *thread = __kmp_threads[gtid];
674 
675  KA_TRACE(10,
676  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
677  gtid, taskdata, current_task));
678 
679  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
680 
681  // mark currently executing task as suspended
682  // TODO: GEH - make sure root team implicit task is initialized properly.
683  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
684  current_task->td_flags.executing = 0;
685 
686 // Add task to stack if tied
687 #ifdef BUILD_TIED_TASK_STACK
688  if (taskdata->td_flags.tiedness == TASK_TIED) {
689  __kmp_push_task_stack(gtid, thread, taskdata);
690  }
691 #endif /* BUILD_TIED_TASK_STACK */
692 
693  // mark starting task as executing and as current task
694  thread->th.th_current_task = taskdata;
695 
696  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
697  taskdata->td_flags.tiedness == TASK_UNTIED);
698  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
699  taskdata->td_flags.tiedness == TASK_UNTIED);
700  taskdata->td_flags.started = 1;
701  taskdata->td_flags.executing = 1;
702  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
703  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
704 
705  // GEH TODO: shouldn't we pass some sort of location identifier here?
706  // APT: yes, we will pass location here.
707  // need to store current thread state (in a thread or taskdata structure)
708  // before setting work_state, otherwise wrong state is set after end of task
709 
710  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
711 
712  return;
713 }
714 
715 #if OMPT_SUPPORT
716 //------------------------------------------------------------------------------
717 // __ompt_task_init:
718 // Initialize OMPT fields maintained by a task. This will only be called after
719 // ompt_start_tool, so we already know whether ompt is enabled or not.
720 
721 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
722  // The calls to __ompt_task_init already have the ompt_enabled condition.
723  task->ompt_task_info.task_data.value = 0;
724  task->ompt_task_info.frame.exit_frame = ompt_data_none;
725  task->ompt_task_info.frame.enter_frame = ompt_data_none;
726  task->ompt_task_info.frame.exit_frame_flags =
727  ompt_frame_runtime | ompt_frame_framepointer;
728  task->ompt_task_info.frame.enter_frame_flags =
729  ompt_frame_runtime | ompt_frame_framepointer;
730  task->ompt_task_info.dispatch_chunk.start = 0;
731  task->ompt_task_info.dispatch_chunk.iterations = 0;
732 }
733 
734 // __ompt_task_start:
735 // Build and trigger task-begin event
736 static inline void __ompt_task_start(kmp_task_t *task,
737  kmp_taskdata_t *current_task,
738  kmp_int32 gtid) {
739  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
740  ompt_task_status_t status = ompt_task_switch;
741  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
742  status = ompt_task_yield;
743  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
744  }
745  /* let OMPT know that we're about to run this task */
746  if (ompt_enabled.ompt_callback_task_schedule) {
747  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
748  &(current_task->ompt_task_info.task_data), status,
749  &(taskdata->ompt_task_info.task_data));
750  }
751  taskdata->ompt_task_info.scheduling_parent = current_task;
752 }
753 
754 // __ompt_task_finish:
755 // Build and trigger final task-schedule event
756 static inline void __ompt_task_finish(kmp_task_t *task,
757  kmp_taskdata_t *resumed_task,
758  ompt_task_status_t status) {
759  if (ompt_enabled.ompt_callback_task_schedule) {
760  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
761  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
762  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
763  status = ompt_task_cancel;
764  }
765 
766  /* let OMPT know that we're returning to the callee task */
767  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
768  &(taskdata->ompt_task_info.task_data), status,
769  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
770  }
771 }
772 #endif
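
// Sequence of OMPT events a tool would typically observe for one explicit
// task with the helpers above (a sketch; the exact set depends on which
// callbacks the tool registered):
//   ompt_callback_task_create   - at the task-generating construct
//   ompt_callback_task_schedule - prior = creator/other task, status =
//                                 ompt_task_switch (or ompt_task_yield),
//                                 next = this task        [__ompt_task_start]
//   ompt_callback_task_schedule - prior = this task, status =
//                                 ompt_task_complete / ompt_task_detach /
//                                 ompt_task_cancel        [__ompt_task_finish]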
773 
774 template <bool ompt>
775 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
776  kmp_task_t *task,
777  void *frame_address,
778  void *return_address) {
779  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
780  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
781 
782  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
783  "current_task=%p\n",
784  gtid, loc_ref, taskdata, current_task));
785 
786  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
787  // untied task needs to increment counter so that the task structure is not
788  // freed prematurely
789  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
790  KMP_DEBUG_USE_VAR(counter);
791  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
792  "incremented for task %p\n",
793  gtid, counter, taskdata));
794  }
795 
796  taskdata->td_flags.task_serial =
797  1; // Execute this task immediately, not deferred.
798  __kmp_task_start(gtid, task, current_task);
799 
800 #if OMPT_SUPPORT
801  if (ompt) {
802  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
803  current_task->ompt_task_info.frame.enter_frame.ptr =
804  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
805  current_task->ompt_task_info.frame.enter_frame_flags =
806  taskdata->ompt_task_info.frame.exit_frame_flags =
807  ompt_frame_application | ompt_frame_framepointer;
808  }
809  if (ompt_enabled.ompt_callback_task_create) {
810  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
811  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
812  &(parent_info->task_data), &(parent_info->frame),
813  &(taskdata->ompt_task_info.task_data),
814  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
815  return_address);
816  }
817  __ompt_task_start(task, current_task, gtid);
818  }
819 #endif // OMPT_SUPPORT
820 
821  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
822  loc_ref, taskdata));
823 }
824 
825 #if OMPT_SUPPORT
826 OMPT_NOINLINE
827 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
828  kmp_task_t *task,
829  void *frame_address,
830  void *return_address) {
831  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
832  return_address);
833 }
834 #endif // OMPT_SUPPORT
835 
836 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
837 // execution
838 //
839 // loc_ref: source location information; points to beginning of task block.
840 // gtid: global thread number.
841 // task: task thunk for the started task.
842 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
843  kmp_task_t *task) {
844 #if OMPT_SUPPORT
845  if (UNLIKELY(ompt_enabled.enabled)) {
846  OMPT_STORE_RETURN_ADDRESS(gtid);
847  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
848  OMPT_GET_FRAME_ADDRESS(1),
849  OMPT_LOAD_RETURN_ADDRESS(gtid));
850  return;
851  }
852 #endif
853  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
854 }
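
// Rough sketch of the call sequence a compiler may generate for
// "#pragma omp task if(0)" (the generated entry name is illustrative only):
//   kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, flags, sz_task,
//                                         sz_shareds, &.omp_task_entry.);
//   ... copy firstprivate/shared data into t ...
//   __kmpc_omp_task_begin_if0(loc, gtid, t);
//   .omp_task_entry.(gtid, t);          // task body runs inline, undeferred
//   __kmpc_omp_task_complete_if0(loc, gtid, t);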
855 
856 #ifdef TASK_UNUSED
857 // __kmpc_omp_task_begin: report that a given task has started execution
858 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
859 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
860  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
861 
862  KA_TRACE(
863  10,
864  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
865  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
866 
867  __kmp_task_start(gtid, task, current_task);
868 
869  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
870  loc_ref, KMP_TASK_TO_TASKDATA(task)));
871  return;
872 }
873 #endif // TASK_UNUSED
874 
875 // __kmp_free_task: free the current task space and the space for shareds
876 //
877 // gtid: Global thread ID of calling thread
878 // taskdata: task to free
879 // thread: thread data structure of caller
880 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
881  kmp_info_t *thread) {
882  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
883  taskdata));
884 
885  // Check to make sure all flags and counters have the correct values
886  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
887  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
888  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
889  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
890  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
891  taskdata->td_flags.task_serial == 1);
892  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
893  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
894  // Clear data to not be re-used later by mistake.
895  task->data1.destructors = NULL;
896  task->data2.priority = 0;
897 
898  taskdata->td_flags.freed = 1;
899 #if OMPX_TASKGRAPH
900  // do not free tasks in taskgraph
901  if (!taskdata->is_taskgraph) {
902 #endif
903 // deallocate the taskdata and shared variable blocks associated with this task
904 #if USE_FAST_MEMORY
905  __kmp_fast_free(thread, taskdata);
906 #else /* ! USE_FAST_MEMORY */
907  __kmp_thread_free(thread, taskdata);
908 #endif
909 #if OMPX_TASKGRAPH
910  } else {
911  taskdata->td_flags.complete = 0;
912  taskdata->td_flags.started = 0;
913  taskdata->td_flags.freed = 0;
914  taskdata->td_flags.executing = 0;
915  taskdata->td_flags.task_serial =
916  (taskdata->td_parent->td_flags.final ||
917  taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
918 
919  // taskdata->td_allow_completion_event.pending_events_count = 1;
920  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
921  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
922  // start at one because counts current task and children
923  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
924  }
925 #endif
926 
927  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
928 }
929 
930 // __kmp_free_task_and_ancestors: free the current task and ancestors without
931 // children
932 //
933 // gtid: Global thread ID of calling thread
934 // taskdata: task to free
935 // thread: thread data structure of caller
936 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
937  kmp_taskdata_t *taskdata,
938  kmp_info_t *thread) {
939  // Proxy tasks must always be allowed to free their parents
940  // because they can be run in background even in serial mode.
941  kmp_int32 team_serial =
942  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
943  !taskdata->td_flags.proxy;
944  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
945 
946  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
947  KMP_DEBUG_ASSERT(children >= 0);
948 
949  // Now, go up the ancestor tree to see if any ancestors can now be freed.
950  while (children == 0) {
951  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
952 
953  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
954  "and freeing itself\n",
955  gtid, taskdata));
956 
957  // --- Deallocate my ancestor task ---
958  __kmp_free_task(gtid, taskdata, thread);
959 
960  taskdata = parent_taskdata;
961 
962  if (team_serial)
963  return;
964  // Stop checking ancestors at implicit task instead of walking up ancestor
965  // tree to avoid premature deallocation of ancestors.
966  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
967  if (taskdata->td_dephash) { // do we need to cleanup dephash?
968  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
969  kmp_tasking_flags_t flags_old = taskdata->td_flags;
970  if (children == 0 && flags_old.complete == 1) {
971  kmp_tasking_flags_t flags_new = flags_old;
972  flags_new.complete = 0;
973  if (KMP_COMPARE_AND_STORE_ACQ32(
974  RCAST(kmp_int32 *, &taskdata->td_flags),
975  *RCAST(kmp_int32 *, &flags_old),
976  *RCAST(kmp_int32 *, &flags_new))) {
977  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
978  "dephash of implicit task %p\n",
979  gtid, taskdata));
980  // cleanup dephash of finished implicit task
981  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
982  }
983  }
984  }
985  return;
986  }
987  // Predecrement simulated by "- 1" calculation
988  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
989  KMP_DEBUG_ASSERT(children >= 0);
990  }
991 
992  KA_TRACE(
993  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
994  "not freeing it yet\n",
995  gtid, taskdata, children));
996 }
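
// Counting convention used above: td_allocated_child_tasks starts at 1 (the
// task itself) and each allocated child adds 1; KMP_ATOMIC_DEC returns the
// value before the decrement, so "KMP_ATOMIC_DEC(...) - 1" is the new count.
// Example: a task with 2 outstanding children holds the value 3; freeing each
// child decrements it to 2 and then 1, and the decrement done here for the
// task itself reaches 0, at which point the task may be freed and the walk
// moves on to its parent.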
997 
998 // Only need to keep track of child task counts if any of the following:
999 // 1. team parallel and tasking not serialized;
1000 // 2. it is a proxy or detachable or hidden helper task
1001 // 3. the children counter of its parent task is greater than 0.
1002 // The reason for the 3rd one is a serialized team that encountered a detached
1003 // or hidden helper task T. In this case, the execution of T is still deferred,
1004 // and it is also possible that a regular task depends on T; if we don't track
1005 // the children, task synchronization will be broken.
1006 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
1007  kmp_tasking_flags_t flags = taskdata->td_flags;
1008  bool ret = !(flags.team_serial || flags.tasking_ser);
1009  ret = ret || flags.proxy == TASK_PROXY ||
1010  flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
1011  ret = ret ||
1012  KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
1013 #if OMPX_TASKGRAPH
1014  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
1015  ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
1016 #endif
1017  return ret;
1018 }
1019 
1020 // __kmp_task_finish: bookkeeping to do when a task finishes execution
1021 //
1022 // gtid: global thread ID for calling thread
1023 // task: task to be finished
1024 // resumed_task: task to be resumed. (may be NULL if task is serialized)
1025 //
1026 // template<ompt>: effectively ompt_enabled.enabled!=0
1027 // the version with ompt=false is inlined, allowing the compiler to optimize
1028 // away all OMPT code in this case
1029 template <bool ompt>
1030 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
1031  kmp_taskdata_t *resumed_task) {
1032  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1033  kmp_info_t *thread = __kmp_threads[gtid];
1034  kmp_task_team_t *task_team =
1035  thread->th.th_task_team; // might be NULL for serial teams...
1036 #if OMPX_TASKGRAPH
1037  // saved to avoid a segfault when taskdata->td_flags is accessed after the task is freed (vanilla taskloop case)
1038  bool is_taskgraph;
1039 #endif
1040 #if KMP_DEBUG
1041  kmp_int32 children = 0;
1042 #endif
1043  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
1044  "task %p\n",
1045  gtid, taskdata, resumed_task));
1046 
1047  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1048 
1049 #if OMPX_TASKGRAPH
1050  is_taskgraph = taskdata->is_taskgraph;
1051 #endif
1052 
1053 // Pop task from stack if tied
1054 #ifdef BUILD_TIED_TASK_STACK
1055  if (taskdata->td_flags.tiedness == TASK_TIED) {
1056  __kmp_pop_task_stack(gtid, thread, taskdata);
1057  }
1058 #endif /* BUILD_TIED_TASK_STACK */
1059 
1060  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1061  // untied task needs to check the counter so that the task structure is not
1062  // freed prematurely
1063  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1064  KA_TRACE(
1065  20,
1066  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1067  gtid, counter, taskdata));
1068  if (counter > 0) {
1069  // untied task is not done, to be continued possibly by other thread, do
1070  // not free it now
1071  if (resumed_task == NULL) {
1072  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1073  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1074  // task is the parent
1075  }
1076  thread->th.th_current_task = resumed_task; // restore current_task
1077  resumed_task->td_flags.executing = 1; // resume previous task
1078  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1079  "resuming task %p\n",
1080  gtid, taskdata, resumed_task));
1081  return;
1082  }
1083  }
1084 
1085  // bookkeeping for resuming task:
1086  // GEH - note tasking_ser => task_serial
1087  KMP_DEBUG_ASSERT(
1088  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1089  taskdata->td_flags.task_serial);
1090  if (taskdata->td_flags.task_serial) {
1091  if (resumed_task == NULL) {
1092  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1093  // task is the parent
1094  }
1095  } else {
1096  KMP_DEBUG_ASSERT(resumed_task !=
1097  NULL); // verify that resumed task is passed as argument
1098  }
1099 
1100  /* If the task's destructor thunk flag has been set, we need to invoke the
1101  destructor thunk that has been generated by the compiler. The code is
1102  placed here, since at this point other tasks might have been released
1103  hence overlapping the destructor invocations with some other work in the
1104  released tasks. The OpenMP spec is not specific on when the destructors
1105  are invoked, so we should be free to choose. */
1106  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1107  kmp_routine_entry_t destr_thunk = task->data1.destructors;
1108  KMP_ASSERT(destr_thunk);
1109  destr_thunk(gtid, task);
1110  }
1111 
1112  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1113  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1114  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1115 
1116  bool completed = true;
1117  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1118  if (taskdata->td_allow_completion_event.type ==
1119  KMP_EVENT_ALLOW_COMPLETION) {
1120  // event hasn't been fulfilled yet. Try to detach task.
1121  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1122  if (taskdata->td_allow_completion_event.type ==
1123  KMP_EVENT_ALLOW_COMPLETION) {
1124  // task finished execution
1125  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1126  taskdata->td_flags.executing = 0; // suspend the finishing task
1127 
1128 #if OMPT_SUPPORT
1129  // For a detached task that is not yet completed, report the detach here;
1130  // a later omp_fulfill_event signals completion.
1131  // Locking is necessary to avoid a race with ompt_task_late_fulfill.
1132  if (ompt)
1133  __ompt_task_finish(task, resumed_task, ompt_task_detach);
1134 #endif
1135 
1136  // no access to taskdata after this point!
1137  // __kmp_fulfill_event might free taskdata at any time from now
1138 
1139  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1140  completed = false;
1141  }
1142  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1143  }
1144  }
1145 
1146  // Tasks with valid target async handles must be re-enqueued.
1147  if (taskdata->td_target_data.async_handle != NULL) {
1148  // Note: no need to translate gtid to its shadow. If the current thread is a
1149  // hidden helper one, then the gtid is already correct. Otherwise, hidden
1150  // helper threads are disabled, and gtid refers to an OpenMP thread.
1151  __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
1152  if (KMP_HIDDEN_HELPER_THREAD(gtid))
1153  __kmp_hidden_helper_worker_thread_signal();
1154  completed = false;
1155  }
1156 
1157  if (completed) {
1158  taskdata->td_flags.complete = 1; // mark the task as completed
1159 #if OMPX_TASKGRAPH
1160  taskdata->td_flags.onced = 1; // mark the task as ran once already
1161 #endif
1162 
1163 #if OMPT_SUPPORT
1164  // This is not a detached task, we are done here
1165  if (ompt)
1166  __ompt_task_finish(task, resumed_task, ompt_task_complete);
1167 #endif
1168  // TODO: What would be the balance between the conditions in the function
1169  // and an atomic operation?
1170  if (__kmp_track_children_task(taskdata)) {
1171  __kmp_release_deps(gtid, taskdata);
1172  // Predecrement simulated by "- 1" calculation
1173 #if KMP_DEBUG
1174  children = -1 +
1175 #endif
1176  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1177  KMP_DEBUG_ASSERT(children >= 0);
1178 #if OMPX_TASKGRAPH
1179  if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
1180 #else
1181  if (taskdata->td_taskgroup)
1182 #endif
1183  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1184  } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1185  task_team->tt.tt_hidden_helper_task_encountered)) {
1186  // if we found proxy or hidden helper tasks there could exist a dependency
1187  // chain with the proxy task as origin
1188  __kmp_release_deps(gtid, taskdata);
1189  }
1190  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1191  // called. Otherwise, if a task is executed immediately from the
1192  // release_deps code, the flag will be reset to 1 again by this same
1193  // function
1194  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1195  taskdata->td_flags.executing = 0; // suspend the finishing task
1196 
1197  // Decrement the counter of hidden helper tasks to be executed.
1198  if (taskdata->td_flags.hidden_helper) {
1199  // Hidden helper tasks can only be executed by hidden helper threads.
1200  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1201  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1202  }
1203  }
1204 
1205  KA_TRACE(
1206  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1207  gtid, taskdata, children));
1208 
1209  // Free this task and then ancestor tasks if they have no children.
1210  // Restore th_current_task first as suggested by John:
1211  // johnmc: if an asynchronous inquiry peers into the runtime system
1212  // it doesn't see the freed task as the current task.
1213  thread->th.th_current_task = resumed_task;
1214  if (completed)
1215  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1216 
1217  // TODO: GEH - make sure root team implicit task is initialized properly.
1218  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1219  resumed_task->td_flags.executing = 1; // resume previous task
1220 
1221 #if OMPX_TASKGRAPH
1222  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
1223  taskdata->td_taskgroup) {
1224  // TDG: we only release taskgroup barrier here because
1225  // free_task_and_ancestors will call
1226  // __kmp_free_task, which resets all task parameters such as
1227  // taskdata->started, etc. If we release the barrier earlier, these
1228  // parameters could be read before being reset. This is not an issue for
1229  // non-TDG implementation because we never reuse a task(data) structure
1230  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1231  }
1232 #endif
1233 
1234  KA_TRACE(
1235  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1236  gtid, taskdata, resumed_task));
1237 
1238  return;
1239 }
1240 
1241 template <bool ompt>
1242 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1243  kmp_int32 gtid,
1244  kmp_task_t *task) {
1245  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1246  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1247  KMP_DEBUG_ASSERT(gtid >= 0);
1248  // this routine will provide task to resume
1249  __kmp_task_finish<ompt>(gtid, task, NULL);
1250 
1251  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1252  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1253 
1254 #if OMPT_SUPPORT
1255  if (ompt) {
1256  ompt_frame_t *ompt_frame;
1257  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1258  ompt_frame->enter_frame = ompt_data_none;
1259  ompt_frame->enter_frame_flags =
1260  ompt_frame_runtime | ompt_frame_framepointer;
1261  }
1262 #endif
1263 
1264  return;
1265 }
1266 
1267 #if OMPT_SUPPORT
1268 OMPT_NOINLINE
1269 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1270  kmp_task_t *task) {
1271  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1272 }
1273 #endif // OMPT_SUPPORT
1274 
1275 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1276 //
1277 // loc_ref: source location information; points to end of task block.
1278 // gtid: global thread number.
1279 // task: task thunk for the completed task.
1280 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1281  kmp_task_t *task) {
1282 #if OMPT_SUPPORT
1283  if (UNLIKELY(ompt_enabled.enabled)) {
1284  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1285  return;
1286  }
1287 #endif
1288  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1289 }
1290 
1291 #ifdef TASK_UNUSED
1292 // __kmpc_omp_task_complete: report that a task has completed execution
1293 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1294 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1295  kmp_task_t *task) {
1296  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1297  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1298 
1299  __kmp_task_finish<false>(gtid, task,
1300  NULL); // Not sure how to find task to resume
1301 
1302  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1303  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1304  return;
1305 }
1306 #endif // TASK_UNUSED
1307 
1308 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1309 // task for a given thread
1310 //
1311 // loc_ref: reference to source location of parallel region
1312 // this_thr: thread data structure corresponding to implicit task
1313 // team: team for this_thr
1314 // tid: thread id of given thread within team
1315 // set_curr_task: TRUE if need to push current task to thread
1316 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1317 // have already been done elsewhere.
1318 // TODO: Get better loc_ref. Value passed in may be NULL
1319 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1320  kmp_team_t *team, int tid, int set_curr_task) {
1321  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1322 
1323  KF_TRACE(
1324  10,
1325  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1326  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1327 
1328  task->td_task_id = KMP_GEN_TASK_ID();
1329  task->td_team = team;
1330  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1331  // in debugger)
1332  task->td_ident = loc_ref;
1333  task->td_taskwait_ident = NULL;
1334  task->td_taskwait_counter = 0;
1335  task->td_taskwait_thread = 0;
1336 
1337  task->td_flags.tiedness = TASK_TIED;
1338  task->td_flags.tasktype = TASK_IMPLICIT;
1339  task->td_flags.proxy = TASK_FULL;
1340 
1341  // All implicit tasks are executed immediately, not deferred
1342  task->td_flags.task_serial = 1;
1343  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1344  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1345 
1346  task->td_flags.started = 1;
1347  task->td_flags.executing = 1;
1348  task->td_flags.complete = 0;
1349  task->td_flags.freed = 0;
1350 #if OMPX_TASKGRAPH
1351  task->td_flags.onced = 0;
1352 #endif
1353 
1354  task->td_depnode = NULL;
1355  task->td_last_tied = task;
1356  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1357 
1358  if (set_curr_task) { // only do this init first time thread is created
1359  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1360  // Not used: don't need to deallocate implicit task
1361  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1362  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1363  task->td_dephash = NULL;
1364  __kmp_push_current_task_to_thread(this_thr, team, tid);
1365  } else {
1366  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1367  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1368  }
1369 
1370 #if OMPT_SUPPORT
1371  if (UNLIKELY(ompt_enabled.enabled))
1372  __ompt_task_init(task, tid);
1373 #endif
1374 
1375  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1376  team, task));
1377 }
1378 
1379 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1380 // at the end of parallel regions. Some resources are kept for reuse in the next
1381 // parallel region.
1382 //
1383 // thread: thread data structure corresponding to implicit task
1384 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1385  kmp_taskdata_t *task = thread->th.th_current_task;
1386  if (task->td_dephash) {
1387  int children;
1388  task->td_flags.complete = 1;
1389 #if OMPX_TASKGRAPH
1390  task->td_flags.onced = 1;
1391 #endif
1392  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1393  kmp_tasking_flags_t flags_old = task->td_flags;
1394  if (children == 0 && flags_old.complete == 1) {
1395  kmp_tasking_flags_t flags_new = flags_old;
1396  flags_new.complete = 0;
1397  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1398  *RCAST(kmp_int32 *, &flags_old),
1399  *RCAST(kmp_int32 *, &flags_new))) {
1400  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1401  "dephash of implicit task %p\n",
1402  thread->th.th_info.ds.ds_gtid, task));
1403  __kmp_dephash_free_entries(thread, task->td_dephash);
1404  }
1405  }
1406  }
1407 }
1408 
1409 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1410 // when those tasks are destroyed
1411 //
1412 // thread: thread data structure corresponding to implicit task
1413 void __kmp_free_implicit_task(kmp_info_t *thread) {
1414  kmp_taskdata_t *task = thread->th.th_current_task;
1415  if (task && task->td_dephash) {
1416  __kmp_dephash_free(thread, task->td_dephash);
1417  task->td_dephash = NULL;
1418  }
1419 }
1420 
1421 // Round up a size to a multiple of val (which must be a power of two): used to
1422 // insert padding between structures co-allocated using a single malloc() call
1423 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1424  if (size & (val - 1)) {
1425  size &= ~(val - 1);
1426  if (size <= KMP_SIZE_T_MAX - val) {
1427  size += val; // Round up if there is no overflow.
1428  }
1429  }
1430  return size;
1431 } // __kmp_round_up_to_val
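
// Editor's note (illustrative, not part of the runtime source): a worked
// example of the rounding above with val = sizeof(void *) = 8 on a 64-bit
// target:
//   size = 41 -> 41 & 7 != 0, so 41 & ~7 = 40, then 40 + 8 = 48
//   size = 48 -> 48 & 7 == 0, so 48 is returned unchanged
// The KMP_SIZE_T_MAX check only skips the "+ val" step when adding val would
// overflow size_t.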
1432 
1433 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1434 //
1435 // loc_ref: source location information
1436 // gtid: global thread number.
1437 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1438 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1439 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1440 // private vars accessed in task.
1441 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1442 // in task.
1443 // task_entry: Pointer to task code entry point generated by compiler.
1444 // returns: a pointer to the allocated kmp_task_t structure (task).
1445 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1446  kmp_tasking_flags_t *flags,
1447  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1448  kmp_routine_entry_t task_entry) {
1449  kmp_task_t *task;
1450  kmp_taskdata_t *taskdata;
1451  kmp_info_t *thread = __kmp_threads[gtid];
1452  kmp_team_t *team = thread->th.th_team;
1453  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1454  size_t shareds_offset;
1455 
1456  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1457  __kmp_middle_initialize();
1458 
1459  if (flags->hidden_helper) {
1460  if (__kmp_enable_hidden_helper) {
1461  if (!TCR_4(__kmp_init_hidden_helper))
1462  __kmp_hidden_helper_initialize();
1463  } else {
1464  // If the hidden helper task is not enabled, reset the flag to FALSE.
1465  flags->hidden_helper = FALSE;
1466  }
1467  }
1468 
1469  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1470  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1471  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1472  sizeof_shareds, task_entry));
1473 
1474  KMP_DEBUG_ASSERT(parent_task);
1475  if (parent_task->td_flags.final) {
1476  if (flags->merged_if0) {
1477  }
1478  flags->final = 1;
1479  }
1480 
1481  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1482  // Untied task encountered causes the TSC algorithm to check entire deque of
1483  // the victim thread. If no untied task encountered, then checking the head
1484  // of the deque should be enough.
1485  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1486  }
1487 
1488  // Detachable tasks are not proxy tasks yet but could become so in the future.
1489  // Doing the tasking setup only at that point would be too late, so it is done
1490  // here as well.
1491  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1492  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1493  if (flags->proxy == TASK_PROXY) {
1494  flags->tiedness = TASK_UNTIED;
1495  flags->merged_if0 = 1;
1496  }
1497  /* Are we running in a serialized parallel region or in tskm_immediate_exec
1498  mode? Either way, tasking support must be enabled. */
1499  if ((thread->th.th_task_team) == NULL) {
1500  /* This should only happen if the team is serialized
1501  setup a task team and propagate it to the thread */
1502  KMP_DEBUG_ASSERT(team->t.t_serialized);
1503  KA_TRACE(30,
1504  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1505  gtid));
1506  // 1 indicates setup the current team regardless of nthreads
1507  __kmp_task_team_setup(thread, team, 1);
1508  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1509  }
1510  kmp_task_team_t *task_team = thread->th.th_task_team;
1511 
1512  /* tasking must be enabled now as the task might not be pushed */
1513  if (!KMP_TASKING_ENABLED(task_team)) {
1514  KA_TRACE(
1515  30,
1516  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1517  __kmp_enable_tasking(task_team, thread);
1518  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1519  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1520  // No lock needed since only owner can allocate
1521  if (thread_data->td.td_deque == NULL) {
1522  __kmp_alloc_task_deque(thread, thread_data);
1523  }
1524  }
1525 
1526  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1527  task_team->tt.tt_found_proxy_tasks == FALSE)
1528  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1529  if (flags->hidden_helper &&
1530  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1531  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1532  }
1533 
1534  // Calculate shared structure offset including padding after kmp_task_t struct
1535  // to align pointers in shared struct
1536  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1537  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1538 
1539  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1540  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1541  shareds_offset));
1542  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1543  sizeof_shareds));
1544 
1545  // Avoid double allocation here by combining shareds with taskdata
1546 #if USE_FAST_MEMORY
1547  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1548  sizeof_shareds);
1549 #else /* ! USE_FAST_MEMORY */
1550  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1551  sizeof_shareds);
1552 #endif /* USE_FAST_MEMORY */
1553 
1554  task = KMP_TASKDATA_TO_TASK(taskdata);
1555 
1556 // Make sure task & taskdata are aligned appropriately
1557 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1558  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1559  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1560 #else
1561  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1562  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1563 #endif
1564  if (sizeof_shareds > 0) {
1565  // Avoid double allocation here by combining shareds with taskdata
1566  task->shareds = &((char *)taskdata)[shareds_offset];
1567  // Make sure shareds struct is aligned to pointer size
1568  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1569  0);
1570  } else {
1571  task->shareds = NULL;
1572  }
1573  task->routine = task_entry;
1574  task->part_id = 0; // AC: Always start with 0 part id
1575 
1576  taskdata->td_task_id = KMP_GEN_TASK_ID();
1577  taskdata->td_team = thread->th.th_team;
1578  taskdata->td_alloc_thread = thread;
1579  taskdata->td_parent = parent_task;
1580  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1581  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1582  taskdata->td_ident = loc_ref;
1583  taskdata->td_taskwait_ident = NULL;
1584  taskdata->td_taskwait_counter = 0;
1585  taskdata->td_taskwait_thread = 0;
1586  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1587  // avoid copying icvs for proxy tasks
1588  if (flags->proxy == TASK_FULL)
1589  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1590 
1591  taskdata->td_flags = *flags;
1592  taskdata->td_task_team = thread->th.th_task_team;
1593  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1594  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1595  // If it is hidden helper task, we need to set the team and task team
1596  // correspondingly.
1597  if (flags->hidden_helper) {
1598  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1599  taskdata->td_team = shadow_thread->th.th_team;
1600  taskdata->td_task_team = shadow_thread->th.th_task_team;
1601  }
1602 
1603  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1604  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1605 
1606  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1607  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1608 
1609  // GEH - Note we serialize the task if the team is serialized to make sure
1610  // implicit parallel region tasks are not left until program termination to
1611  // execute. Also, it helps locality to execute immediately.
1612 
1613  taskdata->td_flags.task_serial =
1614  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1615  taskdata->td_flags.tasking_ser || flags->merged_if0);
1616 
1617  taskdata->td_flags.started = 0;
1618  taskdata->td_flags.executing = 0;
1619  taskdata->td_flags.complete = 0;
1620  taskdata->td_flags.freed = 0;
1621 #if OMPX_TASKGRAPH
1622  taskdata->td_flags.onced = 0;
1623 #endif
1624  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1625  // start at one because counts current task and children
1626  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1627  taskdata->td_taskgroup =
1628  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1629  taskdata->td_dephash = NULL;
1630  taskdata->td_depnode = NULL;
1631  taskdata->td_target_data.async_handle = NULL;
1632  if (flags->tiedness == TASK_UNTIED)
1633  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1634  else
1635  taskdata->td_last_tied = taskdata;
1636  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1637 #if OMPT_SUPPORT
1638  if (UNLIKELY(ompt_enabled.enabled))
1639  __ompt_task_init(taskdata, gtid);
1640 #endif
1641  // TODO: What would be the balance between the conditions in the function and
1642  // an atomic operation?
1643  if (__kmp_track_children_task(taskdata)) {
1644  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1645  if (parent_task->td_taskgroup)
1646  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1647  // Only need to keep track of allocated child tasks for explicit tasks,
1648  // since implicit tasks are not deallocated
1649  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1650  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1651  }
1652  if (flags->hidden_helper) {
1653  taskdata->td_flags.task_serial = FALSE;
1654  // Increment the number of hidden helper tasks to be executed
1655  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1656  }
1657  }
1658 
1659 #if OMPX_TASKGRAPH
1660  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1661  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1662  (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1663  taskdata->is_taskgraph = 1;
1664  taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1665  taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1666  }
1667 #endif
1668  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1669  gtid, taskdata, taskdata->td_parent));
1670 
1671  return task;
1672 }
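
// Editor's note (illustrative): the single allocation above produces the
// following layout; the exact amount of padding depends on
// __kmp_round_up_to_val(sizeof(kmp_taskdata_t) + sizeof_kmp_task_t,
// sizeof(void *)).
//
//   taskdata -----> +--------------------------------------+
//                   | kmp_taskdata_t                       |
//   task ---------> +--------------------------------------+
//                   | kmp_task_t + task-private variables  |  sizeof_kmp_task_t
//                   +--------------------------------------+
//                   | padding to a sizeof(void *) boundary |
//   task->shareds > +--------------------------------------+
//                   | pointers to shared variables         |  sizeof_shareds
//                   +--------------------------------------+  (td_size_alloc)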
1673 
1674 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1675  kmp_int32 flags, size_t sizeof_kmp_task_t,
1676  size_t sizeof_shareds,
1677  kmp_routine_entry_t task_entry) {
1678  kmp_task_t *retval;
1679  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1680  __kmp_assert_valid_gtid(gtid);
1681  input_flags->native = FALSE;
1682  // __kmp_task_alloc() sets up all other runtime flags
1683  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1684  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1685  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1686  input_flags->proxy ? "proxy" : "",
1687  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1688  sizeof_shareds, task_entry));
1689 
1690  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1691  sizeof_shareds, task_entry);
1692 
1693  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1694 
1695  return retval;
1696 }
1697 
1698 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1699  kmp_int32 flags,
1700  size_t sizeof_kmp_task_t,
1701  size_t sizeof_shareds,
1702  kmp_routine_entry_t task_entry,
1703  kmp_int64 device_id) {
1704  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1705  // A target task is untied, as defined by the specification.
1706  input_flags.tiedness = TASK_UNTIED;
1707 
1708  if (__kmp_enable_hidden_helper)
1709  input_flags.hidden_helper = TRUE;
1710 
1711  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1712  sizeof_shareds, task_entry);
1713 }
1714 
1728 kmp_int32
1729 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1730  kmp_task_t *new_task, kmp_int32 naffins,
1731  kmp_task_affinity_info_t *affin_list) {
1732  return 0;
1733 }
1734 
1735 // __kmp_invoke_task: invoke the specified task
1736 //
1737 // gtid: global thread ID of caller
1738 // task: the task to invoke
1739 // current_task: the task to resume after task invocation
1740 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1741  kmp_taskdata_t *current_task) {
1742  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1743  kmp_info_t *thread;
1744  int discard = 0 /* false */;
1745  KA_TRACE(
1746  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1747  gtid, taskdata, current_task));
1748  KMP_DEBUG_ASSERT(task);
1749  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1750  taskdata->td_flags.complete == 1)) {
1751  // This is a proxy task that was already completed but it needs to run
1752  // its bottom-half finish
1753  KA_TRACE(
1754  30,
1755  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1756  gtid, taskdata));
1757 
1758  __kmp_bottom_half_finish_proxy(gtid, task);
1759 
1760  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1761  "proxy task %p, resuming task %p\n",
1762  gtid, taskdata, current_task));
1763 
1764  return;
1765  }
1766 
1767 #if OMPT_SUPPORT
1768  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1769  // does not execute code.
1770  ompt_thread_info_t oldInfo;
1771  if (UNLIKELY(ompt_enabled.enabled)) {
1772  // Store the thread's state and restore it after the task
1773  thread = __kmp_threads[gtid];
1774  oldInfo = thread->th.ompt_thread_info;
1775  thread->th.ompt_thread_info.wait_id = 0;
1776  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1777  ? ompt_state_work_serial
1778  : ompt_state_work_parallel;
1779  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1780  }
1781 #endif
1782 
1783  // Proxy tasks are not handled by the runtime
1784  if (taskdata->td_flags.proxy != TASK_PROXY) {
1785  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1786  }
1787 
1788  // TODO: cancel tasks if the parallel region has also been cancelled
1789  // TODO: check if this sequence can be hoisted above __kmp_task_start
1790  // if cancellation has been enabled for this run ...
1791  if (UNLIKELY(__kmp_omp_cancellation)) {
1792  thread = __kmp_threads[gtid];
1793  kmp_team_t *this_team = thread->th.th_team;
1794  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1795  if ((taskgroup && taskgroup->cancel_request) ||
1796  (this_team->t.t_cancel_request == cancel_parallel)) {
1797 #if OMPT_SUPPORT && OMPT_OPTIONAL
1798  ompt_data_t *task_data;
1799  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1800  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1801  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1802  task_data,
1803  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1804  : ompt_cancel_parallel) |
1805  ompt_cancel_discarded_task,
1806  NULL);
1807  }
1808 #endif
1809  KMP_COUNT_BLOCK(TASK_cancelled);
1810  // this task belongs to a task group and we need to cancel it
1811  discard = 1 /* true */;
1812  }
1813  }
1814 
1815  // Invoke the task routine and pass in relevant data.
1816  // Thunks generated by gcc take a different argument list.
1817  if (!discard) {
1818  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1819  taskdata->td_last_tied = current_task->td_last_tied;
1820  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1821  }
1822 #if KMP_STATS_ENABLED
1823  KMP_COUNT_BLOCK(TASK_executed);
1824  switch (KMP_GET_THREAD_STATE()) {
1825  case FORK_JOIN_BARRIER:
1826  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1827  break;
1828  case PLAIN_BARRIER:
1829  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1830  break;
1831  case TASKYIELD:
1832  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1833  break;
1834  case TASKWAIT:
1835  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1836  break;
1837  case TASKGROUP:
1838  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1839  break;
1840  default:
1841  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1842  break;
1843  }
1844 #endif // KMP_STATS_ENABLED
1845 
1846 // OMPT task begin
1847 #if OMPT_SUPPORT
1848  if (UNLIKELY(ompt_enabled.enabled))
1849  __ompt_task_start(task, current_task, gtid);
1850 #endif
1851 #if OMPT_SUPPORT && OMPT_OPTIONAL
1852  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1853  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1854  ompt_data_t instance = ompt_data_none;
1855  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1856  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1857  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1858  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1859  ompt_dispatch_taskloop_chunk, instance);
1860  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1861  }
1862 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1863 
1864 #if OMPD_SUPPORT
1865  if (ompd_state & OMPD_ENABLE_BP)
1866  ompd_bp_task_begin();
1867 #endif
1868 
1869 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1870  kmp_uint64 cur_time;
1871  kmp_int32 kmp_itt_count_task =
1872  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1873  current_task->td_flags.tasktype == TASK_IMPLICIT;
1874  if (kmp_itt_count_task) {
1875  thread = __kmp_threads[gtid];
1876  // Time outer level explicit task on barrier for adjusting imbalance time
1877  if (thread->th.th_bar_arrive_time)
1878  cur_time = __itt_get_timestamp();
1879  else
1880  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1881  }
1882  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1883 #endif
1884 
1885 #if ENABLE_LIBOMPTARGET
1886  if (taskdata->td_target_data.async_handle != NULL) {
1887  // If we have a valid target async handle, that means that we have already
1888  // executed the task routine once. We must query for the handle completion
1889  // instead of re-executing the routine.
1890  KMP_ASSERT(tgt_target_nowait_query);
1891  tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1892  } else
1893 #endif
1894  if (task->routine != NULL) {
1895 #ifdef KMP_GOMP_COMPAT
1896  if (taskdata->td_flags.native) {
1897  ((void (*)(void *))(*(task->routine)))(task->shareds);
1898  } else
1899 #endif /* KMP_GOMP_COMPAT */
1900  {
1901  (*(task->routine))(gtid, task);
1902  }
1903  }
1904  KMP_POP_PARTITIONED_TIMER();
1905 
1906 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1907  if (kmp_itt_count_task) {
1908  // Barrier imbalance - adjust arrive time with the task duration
1909  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1910  }
1911  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1912  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1913 #endif
1914  }
1915 
1916 #if OMPD_SUPPORT
1917  if (ompd_state & OMPD_ENABLE_BP)
1918  ompd_bp_task_end();
1919 #endif
1920 
1921  // Proxy tasks are not handled by the runtime
1922  if (taskdata->td_flags.proxy != TASK_PROXY) {
1923 #if OMPT_SUPPORT
1924  if (UNLIKELY(ompt_enabled.enabled)) {
1925  thread->th.ompt_thread_info = oldInfo;
1926  if (taskdata->td_flags.tiedness == TASK_TIED) {
1927  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1928  }
1929  __kmp_task_finish<true>(gtid, task, current_task);
1930  } else
1931 #endif
1932  __kmp_task_finish<false>(gtid, task, current_task);
1933  }
1934 
1935  KA_TRACE(
1936  30,
1937  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1938  gtid, taskdata, current_task));
1939  return;
1940 }
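
// Editor's sketch (illustrative only, not part of the runtime): the shape of a
// compiler-generated task entry routine invoked above via
// (*(task->routine))(gtid, task). The names my_task_entry and
// my_task_privates_t are hypothetical; real compilers emit equivalent code.
#if 0
struct my_task_privates_t { int lb, ub; }; // privates follow kmp_task_t

static kmp_int32 my_task_entry(kmp_int32 gtid, kmp_task_t *task) {
  void **shareds = (void **)task->shareds; // pointers captured at task creation
  my_task_privates_t *priv =
      (my_task_privates_t *)((char *)task + sizeof(kmp_task_t));
  // ... outlined task body, reading *shareds and priv ...
  (void)gtid;
  (void)shareds;
  (void)priv;
  return 0;
}
#endif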
1941 
1942 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1943 //
1944 // loc_ref: location of original task pragma (ignored)
1945 // gtid: Global Thread ID of encountering thread
1946 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1947 // Returns:
1948 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1949 // be resumed later.
1950 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1951 // resumed later.
1952 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1953  kmp_task_t *new_task) {
1954  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1955 
1956  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1957  loc_ref, new_taskdata));
1958 
1959 #if OMPT_SUPPORT
1960  kmp_taskdata_t *parent;
1961  if (UNLIKELY(ompt_enabled.enabled)) {
1962  parent = new_taskdata->td_parent;
1963  if (ompt_enabled.ompt_callback_task_create) {
1964  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1965  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1966  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1967  OMPT_GET_RETURN_ADDRESS(0));
1968  }
1969  }
1970 #endif
1971 
1972  /* Should we execute the new task or queue it? For now, let's just always try
1973  to queue it. If the queue fills up, then we'll execute it. */
1974 
1975  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1976  { // Execute this task immediately
1977  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1978  new_taskdata->td_flags.task_serial = 1;
1979  __kmp_invoke_task(gtid, new_task, current_task);
1980  }
1981 
1982  KA_TRACE(
1983  10,
1984  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1985  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1986  gtid, loc_ref, new_taskdata));
1987 
1988 #if OMPT_SUPPORT
1989  if (UNLIKELY(ompt_enabled.enabled)) {
1990  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1991  }
1992 #endif
1993  return TASK_CURRENT_NOT_QUEUED;
1994 }
1995 
1996 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1997 //
1998 // gtid: Global Thread ID of encountering thread
1999 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
2000 // serialize_immediate: if TRUE then if the task is executed immediately its
2001 // execution will be serialized
2002 // Returns:
2003 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2004 // be resumed later.
2005 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2006 // resumed later.
2007 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
2008  bool serialize_immediate) {
2009  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2010 
2011 #if OMPX_TASKGRAPH
2012  if (new_taskdata->is_taskgraph &&
2013  __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
2014  kmp_tdg_info_t *tdg = new_taskdata->tdg;
2015  // extend the record_map if needed
2016  if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) {
2017  __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
2018  // map_size could have been updated by another thread if recursive
2019  // taskloop
2020  if (new_taskdata->td_task_id >= tdg->map_size) {
2021  kmp_uint old_size = tdg->map_size;
2022  kmp_uint new_size = old_size * 2;
2023  kmp_node_info_t *old_record = tdg->record_map;
2024  kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
2025  new_size * sizeof(kmp_node_info_t));
2026 
2027  KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
2028  tdg->record_map = new_record;
2029 
2030  __kmp_free(old_record);
2031 
2032  for (kmp_int i = old_size; i < new_size; i++) {
2033  kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
2034  __kmp_successors_size * sizeof(kmp_int32));
2035  new_record[i].task = nullptr;
2036  new_record[i].successors = successorsList;
2037  new_record[i].nsuccessors = 0;
2038  new_record[i].npredecessors = 0;
2039  new_record[i].successors_size = __kmp_successors_size;
2040  KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
2041  }
2042  // Update map_size last so that other threads cannot see the new size
2043  // while record_map still points at old_record
2044  tdg->map_size = new_size;
2045  }
2046  __kmp_release_bootstrap_lock(&tdg->graph_lock);
2047  }
2048  // record a task
2049  if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) {
2050  tdg->record_map[new_taskdata->td_task_id].task = new_task;
2051  tdg->record_map[new_taskdata->td_task_id].parent_task =
2052  new_taskdata->td_parent;
2053  KMP_ATOMIC_INC(&tdg->num_tasks);
2054  }
2055  }
2056 #endif
2057 
2058  /* Should we execute the new task or queue it? For now, let's just always try
2059  to queue it. If the queue fills up, then we'll execute it. */
2060  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
2061  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
2062  { // Execute this task immediately
2063  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2064  if (serialize_immediate)
2065  new_taskdata->td_flags.task_serial = 1;
2066  __kmp_invoke_task(gtid, new_task, current_task);
2067  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2068  __kmp_wpolicy_passive) {
2069  kmp_info_t *this_thr = __kmp_threads[gtid];
2070  kmp_team_t *team = this_thr->th.th_team;
2071  kmp_int32 nthreads = this_thr->th.th_team_nproc;
2072  for (int i = 0; i < nthreads; ++i) {
2073  kmp_info_t *thread = team->t.t_threads[i];
2074  if (thread == this_thr)
2075  continue;
2076  if (thread->th.th_sleep_loc != NULL) {
2077  __kmp_null_resume_wrapper(thread);
2078  break; // awake one thread at a time
2079  }
2080  }
2081  }
2082  return TASK_CURRENT_NOT_QUEUED;
2083 }
2084 
2085 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
2086 // non-thread-switchable task from the parent thread only!
2087 //
2088 // loc_ref: location of original task pragma (ignored)
2089 // gtid: Global Thread ID of encountering thread
2090 // new_task: non-thread-switchable task thunk allocated by
2091 // __kmp_omp_task_alloc()
2092 // Returns:
2093 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2094 // be resumed later.
2095 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2096 // resumed later.
2097 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
2098  kmp_task_t *new_task) {
2099  kmp_int32 res;
2100  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2101 
2102 #if KMP_DEBUG || OMPT_SUPPORT
2103  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2104 #endif
2105  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2106  new_taskdata));
2107  __kmp_assert_valid_gtid(gtid);
2108 
2109 #if OMPT_SUPPORT
2110  kmp_taskdata_t *parent = NULL;
2111  if (UNLIKELY(ompt_enabled.enabled)) {
2112  if (!new_taskdata->td_flags.started) {
2113  OMPT_STORE_RETURN_ADDRESS(gtid);
2114  parent = new_taskdata->td_parent;
2115  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
2116  parent->ompt_task_info.frame.enter_frame.ptr =
2117  OMPT_GET_FRAME_ADDRESS(0);
2118  }
2119  if (ompt_enabled.ompt_callback_task_create) {
2120  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2121  &(parent->ompt_task_info.task_data),
2122  &(parent->ompt_task_info.frame),
2123  &(new_taskdata->ompt_task_info.task_data),
2124  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2125  OMPT_LOAD_RETURN_ADDRESS(gtid));
2126  }
2127  } else {
2128  // We are scheduling the continuation of an UNTIED task.
2129  // Scheduling back to the parent task.
2130  __ompt_task_finish(new_task,
2131  new_taskdata->ompt_task_info.scheduling_parent,
2132  ompt_task_switch);
2133  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
2134  }
2135  }
2136 #endif
2137 
2138  res = __kmp_omp_task(gtid, new_task, true);
2139 
2140  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2141  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2142  gtid, loc_ref, new_taskdata));
2143 #if OMPT_SUPPORT
2144  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2145  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2146  }
2147 #endif
2148  return res;
2149 }
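
// Editor's sketch (illustrative only): how a compiler might lower
//   #pragma omp task shared(x)
//   x++;
// onto __kmpc_omp_task_alloc() and __kmpc_omp_task(). The helper names are
// hypothetical, and the flags value assumes the tiedness bit occupies bit 0.
#if 0
static kmp_int32 my_tied_task_entry(kmp_int32 gtid, kmp_task_t *task) {
  int *x = *(int **)task->shareds; // the single captured shared variable
  (*x)++;
  return 0;
}

static void spawn_increment_task(ident_t *loc, kmp_int32 gtid, int *x) {
  kmp_task_t *t = __kmpc_omp_task_alloc(
      loc, gtid, /*flags=*/1 /* tied */, sizeof(kmp_task_t), sizeof(void *),
      (kmp_routine_entry_t)my_tied_task_entry);
  *(int **)t->shareds = x;       // fill the shareds block
  __kmpc_omp_task(loc, gtid, t); // queue, or execute immediately if not pushable
}
#endif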
2150 
2151 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
2152 // a taskloop task with the correct OMPT return address
2153 //
2154 // loc_ref: location of original task pragma (ignored)
2155 // gtid: Global Thread ID of encountering thread
2156 // new_task: non-thread-switchable task thunk allocated by
2157 // __kmp_omp_task_alloc()
2158 // codeptr_ra: return address for OMPT callback
2159 // Returns:
2160 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2161 // be resumed later.
2162 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2163 // resumed later.
2164 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2165  kmp_task_t *new_task, void *codeptr_ra) {
2166  kmp_int32 res;
2167  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2168 
2169 #if KMP_DEBUG || OMPT_SUPPORT
2170  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2171 #endif
2172  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n",
2173  gtid, loc_ref, new_taskdata));
2174 
2175 #if OMPT_SUPPORT
2176  kmp_taskdata_t *parent = NULL;
2177  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2178  parent = new_taskdata->td_parent;
2179  if (!parent->ompt_task_info.frame.enter_frame.ptr)
2180  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2181  if (ompt_enabled.ompt_callback_task_create) {
2182  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2183  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2184  &(new_taskdata->ompt_task_info.task_data),
2185  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2186  codeptr_ra);
2187  }
2188  }
2189 #endif
2190 
2191  res = __kmp_omp_task(gtid, new_task, true);
2192 
2193  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
2194  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2195  gtid, loc_ref, new_taskdata));
2196 #if OMPT_SUPPORT
2197  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2198  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2199  }
2200 #endif
2201  return res;
2202 }
2203 
2204 template <bool ompt>
2205 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2206  void *frame_address,
2207  void *return_address) {
2208  kmp_taskdata_t *taskdata = nullptr;
2209  kmp_info_t *thread;
2210  int thread_finished = FALSE;
2211  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2212 
2213  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2214  KMP_DEBUG_ASSERT(gtid >= 0);
2215 
2216  if (__kmp_tasking_mode != tskm_immediate_exec) {
2217  thread = __kmp_threads[gtid];
2218  taskdata = thread->th.th_current_task;
2219 
2220 #if OMPT_SUPPORT && OMPT_OPTIONAL
2221  ompt_data_t *my_task_data;
2222  ompt_data_t *my_parallel_data;
2223 
2224  if (ompt) {
2225  my_task_data = &(taskdata->ompt_task_info.task_data);
2226  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2227 
2228  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2229 
2230  if (ompt_enabled.ompt_callback_sync_region) {
2231  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2232  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2233  my_task_data, return_address);
2234  }
2235 
2236  if (ompt_enabled.ompt_callback_sync_region_wait) {
2237  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2238  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2239  my_task_data, return_address);
2240  }
2241  }
2242 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2243 
2244 // Debugger: The taskwait is active. Store the location and the thread that
2245 // encountered the taskwait.
2246 #if USE_ITT_BUILD
2247 // Note: These values are used by ITT events as well.
2248 #endif /* USE_ITT_BUILD */
2249  taskdata->td_taskwait_counter += 1;
2250  taskdata->td_taskwait_ident = loc_ref;
2251  taskdata->td_taskwait_thread = gtid + 1;
2252 
2253 #if USE_ITT_BUILD
2254  void *itt_sync_obj = NULL;
2255 #if USE_ITT_NOTIFY
2256  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2257 #endif /* USE_ITT_NOTIFY */
2258 #endif /* USE_ITT_BUILD */
2259 
2260  bool must_wait =
2261  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2262 
2263  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2264  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2265  // If hidden helper thread is encountered, we must enable wait here.
2266  must_wait =
2267  must_wait ||
2268  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2269  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2270 
2271  if (must_wait) {
2272  kmp_flag_32<false, false> flag(
2273  RCAST(std::atomic<kmp_uint32> *,
2274  &(taskdata->td_incomplete_child_tasks)),
2275  0U);
2276  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2277  flag.execute_tasks(thread, gtid, FALSE,
2278  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2279  __kmp_task_stealing_constraint);
2280  }
2281  }
2282 #if USE_ITT_BUILD
2283  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2284  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2285 #endif /* USE_ITT_BUILD */
2286 
2287  // Debugger: The taskwait is completed. Location remains, but thread is
2288  // negated.
2289  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2290 
2291 #if OMPT_SUPPORT && OMPT_OPTIONAL
2292  if (ompt) {
2293  if (ompt_enabled.ompt_callback_sync_region_wait) {
2294  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2295  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2296  my_task_data, return_address);
2297  }
2298  if (ompt_enabled.ompt_callback_sync_region) {
2299  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2300  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2301  my_task_data, return_address);
2302  }
2303  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2304  }
2305 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2306  }
2307 
2308  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2309  "returning TASK_CURRENT_NOT_QUEUED\n",
2310  gtid, taskdata));
2311 
2312  return TASK_CURRENT_NOT_QUEUED;
2313 }
2314 
2315 #if OMPT_SUPPORT && OMPT_OPTIONAL
2316 OMPT_NOINLINE
2317 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2318  void *frame_address,
2319  void *return_address) {
2320  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2321  return_address);
2322 }
2323 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2324 
2325 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2326 // complete
2327 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2328 #if OMPT_SUPPORT && OMPT_OPTIONAL
2329  if (UNLIKELY(ompt_enabled.enabled)) {
2330  OMPT_STORE_RETURN_ADDRESS(gtid);
2331  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2332  OMPT_LOAD_RETURN_ADDRESS(gtid));
2333  }
2334 #endif
2335  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2336 }
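
// Editor's note (illustrative): a compiler lowers
//   #pragma omp taskwait
// to a single call such as
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
// The encountering thread then executes queued child tasks until the current
// task's td_incomplete_child_tasks counter reaches zero.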
2337 
2338 // __kmpc_omp_taskyield: switch to a different task
2339 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2340  kmp_taskdata_t *taskdata = NULL;
2341  kmp_info_t *thread;
2342  int thread_finished = FALSE;
2343 
2344  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2345  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2346 
2347  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2348  gtid, loc_ref, end_part));
2349  __kmp_assert_valid_gtid(gtid);
2350 
2351  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2352  thread = __kmp_threads[gtid];
2353  taskdata = thread->th.th_current_task;
2354 // Should we model this as a task wait or not?
2355 // Debugger: The taskwait is active. Store the location and the thread that
2356 // encountered the taskwait.
2357 #if USE_ITT_BUILD
2358 // Note: These values are used by ITT events as well.
2359 #endif /* USE_ITT_BUILD */
2360  taskdata->td_taskwait_counter += 1;
2361  taskdata->td_taskwait_ident = loc_ref;
2362  taskdata->td_taskwait_thread = gtid + 1;
2363 
2364 #if USE_ITT_BUILD
2365  void *itt_sync_obj = NULL;
2366 #if USE_ITT_NOTIFY
2367  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2368 #endif /* USE_ITT_NOTIFY */
2369 #endif /* USE_ITT_BUILD */
2370  if (!taskdata->td_flags.team_serial) {
2371  kmp_task_team_t *task_team = thread->th.th_task_team;
2372  if (task_team != NULL) {
2373  if (KMP_TASKING_ENABLED(task_team)) {
2374 #if OMPT_SUPPORT
2375  if (UNLIKELY(ompt_enabled.enabled))
2376  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2377 #endif
2378  __kmp_execute_tasks_32(
2379  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2380  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2381  __kmp_task_stealing_constraint);
2382 #if OMPT_SUPPORT
2383  if (UNLIKELY(ompt_enabled.enabled))
2384  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2385 #endif
2386  }
2387  }
2388  }
2389 #if USE_ITT_BUILD
2390  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2391 #endif /* USE_ITT_BUILD */
2392 
2393  // Debugger: The taskwait is completed. Location remains, but thread is
2394  // negated.
2395  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2396  }
2397 
2398  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2399  "returning TASK_CURRENT_NOT_QUEUED\n",
2400  gtid, taskdata));
2401 
2402  return TASK_CURRENT_NOT_QUEUED;
2403 }
2404 
2405 // Task Reduction implementation
2406 //
2407 // Note: initial implementation didn't take into account the possibility
2408 // to specify omp_orig for initializer of the UDR (user defined reduction).
2409 // Corrected implementation takes into account the omp_orig object.
2410 // Compiler is free to use old implementation if omp_orig is not specified.
2411 
2420 typedef struct kmp_taskred_flags {
2422  unsigned lazy_priv : 1;
2423  unsigned reserved31 : 31;
2424 } kmp_taskred_flags_t;
2425 
2429 typedef struct kmp_task_red_input {
2430  void *reduce_shar;
2431  size_t reduce_size;
2432  // three compiler-generated routines (init, fini are optional):
2433  void *reduce_init;
2434  void *reduce_fini;
2435  void *reduce_comb;
2436  kmp_taskred_flags_t flags;
2437 } kmp_task_red_input_t;
2438 
2442 typedef struct kmp_taskred_data {
2443  void *reduce_shar;
2444  size_t reduce_size;
2445  kmp_taskred_flags_t flags;
2446  void *reduce_priv;
2447  void *reduce_pend;
2448  // three compiler-generated routines (init, fini are optional):
2449  void *reduce_comb;
2450  void *reduce_init;
2451  void *reduce_fini;
2452  void *reduce_orig;
2453 } kmp_taskred_data_t;
2454 
2460 typedef struct kmp_taskred_input {
2461  void *reduce_shar;
2462  void *reduce_orig;
2463  size_t reduce_size;
2464  // three compiler-generated routines (init, fini are optional):
2465  void *reduce_init;
2466  void *reduce_fini;
2467  void *reduce_comb;
2468  kmp_taskred_flags_t flags;
2469 } kmp_taskred_input_t;
2474 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2475 template <>
2476 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2477  kmp_task_red_input_t &src) {
2478  item.reduce_orig = NULL;
2479 }
2480 template <>
2481 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2482  kmp_taskred_input_t &src) {
2483  if (src.reduce_orig != NULL) {
2484  item.reduce_orig = src.reduce_orig;
2485  } else {
2486  item.reduce_orig = src.reduce_shar;
2487  } // non-NULL reduce_orig means new interface used
2488 }
2489 
2490 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2491 template <>
2492 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2493  size_t offset) {
2494  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2495 }
2496 template <>
2497 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2498  size_t offset) {
2499  ((void (*)(void *, void *))item.reduce_init)(
2500  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2501 }
2502 
2503 template <typename T>
2504 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2505  __kmp_assert_valid_gtid(gtid);
2506  kmp_info_t *thread = __kmp_threads[gtid];
2507  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2508  kmp_uint32 nth = thread->th.th_team_nproc;
2509  kmp_taskred_data_t *arr;
2510 
2511  // check input data just in case
2512  KMP_ASSERT(tg != NULL);
2513  KMP_ASSERT(data != NULL);
2514  KMP_ASSERT(num > 0);
2515  if (nth == 1 && !__kmp_enable_hidden_helper) {
2516  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2517  gtid, tg));
2518  return (void *)tg;
2519  }
2520  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2521  gtid, tg, num));
2522  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2523  thread, num * sizeof(kmp_taskred_data_t));
2524  for (int i = 0; i < num; ++i) {
2525  size_t size = data[i].reduce_size - 1;
2526  // round the size up to cache line per thread-specific item
2527  size += CACHE_LINE - size % CACHE_LINE;
2528  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2529  arr[i].reduce_shar = data[i].reduce_shar;
2530  arr[i].reduce_size = size;
2531  arr[i].flags = data[i].flags;
2532  arr[i].reduce_comb = data[i].reduce_comb;
2533  arr[i].reduce_init = data[i].reduce_init;
2534  arr[i].reduce_fini = data[i].reduce_fini;
2535  __kmp_assign_orig<T>(arr[i], data[i]);
2536  if (!arr[i].flags.lazy_priv) {
2537  // allocate cache-line aligned block and fill it with zeros
2538  arr[i].reduce_priv = __kmp_allocate(nth * size);
2539  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2540  if (arr[i].reduce_init != NULL) {
2541  // initialize all thread-specific items
2542  for (size_t j = 0; j < nth; ++j) {
2543  __kmp_call_init<T>(arr[i], j * size);
2544  }
2545  }
2546  } else {
2547  // only allocate space for pointers now,
2548  // objects will be lazily allocated/initialized if/when requested
2549  // note that __kmp_allocate zeroes the allocated memory
2550  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2551  }
2552  }
2553  tg->reduce_data = (void *)arr;
2554  tg->reduce_num_data = num;
2555  return (void *)tg;
2556 }
2557 
2572 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2573 #if OMPX_TASKGRAPH
2574  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2575  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2576  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2577  this_tdg->rec_taskred_data =
2578  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2579  this_tdg->rec_num_taskred = num;
2580  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2581  sizeof(kmp_task_red_input_t) * num);
2582  }
2583 #endif
2584  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2585 }
2586 
2599 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2600 #if OMPX_TASKGRAPH
2601  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2602  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2603  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2604  this_tdg->rec_taskred_data =
2605  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2606  this_tdg->rec_num_taskred = num;
2607  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2608  sizeof(kmp_task_red_input_t) * num);
2609  }
2610 #endif
2611  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2612 }
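
// Editor's sketch (illustrative only): how a compiler might describe
//   #pragma omp taskgroup task_reduction(+ : sum)   // double sum
// to __kmpc_taskred_init(). The callbacks red_init/red_comb are hypothetical
// compiler-generated thunks.
#if 0
static void red_init(void *priv, void *orig) {
  (void)orig;
  *(double *)priv = 0.0; // identity value for '+'
}
static void red_comb(void *shar, void *priv) {
  *(double *)shar += *(double *)priv;
}

static void *setup_sum_reduction(int gtid, double *sum) {
  kmp_taskred_input_t item;
  item.reduce_shar = sum;             // shared item reduced into
  item.reduce_orig = sum;             // omp_orig for UDR-style initializers
  item.reduce_size = sizeof(double);
  item.reduce_init = (void *)red_init;
  item.reduce_fini = NULL;            // no finalizer needed for a POD type
  item.reduce_comb = (void *)red_comb;
  item.flags.lazy_priv = 0;           // eager per-thread allocation
  item.flags.reserved31 = 0;
  return __kmpc_taskred_init(gtid, /*num=*/1, &item);
}
#endif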
2613 
2614 // Copy task reduction data (except for shared pointers).
2615 template <typename T>
2616 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2617  kmp_taskgroup_t *tg, void *reduce_data) {
2618  kmp_taskred_data_t *arr;
2619  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2620  " from data %p\n",
2621  thr, tg, reduce_data));
2622  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2623  thr, num * sizeof(kmp_taskred_data_t));
2624  // threads will share private copies, thunk routines, sizes, flags, etc.:
2625  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2626  for (int i = 0; i < num; ++i) {
2627  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2628  }
2629  tg->reduce_data = (void *)arr;
2630  tg->reduce_num_data = num;
2631 }
2632 
2642 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2643  __kmp_assert_valid_gtid(gtid);
2644  kmp_info_t *thread = __kmp_threads[gtid];
2645  kmp_int32 nth = thread->th.th_team_nproc;
2646  if (nth == 1)
2647  return data; // nothing to do
2648 
2649  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2650  if (tg == NULL)
2651  tg = thread->th.th_current_task->td_taskgroup;
2652  KMP_ASSERT(tg != NULL);
2653  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2654  kmp_int32 num = tg->reduce_num_data;
2655  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2656 
2657 #if OMPX_TASKGRAPH
2658  if ((thread->th.th_current_task->is_taskgraph) &&
2659  (!__kmp_tdg_is_recording(
2660  __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2661  tg = thread->th.th_current_task->td_taskgroup;
2662  KMP_ASSERT(tg != NULL);
2663  KMP_ASSERT(tg->reduce_data != NULL);
2664  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2665  num = tg->reduce_num_data;
2666  }
2667 #endif
2668 
2669  KMP_ASSERT(data != NULL);
2670  while (tg != NULL) {
2671  for (int i = 0; i < num; ++i) {
2672  if (!arr[i].flags.lazy_priv) {
2673  if (data == arr[i].reduce_shar ||
2674  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2675  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2676  } else {
2677  // check shared location first
2678  void **p_priv = (void **)(arr[i].reduce_priv);
2679  if (data == arr[i].reduce_shar)
2680  goto found;
2681  // check if we get some thread specific location as parameter
2682  for (int j = 0; j < nth; ++j)
2683  if (data == p_priv[j])
2684  goto found;
2685  continue; // not found, continue search
2686  found:
2687  if (p_priv[tid] == NULL) {
2688  // allocate thread specific object lazily
2689  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2690  if (arr[i].reduce_init != NULL) {
2691  if (arr[i].reduce_orig != NULL) { // new interface
2692  ((void (*)(void *, void *))arr[i].reduce_init)(
2693  p_priv[tid], arr[i].reduce_orig);
2694  } else { // old interface (single parameter)
2695  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2696  }
2697  }
2698  }
2699  return p_priv[tid];
2700  }
2701  }
2702  KMP_ASSERT(tg->parent);
2703  tg = tg->parent;
2704  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2705  num = tg->reduce_num_data;
2706  }
2707  KMP_ASSERT2(0, "Unknown task reduction item");
2708  return NULL; // ERROR, this line never executed
2709 }
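
// Editor's sketch (illustrative only): inside a compiler-generated task body,
// every reference to the reduction item is redirected to the calling thread's
// private copy. tg is the handle returned by __kmpc_taskred_init() (or NULL to
// use the current taskgroup) and sum is the registered shared item.
#if 0
static void task_body_fragment(int gtid, void *tg, double *sum,
                               double contribution) {
  double *priv = (double *)__kmpc_task_reduction_get_th_data(gtid, tg, sum);
  *priv += contribution; // no locking needed: the copy is per-thread
}
#endif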
2710 
2711 // Finalize task reduction.
2712 // Called from __kmpc_end_taskgroup()
2713 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2714  kmp_int32 nth = th->th.th_team_nproc;
2715  KMP_DEBUG_ASSERT(
2716  nth > 1 ||
2717  __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2718  // are using hidden helper threads
2719  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2720  kmp_int32 num = tg->reduce_num_data;
2721  for (int i = 0; i < num; ++i) {
2722  void *sh_data = arr[i].reduce_shar;
2723  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2724  void (*f_comb)(void *, void *) =
2725  (void (*)(void *, void *))(arr[i].reduce_comb);
2726  if (!arr[i].flags.lazy_priv) {
2727  void *pr_data = arr[i].reduce_priv;
2728  size_t size = arr[i].reduce_size;
2729  for (int j = 0; j < nth; ++j) {
2730  void *priv_data = (char *)pr_data + j * size;
2731  f_comb(sh_data, priv_data); // combine results
2732  if (f_fini)
2733  f_fini(priv_data); // finalize if needed
2734  }
2735  } else {
2736  void **pr_data = (void **)(arr[i].reduce_priv);
2737  for (int j = 0; j < nth; ++j) {
2738  if (pr_data[j] != NULL) {
2739  f_comb(sh_data, pr_data[j]); // combine results
2740  if (f_fini)
2741  f_fini(pr_data[j]); // finalize if needed
2742  __kmp_free(pr_data[j]);
2743  }
2744  }
2745  }
2746  __kmp_free(arr[i].reduce_priv);
2747  }
2748  __kmp_thread_free(th, arr);
2749  tg->reduce_data = NULL;
2750  tg->reduce_num_data = 0;
2751 }
2752 
2753 // Cleanup task reduction data for parallel or worksharing,
2754 // do not touch task private data other threads still working with.
2755 // Called from __kmpc_end_taskgroup()
2756 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2757  __kmp_thread_free(th, tg->reduce_data);
2758  tg->reduce_data = NULL;
2759  tg->reduce_num_data = 0;
2760 }
2761 
2762 template <typename T>
2763 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2764  int num, T *data) {
2765  __kmp_assert_valid_gtid(gtid);
2766  kmp_info_t *thr = __kmp_threads[gtid];
2767  kmp_int32 nth = thr->th.th_team_nproc;
2768  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2769  if (nth == 1) {
2770  KA_TRACE(10,
2771  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2772  gtid, thr->th.th_current_task->td_taskgroup));
2773  return (void *)thr->th.th_current_task->td_taskgroup;
2774  }
2775  kmp_team_t *team = thr->th.th_team;
2776  void *reduce_data;
2777  kmp_taskgroup_t *tg;
2778  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2779  if (reduce_data == NULL &&
2780  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2781  (void *)1)) {
2782  // single thread enters this block to initialize common reduction data
2783  KMP_DEBUG_ASSERT(reduce_data == NULL);
2784  // first initialize own data, then make a copy other threads can use
2785  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2786  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2787  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2788  // fini counters should be 0 at this point
2789  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2790  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2791  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2792  } else {
2793  while (
2794  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2795  (void *)1) { // wait for task reduction initialization
2796  KMP_CPU_PAUSE();
2797  }
2798  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2799  tg = thr->th.th_current_task->td_taskgroup;
2800  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2801  }
2802  return tg;
2803 }
2804 
2821 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2822  int num, void *data) {
2823  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2824  (kmp_task_red_input_t *)data);
2825 }
2826 
2841 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2842  void *data) {
2843  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2844  (kmp_taskred_input_t *)data);
2845 }
2846 
2855 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2856  __kmpc_end_taskgroup(loc, gtid);
2857 }
2858 
2859 // __kmpc_taskgroup: Start a new taskgroup
2860 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2861  __kmp_assert_valid_gtid(gtid);
2862  kmp_info_t *thread = __kmp_threads[gtid];
2863  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2864  kmp_taskgroup_t *tg_new =
2865  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2866  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2867  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2868  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2869  tg_new->parent = taskdata->td_taskgroup;
2870  tg_new->reduce_data = NULL;
2871  tg_new->reduce_num_data = 0;
2872  tg_new->gomp_data = NULL;
2873  taskdata->td_taskgroup = tg_new;
2874 
2875 #if OMPT_SUPPORT && OMPT_OPTIONAL
2876  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2877  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2878  if (!codeptr)
2879  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2880  kmp_team_t *team = thread->th.th_team;
2881  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2882  // FIXME: I think this is wrong for lwt!
2883  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2884 
2885  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2886  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2887  &(my_task_data), codeptr);
2888  }
2889 #endif
2890 }
2891 
2892 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2893 // and its descendants are complete
2894 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2895  __kmp_assert_valid_gtid(gtid);
2896  kmp_info_t *thread = __kmp_threads[gtid];
2897  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2898  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2899  int thread_finished = FALSE;
2900 
2901 #if OMPT_SUPPORT && OMPT_OPTIONAL
2902  kmp_team_t *team;
2903  ompt_data_t my_task_data;
2904  ompt_data_t my_parallel_data;
2905  void *codeptr = nullptr;
2906  if (UNLIKELY(ompt_enabled.enabled)) {
2907  team = thread->th.th_team;
2908  my_task_data = taskdata->ompt_task_info.task_data;
2909  // FIXME: I think this is wrong for lwt!
2910  my_parallel_data = team->t.ompt_team_info.parallel_data;
2911  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2912  if (!codeptr)
2913  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2914  }
2915 #endif
2916 
2917  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2918  KMP_DEBUG_ASSERT(taskgroup != NULL);
2919  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2920 
2921  if (__kmp_tasking_mode != tskm_immediate_exec) {
2922  // mark task as waiting not on a barrier
2923  taskdata->td_taskwait_counter += 1;
2924  taskdata->td_taskwait_ident = loc;
2925  taskdata->td_taskwait_thread = gtid + 1;
2926 #if USE_ITT_BUILD
2927  // For ITT the taskgroup wait is similar to taskwait until we need to
2928  // distinguish them
2929  void *itt_sync_obj = NULL;
2930 #if USE_ITT_NOTIFY
2931  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2932 #endif /* USE_ITT_NOTIFY */
2933 #endif /* USE_ITT_BUILD */
2934 
2935 #if OMPT_SUPPORT && OMPT_OPTIONAL
2936  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2937  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2938  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2939  &(my_task_data), codeptr);
2940  }
2941 #endif
2942 
2943  if (!taskdata->td_flags.team_serial ||
2944  (thread->th.th_task_team != NULL &&
2945  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2946  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2947  kmp_flag_32<false, false> flag(
2948  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2949  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2950  flag.execute_tasks(thread, gtid, FALSE,
2951  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2952  __kmp_task_stealing_constraint);
2953  }
2954  }
2955  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2956 
2957 #if OMPT_SUPPORT && OMPT_OPTIONAL
2958  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2959  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2960  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2961  &(my_task_data), codeptr);
2962  }
2963 #endif
2964 
2965 #if USE_ITT_BUILD
2966  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2967  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2968 #endif /* USE_ITT_BUILD */
2969  }
2970  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2971 
2972  if (taskgroup->reduce_data != NULL &&
2973  !taskgroup->gomp_data) { // need to reduce?
2974  int cnt;
2975  void *reduce_data;
2976  kmp_team_t *t = thread->th.th_team;
2977  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2978  // check if <priv> data of the first reduction variable is shared for the team
2979  void *priv0 = arr[0].reduce_priv;
2980  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2981  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2982  // finishing task reduction on parallel
2983  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2984  if (cnt == thread->th.th_team_nproc - 1) {
2985  // we are the last thread passing __kmpc_reduction_modifier_fini()
2986  // finalize task reduction:
2987  __kmp_task_reduction_fini(thread, taskgroup);
2988  // cleanup fields in the team structure:
2989  // TODO: is relaxed store enough here (whole barrier should follow)?
2990  __kmp_thread_free(thread, reduce_data);
2991  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2992  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2993  } else {
2994  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2995  // so do not finalize reduction, just clean own copy of the data
2996  __kmp_task_reduction_clean(thread, taskgroup);
2997  }
2998  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2999  NULL &&
3000  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
3001  // finishing task reduction on worksharing
3002  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
3003  if (cnt == thread->th.th_team_nproc - 1) {
3004  // we are the last thread passing __kmpc_reduction_modifier_fini()
3005  __kmp_task_reduction_fini(thread, taskgroup);
3006  // cleanup fields in team structure:
3007  // TODO: is relaxed store enough here (whole barrier should follow)?
3008  __kmp_thread_free(thread, reduce_data);
3009  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
3010  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
3011  } else {
3012  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3013  // so do not finalize reduction, just clean own copy of the data
3014  __kmp_task_reduction_clean(thread, taskgroup);
3015  }
3016  } else {
3017  // finishing task reduction on taskgroup
3018  __kmp_task_reduction_fini(thread, taskgroup);
3019  }
3020  }
3021  // Restore parent taskgroup for the current task
3022  taskdata->td_taskgroup = taskgroup->parent;
3023  __kmp_thread_free(thread, taskgroup);
3024 
3025  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
3026  gtid, taskdata));
3027 
3028 #if OMPT_SUPPORT && OMPT_OPTIONAL
3029  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
3030  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
3031  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
3032  &(my_task_data), codeptr);
3033  }
3034 #endif
3035 }
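
// Illustrative sketch (not part of the runtime): a minimal user program whose
// taskgroup region is what ultimately drives the __kmpc_taskgroup /
// __kmpc_end_taskgroup pair above. The lowering shown in the comments is a
// rough assumption of what the compiler emits, not a guarantee.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel
#pragma omp single
  {
    // roughly: __kmpc_taskgroup(&loc, gtid);
#pragma omp taskgroup
    {
      for (int i = 0; i < 8; ++i) {
#pragma omp task firstprivate(i)
        printf("task %d run by thread %d\n", i, omp_get_thread_num());
      }
    } // roughly: __kmpc_end_taskgroup(&loc, gtid); waits for all 8 tasks
  }
  return 0;
}
#endif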
3036 
3037 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
3038  kmp_task_team_t *task_team,
3039  kmp_int32 is_constrained) {
3040  kmp_task_t *task = NULL;
3041  kmp_taskdata_t *taskdata;
3042  kmp_taskdata_t *current;
3043  kmp_thread_data_t *thread_data;
3044  int ntasks = task_team->tt.tt_num_task_pri;
3045  if (ntasks == 0) {
3046  KA_TRACE(
3047  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
3048  return NULL;
3049  }
3050  do {
3051  // decrement num_tasks to "reserve" one task for execution
3052  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
3053  ntasks - 1))
3054  break;
3055  ntasks = task_team->tt.tt_num_task_pri;
3056  } while (ntasks > 0);
3057  if (ntasks == 0) {
3058  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
3059  __kmp_get_gtid()));
3060  return NULL;
3061  }
3062  // We got a "ticket" to get a "reserved" priority task
3063  int deque_ntasks;
3064  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3065  do {
3066  KMP_ASSERT(list != NULL);
3067  thread_data = &list->td;
3068  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3069  deque_ntasks = thread_data->td.td_deque_ntasks;
3070  if (deque_ntasks == 0) {
3071  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3072  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
3073  __kmp_get_gtid(), thread_data));
3074  list = list->next;
3075  }
3076  } while (deque_ntasks == 0);
3077  KMP_DEBUG_ASSERT(deque_ntasks);
3078  int target = thread_data->td.td_deque_head;
3079  current = __kmp_threads[gtid]->th.th_current_task;
3080  taskdata = thread_data->td.td_deque[target];
3081  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3082  // Bump head pointer and Wrap.
3083  thread_data->td.td_deque_head =
3084  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3085  } else {
3086  if (!task_team->tt.tt_untied_task_encountered) {
3087  // The TSC does not allow stealing the victim task
3088  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3089  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
3090  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
3091  gtid, thread_data, task_team, deque_ntasks, target,
3092  thread_data->td.td_deque_tail));
3093  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3094  return NULL;
3095  }
3096  int i;
3097  // walk through the deque trying to steal any task
3098  taskdata = NULL;
3099  for (i = 1; i < deque_ntasks; ++i) {
3100  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3101  taskdata = thread_data->td.td_deque[target];
3102  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3103  break; // found task to execute
3104  } else {
3105  taskdata = NULL;
3106  }
3107  }
3108  if (taskdata == NULL) {
3109  // No appropriate candidate found to execute
3110  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3111  KA_TRACE(
3112  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
3113  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
3114  gtid, thread_data, task_team, deque_ntasks,
3115  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3116  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3117  return NULL;
3118  }
3119  int prev = target;
3120  for (i = i + 1; i < deque_ntasks; ++i) {
3121  // shift remaining tasks in the deque left by 1
3122  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3123  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
3124  prev = target;
3125  }
3126  KMP_DEBUG_ASSERT(
3127  thread_data->td.td_deque_tail ==
3128  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
3129  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
3130  }
3131  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
3132  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3133  task = KMP_TASKDATA_TO_TASK(taskdata);
3134  return task;
3135 }
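
// Illustrative sketch (not part of the runtime): the "reserve one task" loop
// above is a plain compare-and-swap decrement. A standalone equivalent using
// std::atomic (ntasks standing in for tt_num_task_pri) could look like this:
#if 0
#include <atomic>

// Returns true if one item was reserved, false if the counter was already 0.
static bool reserve_one(std::atomic<int> &ntasks) {
  int n = ntasks.load(std::memory_order_relaxed);
  while (n > 0) {
    // On failure, compare_exchange_weak reloads n with the current value.
    if (ntasks.compare_exchange_weak(n, n - 1))
      return true;
  }
  return false;
}
#endif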
3136 
3137 // __kmp_remove_my_task: remove a task from my own deque
3138 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
3139  kmp_task_team_t *task_team,
3140  kmp_int32 is_constrained) {
3141  kmp_task_t *task;
3142  kmp_taskdata_t *taskdata;
3143  kmp_thread_data_t *thread_data;
3144  kmp_uint32 tail;
3145 
3146  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3147  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
3148  NULL); // Caller should check this condition
3149 
3150  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
3151 
3152  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
3153  gtid, thread_data->td.td_deque_ntasks,
3154  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3155 
3156  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3157  KA_TRACE(10,
3158  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
3159  "ntasks=%d head=%u tail=%u\n",
3160  gtid, thread_data->td.td_deque_ntasks,
3161  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3162  return NULL;
3163  }
3164 
3165  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3166 
3167  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3168  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3169  KA_TRACE(10,
3170  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
3171  "ntasks=%d head=%u tail=%u\n",
3172  gtid, thread_data->td.td_deque_ntasks,
3173  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3174  return NULL;
3175  }
3176 
3177  tail = (thread_data->td.td_deque_tail - 1) &
3178  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
3179  taskdata = thread_data->td.td_deque[tail];
3180 
3181  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
3182  thread->th.th_current_task)) {
3183  // The TSC does not allow stealing the victim task
3184  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3185  KA_TRACE(10,
3186  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3187  "ntasks=%d head=%u tail=%u\n",
3188  gtid, thread_data->td.td_deque_ntasks,
3189  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3190  return NULL;
3191  }
3192 
3193  thread_data->td.td_deque_tail = tail;
3194  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3195 
3196  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3197 
3198  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3199  "ntasks=%d head=%u tail=%u\n",
3200  gtid, taskdata, thread_data->td.td_deque_ntasks,
3201  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3202 
3203  task = KMP_TASKDATA_TO_TASK(taskdata);
3204  return task;
3205 }
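
// Illustrative sketch (not part of the runtime): TASK_DEQUE_MASK relies on the
// deque size being a power of two, so "index & (size - 1)" behaves like
// "index % size" and the tail decrement above wraps correctly even from 0.
#if 0
#include <cassert>

static unsigned wrap_dec(unsigned tail, unsigned size /* power of two */) {
  return (tail - 1) & (size - 1); // e.g. size 256, tail 0 -> 255
}

int main() {
  assert(wrap_dec(0, 256) == 255);
  assert(wrap_dec(10, 256) == 9);
  return 0;
}
#endif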
3206 
3207 // __kmp_steal_task: remove a task from another thread's deque
3208 // Assume that the calling thread has already checked the existence of the
3209 // task_team thread_data before calling this routine.
3210 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
3211  kmp_task_team_t *task_team,
3212  std::atomic<kmp_int32> *unfinished_threads,
3213  int *thread_finished,
3214  kmp_int32 is_constrained) {
3215  kmp_task_t *task;
3216  kmp_taskdata_t *taskdata;
3217  kmp_taskdata_t *current;
3218  kmp_thread_data_t *victim_td, *threads_data;
3219  kmp_int32 target;
3220  kmp_int32 victim_tid;
3221 
3222  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3223 
3224  threads_data = task_team->tt.tt_threads_data;
3225  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3226 
3227  victim_tid = victim_thr->th.th_info.ds.ds_tid;
3228  victim_td = &threads_data[victim_tid];
3229 
3230  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3231  "task_team=%p ntasks=%d head=%u tail=%u\n",
3232  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3233  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3234  victim_td->td.td_deque_tail));
3235 
3236  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3237  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3238  "task_team=%p ntasks=%d head=%u tail=%u\n",
3239  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3240  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3241  victim_td->td.td_deque_tail));
3242  return NULL;
3243  }
3244 
3245  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3246 
3247  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3248  // Check again after we acquire the lock
3249  if (ntasks == 0) {
3250  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3251  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3252  "task_team=%p ntasks=%d head=%u tail=%u\n",
3253  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3254  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3255  return NULL;
3256  }
3257 
3258  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3259  current = __kmp_threads[gtid]->th.th_current_task;
3260  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3261  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3262  // Bump head pointer and Wrap.
3263  victim_td->td.td_deque_head =
3264  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3265  } else {
3266  if (!task_team->tt.tt_untied_task_encountered) {
3267  // The TSC does not allow stealing the victim task
3268  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3269  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3270  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3271  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3272  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3273  return NULL;
3274  }
3275  int i;
3276  // walk through victim's deque trying to steal any task
3277  target = victim_td->td.td_deque_head;
3278  taskdata = NULL;
3279  for (i = 1; i < ntasks; ++i) {
3280  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3281  taskdata = victim_td->td.td_deque[target];
3282  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3283  break; // found victim task
3284  } else {
3285  taskdata = NULL;
3286  }
3287  }
3288  if (taskdata == NULL) {
3289  // No appropriate candidate to steal found
3290  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3291  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3292  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3293  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3294  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3295  return NULL;
3296  }
3297  int prev = target;
3298  for (i = i + 1; i < ntasks; ++i) {
3299  // shift remaining tasks in the deque left by 1
3300  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3301  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3302  prev = target;
3303  }
3304  KMP_DEBUG_ASSERT(
3305  victim_td->td.td_deque_tail ==
3306  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3307  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3308  }
3309  if (*thread_finished) {
3310  // We need to un-mark this victim as a finished victim. This must be done
3311  // before releasing the lock, or else other threads (starting with the
3312  // primary thread victim) might be prematurely released from the barrier!!!
3313 #if KMP_DEBUG
3314  kmp_int32 count =
3315 #endif
3316  KMP_ATOMIC_INC(unfinished_threads);
3317  KA_TRACE(
3318  20,
3319  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3320  gtid, count + 1, task_team));
3321  *thread_finished = FALSE;
3322  }
3323  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3324 
3325  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3326 
3327  KMP_COUNT_BLOCK(TASK_stolen);
3328  KA_TRACE(10,
3329  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3330  "task_team=%p ntasks=%d head=%u tail=%u\n",
3331  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3332  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3333 
3334  task = KMP_TASKDATA_TO_TASK(taskdata);
3335  return task;
3336 }
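
// Illustrative sketch (not part of the runtime): the owner pops from the tail
// (__kmp_remove_my_task) while thieves take from the head (above), so the two
// sides work on opposite ends of the deque. A minimal locked version of that
// head/tail discipline, with a fixed power-of-two capacity, might look like:
#if 0
#include <mutex>
#include <vector>

struct MiniDeque {
  std::mutex lock;
  unsigned mask = 255;                          // capacity 256, a power of two
  std::vector<int> buf = std::vector<int>(256); // holds task ids for the sketch
  unsigned head = 0, tail = 0, ntasks = 0;

  bool push_tail(int id) { // owner enqueues new work at the tail
    std::lock_guard<std::mutex> g(lock);
    if (ntasks > mask)
      return false; // full (the runtime would grow the deque instead)
    buf[tail] = id;
    tail = (tail + 1) & mask;
    ++ntasks;
    return true;
  }
  bool pop_tail(int *out) { // owner takes the most recently pushed task
    std::lock_guard<std::mutex> g(lock);
    if (ntasks == 0)
      return false;
    tail = (tail - 1) & mask;
    *out = buf[tail];
    --ntasks;
    return true;
  }
  bool steal_head(int *out) { // thief takes the oldest task
    std::lock_guard<std::mutex> g(lock);
    if (ntasks == 0)
      return false;
    *out = buf[head];
    head = (head + 1) & mask;
    --ntasks;
    return true;
  }
};
#endif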
3337 
3338 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3339 // condition is satisfied (return true) or there are none left (return false).
3340 //
3341 // final_spin is TRUE if this is the spin at the release barrier.
3342 // thread_finished indicates whether the thread is finished executing all
3343 // the tasks it has on its deque, and is at the release barrier.
3344 // spinner is the location on which to spin.
3345 // spinner == NULL means only execute a single task and return.
3346 // checker is the value to check to terminate the spin.
3347 template <class C>
3348 static inline int __kmp_execute_tasks_template(
3349  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3350  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3351  kmp_int32 is_constrained) {
3352  kmp_task_team_t *task_team = thread->th.th_task_team;
3353  kmp_thread_data_t *threads_data;
3354  kmp_task_t *task;
3355  kmp_info_t *other_thread;
3356  kmp_taskdata_t *current_task = thread->th.th_current_task;
3357  std::atomic<kmp_int32> *unfinished_threads;
3358  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3359  tid = thread->th.th_info.ds.ds_tid;
3360 
3361  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3362  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3363 
3364  if (task_team == NULL || current_task == NULL)
3365  return FALSE;
3366 
3367  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3368  "*thread_finished=%d\n",
3369  gtid, final_spin, *thread_finished));
3370 
3371  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3372  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3373 
3374  KMP_DEBUG_ASSERT(threads_data != NULL);
3375 
3376  nthreads = task_team->tt.tt_nproc;
3377  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3378  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
3379  task_team->tt.tt_hidden_helper_task_encountered);
3380  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3381 
3382  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3383  // getting tasks from target constructs
3384  while (1) { // Inner loop to find a task and execute it
3385  task = NULL;
3386  if (task_team->tt.tt_num_task_pri) { // get priority task first
3387  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3388  }
3389  if (task == NULL && use_own_tasks) { // check own queue next
3390  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3391  }
3392  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3393  int asleep = 1;
3394  use_own_tasks = 0;
3395  // Try to steal from the last place I stole from successfully.
3396  if (victim_tid == -2) { // haven't stolen anything yet
3397  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3398  if (victim_tid !=
3399  -1) // if we have a last stolen from victim, get the thread
3400  other_thread = threads_data[victim_tid].td.td_thr;
3401  }
3402  if (victim_tid != -1) { // found last victim
3403  asleep = 0;
3404  } else if (!new_victim) { // no recent steals and we haven't already
3405  // used a new victim; select a random thread
3406  do { // Find a different thread to steal work from.
3407  // Pick a random thread. Initial plan was to cycle through all the
3408  // threads, and only return if we tried to steal from every thread,
3409  // and failed. Arch says that's not such a great idea.
3410  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3411  if (victim_tid >= tid) {
3412  ++victim_tid; // Adjusts random distribution to exclude self
3413  }
3414  // Found a potential victim
3415  other_thread = threads_data[victim_tid].td.td_thr;
3416  // There is a slight chance that __kmp_enable_tasking() did not wake
3417  // up all threads waiting at the barrier. If victim is sleeping,
3418  // then wake it up. Since we were going to pay the cache miss
3419  // penalty for referencing another thread's kmp_info_t struct
3420  // anyway,
3421  // the check shouldn't cost too much performance at this point. In
3422  // extra barrier mode, threads do not sleep at the separate tasking
3423  // barrier, so this isn't a problem.
3424  asleep = 0;
3425  if ((__kmp_tasking_mode == tskm_task_teams) &&
3426  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3427  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3428  NULL)) {
3429  asleep = 1;
3430  __kmp_null_resume_wrapper(other_thread);
3431  // A sleeping thread should not have any tasks on its queue.
3432  // There is a slight possibility that it resumes, steals a task
3433  // from another thread, which spawns more tasks, all in the time
3434  // that it takes this thread to check => don't write an assertion
3435  // that the victim's queue is empty. Try stealing from a
3436  // different thread.
3437  }
3438  } while (asleep);
3439  }
3440 
3441  if (!asleep) {
3442  // We have a victim to try to steal from
3443  task = __kmp_steal_task(other_thread, gtid, task_team,
3444  unfinished_threads, thread_finished,
3445  is_constrained);
3446  }
3447  if (task != NULL) { // set last stolen to victim
3448  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3449  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3450  // The pre-refactored code did not try more than 1 successful new
3451  // victim, unless the last one generated more local tasks;
3452  // new_victim keeps track of this
3453  new_victim = 1;
3454  }
3455  } else { // No tasks found; unset last_stolen
3456  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3457  victim_tid = -2; // no successful victim found
3458  }
3459  }
3460 
3461  if (task == NULL)
3462  break; // break out of tasking loop
3463 
3464 // Found a task; execute it
3465 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3466  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3467  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3468  // get the object reliably
3469  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3470  }
3471  __kmp_itt_task_starting(itt_sync_obj);
3472  }
3473 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3474  __kmp_invoke_task(gtid, task, current_task);
3475 #if USE_ITT_BUILD
3476  if (itt_sync_obj != NULL)
3477  __kmp_itt_task_finished(itt_sync_obj);
3478 #endif /* USE_ITT_BUILD */
3479  // If this thread is only partway through the barrier and the condition is
3480  // met, then return now, so that the barrier gather/release pattern can
3481  // proceed. If this thread is in the last spin loop in the barrier,
3482  // waiting to be released, we know that the termination condition will not
3483  // be satisfied, so don't waste any cycles checking it.
3484  if (flag == NULL || (!final_spin && flag->done_check())) {
3485  KA_TRACE(
3486  15,
3487  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3488  gtid));
3489  return TRUE;
3490  }
3491  if (thread->th.th_task_team == NULL) {
3492  break;
3493  }
3494  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3495  // If execution of a stolen task results in more tasks being placed on our
3496  // run queue, reset use_own_tasks
3497  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3498  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3499  "other tasks, restart\n",
3500  gtid));
3501  use_own_tasks = 1;
3502  new_victim = 0;
3503  }
3504  }
3505 
3506  // The task source has been exhausted. If in final spin loop of barrier,
3507  // check if termination condition is satisfied. The work queue may be empty
3508  // but there might be proxy tasks still executing.
3509  if (final_spin &&
3510  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3511  // First, decrement the #unfinished threads, if that has not already been
3512  // done. This decrement might be to the spin location, and result in the
3513  // termination condition being satisfied.
3514  if (!*thread_finished) {
3515 #if KMP_DEBUG
3516  kmp_int32 count = -1 +
3517 #endif
3518  KMP_ATOMIC_DEC(unfinished_threads);
3519  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3520  "unfinished_threads to %d task_team=%p\n",
3521  gtid, count, task_team));
3522  *thread_finished = TRUE;
3523  }
3524 
3525  // It is now unsafe to reference thread->th.th_team !!!
3526  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3527  // thread to pass through the barrier, where it might reset each thread's
3528  // th.th_team field for the next parallel region. If we can steal more
3529  // work, we know that this has not happened yet.
3530  if (flag != NULL && flag->done_check()) {
3531  KA_TRACE(
3532  15,
3533  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3534  gtid));
3535  return TRUE;
3536  }
3537  }
3538 
3539  // If this thread's task team is NULL, primary thread has recognized that
3540  // there are no more tasks; bail out
3541  if (thread->th.th_task_team == NULL) {
3542  KA_TRACE(15,
3543  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3544  return FALSE;
3545  }
3546 
3547  // Check the flag again to see if it is already done, to avoid being trapped
3548  // in an infinite loop when an if0 task depends on a hidden helper task
3549  // outside any parallel region. Detached tasks are not impacted in this case
3550  // because the only thread executing this function has to execute the proxy
3551  // task so it is in another code path that has the same check.
3552  if (flag == NULL || (!final_spin && flag->done_check())) {
3553  KA_TRACE(15,
3554  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3555  gtid));
3556  return TRUE;
3557  }
3558 
3559  // We could be getting tasks from target constructs; if this is the only
3560  // thread, keep trying to execute tasks from own queue
3561  if (nthreads == 1 &&
3562  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3563  use_own_tasks = 1;
3564  else {
3565  KA_TRACE(15,
3566  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3567  return FALSE;
3568  }
3569  }
3570 }
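
// Illustrative sketch (not part of the runtime): the random victim selection
// above draws from nthreads - 1 values and bumps results at or above the
// caller's tid by one, giving a uniform choice over "every thread but me".
// pick_victim below is a hypothetical standalone helper assuming nthreads >= 2.
#if 0
#include <cstdlib>

static int pick_victim(int self_tid, int nthreads) {
  int victim = std::rand() % (nthreads - 1); // uniform over 0 .. nthreads-2
  if (victim >= self_tid)
    ++victim; // skip over self; the other tids stay equally likely
  return victim;
}
#endif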
3571 
3572 template <bool C, bool S>
3573 int __kmp_execute_tasks_32(
3574  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3575  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3576  kmp_int32 is_constrained) {
3577  return __kmp_execute_tasks_template(
3578  thread, gtid, flag, final_spin,
3579  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3580 }
3581 
3582 template <bool C, bool S>
3583 int __kmp_execute_tasks_64(
3584  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3585  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3586  kmp_int32 is_constrained) {
3587  return __kmp_execute_tasks_template(
3588  thread, gtid, flag, final_spin,
3589  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3590 }
3591 
3592 template <bool C, bool S>
3593 int __kmp_atomic_execute_tasks_64(
3594  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3595  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3596  kmp_int32 is_constrained) {
3597  return __kmp_execute_tasks_template(
3598  thread, gtid, flag, final_spin,
3599  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3600 }
3601 
3602 int __kmp_execute_tasks_oncore(
3603  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3604  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3605  kmp_int32 is_constrained) {
3606  return __kmp_execute_tasks_template(
3607  thread, gtid, flag, final_spin,
3608  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3609 }
3610 
3611 template int
3612 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3613  kmp_flag_32<false, false> *, int,
3614  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3615 
3616 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3617  kmp_flag_64<false, true> *,
3618  int,
3619  int *USE_ITT_BUILD_ARG(void *),
3620  kmp_int32);
3621 
3622 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3623  kmp_flag_64<true, false> *,
3624  int,
3625  int *USE_ITT_BUILD_ARG(void *),
3626  kmp_int32);
3627 
3628 template int __kmp_atomic_execute_tasks_64<false, true>(
3629  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3630  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3631 
3632 template int __kmp_atomic_execute_tasks_64<true, false>(
3633  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3634  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3635 
3636 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3637 // next barrier so they can assist in executing enqueued tasks.
3638 // First thread in allocates the task team atomically.
3639 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3640  kmp_info_t *this_thr) {
3641  kmp_thread_data_t *threads_data;
3642  int nthreads, i, is_init_thread;
3643 
3644  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3645  __kmp_gtid_from_thread(this_thr)));
3646 
3647  KMP_DEBUG_ASSERT(task_team != NULL);
3648  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3649 
3650  nthreads = task_team->tt.tt_nproc;
3651  KMP_DEBUG_ASSERT(nthreads > 0);
3652  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3653 
3654  // Allocate or increase the size of threads_data if necessary
3655  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3656 
3657  if (!is_init_thread) {
3658  // Some other thread already set up the array.
3659  KA_TRACE(
3660  20,
3661  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3662  __kmp_gtid_from_thread(this_thr)));
3663  return;
3664  }
3665  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3666  KMP_DEBUG_ASSERT(threads_data != NULL);
3667 
3668  if (__kmp_tasking_mode == tskm_task_teams &&
3669  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3670  // Release any threads sleeping at the barrier, so that they can steal
3671  // tasks and execute them. In extra barrier mode, threads do not sleep
3672  // at the separate tasking barrier, so this isn't a problem.
3673  for (i = 0; i < nthreads; i++) {
3674  void *sleep_loc;
3675  kmp_info_t *thread = threads_data[i].td.td_thr;
3676 
3677  if (i == this_thr->th.th_info.ds.ds_tid) {
3678  continue;
3679  }
3680  // Since we haven't locked the thread's suspend mutex at this
3681  // point, there is a small window where a thread might be putting
3682  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3683  // To work around this, __kmp_execute_tasks_template() periodically checks
3684  // to see if other threads are sleeping (using the same random mechanism that
3685  // is used for task stealing) and awakens them if they are.
3686  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3687  NULL) {
3688  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3689  __kmp_gtid_from_thread(this_thr),
3690  __kmp_gtid_from_thread(thread)));
3691  __kmp_null_resume_wrapper(thread);
3692  } else {
3693  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3694  __kmp_gtid_from_thread(this_thr),
3695  __kmp_gtid_from_thread(thread)));
3696  }
3697  }
3698  }
3699 
3700  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3701  __kmp_gtid_from_thread(this_thr)));
3702 }
3703 
3704 /* // TODO: Check the comment consistency
3705  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3706  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3707  * After a child thread checks into a barrier and calls __kmp_release() from
3708  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3709  * longer assume that the kmp_team_t structure is intact (at any moment, the
3710  * primary thread may exit the barrier code and free the team data structure,
3711  * and return the threads to the thread pool).
3712  *
3713  * This does not work with the tasking code, as the thread is still
3714  * expected to participate in the execution of any tasks that may have been
3715  * spawned by a member of the team, and the thread still needs access
3716  * to each thread in the team, so that it can steal work from it.
3717  *
3718  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3719  * counting mechanism, and is allocated by the primary thread before calling
3720  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3721  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3722  * of the kmp_task_team_t structs for consecutive barriers can overlap
3723  * (and will, unless the primary thread is the last thread to exit the barrier
3724  * release phase, which is not typical). The existence of such a struct is
3725  * useful outside the context of tasking.
3726  *
3727  * We currently use the existence of the threads array as an indicator that
3728  * tasks were spawned since the last barrier. If the structure is to be
3729  * useful outside the context of tasking, then this will have to change, but
3730  * not setting the field minimizes the performance impact of tasking on
3731  * barriers, when no explicit tasks were spawned (pushed, actually).
3732  */
3733 
3734 static kmp_task_team_t *__kmp_free_task_teams =
3735  NULL; // Free list for task_team data structures
3736 // Lock for task team data structures
3737 kmp_bootstrap_lock_t __kmp_task_team_lock =
3738  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3739 
3740 // __kmp_alloc_task_deque:
3741 // Allocates a task deque for a particular thread, and initializes the necessary
3742 // data structures relating to the deque. This only happens once per thread
3743 // per task team since task teams are recycled. No lock is needed during
3744 // allocation since each thread allocates its own deque.
3745 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3746  kmp_thread_data_t *thread_data) {
3747  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3748  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3749 
3750  // Initialize last stolen task field to "none"
3751  thread_data->td.td_deque_last_stolen = -1;
3752 
3753  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3754  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3755  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3756 
3757  KE_TRACE(
3758  10,
3759  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3760  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3761  // Allocate space for task deque, and zero the deque
3762  // Cannot use __kmp_thread_calloc() because threads not around for
3763  // kmp_reap_task_team( ).
3764  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3765  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3766  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3767 }
3768 
3769 // __kmp_free_task_deque:
3770 // Deallocates a task deque for a particular thread. Happens at library
3771 // deallocation so there is no need to reset all thread data fields.
3772 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3773  if (thread_data->td.td_deque != NULL) {
3774  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3775  TCW_4(thread_data->td.td_deque_ntasks, 0);
3776  __kmp_free(thread_data->td.td_deque);
3777  thread_data->td.td_deque = NULL;
3778  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3779  }
3780 
3781 #ifdef BUILD_TIED_TASK_STACK
3782  // GEH: Figure out what to do here for td_susp_tied_tasks
3783  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3784  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3785  }
3786 #endif // BUILD_TIED_TASK_STACK
3787 }
3788 
3789 // __kmp_realloc_task_threads_data:
3790 // Allocates a threads_data array for a task team, either by allocating an
3791 // initial array or enlarging an existing array. Only the first thread to get
3792 // the lock allocs or enlarges the array and re-initializes the array elements.
3793 // That thread returns "TRUE", the rest return "FALSE".
3794 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3795 // The current size is given by task_team -> tt.tt_max_threads.
3796 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3797  kmp_task_team_t *task_team) {
3798  kmp_thread_data_t **threads_data_p;
3799  kmp_int32 nthreads, maxthreads;
3800  int is_init_thread = FALSE;
3801 
3802  if (TCR_4(task_team->tt.tt_found_tasks)) {
3803  // Already reallocated and initialized.
3804  return FALSE;
3805  }
3806 
3807  threads_data_p = &task_team->tt.tt_threads_data;
3808  nthreads = task_team->tt.tt_nproc;
3809  maxthreads = task_team->tt.tt_max_threads;
3810 
3811  // All threads must lock when they encounter the first task of the implicit
3812  // task region to make sure threads_data fields are (re)initialized before
3813  // being used.
3814  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3815 
3816  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3817  // first thread to enable tasking
3818  kmp_team_t *team = thread->th.th_team;
3819  int i;
3820 
3821  is_init_thread = TRUE;
3822  if (maxthreads < nthreads) {
3823 
3824  if (*threads_data_p != NULL) {
3825  kmp_thread_data_t *old_data = *threads_data_p;
3826  kmp_thread_data_t *new_data = NULL;
3827 
3828  KE_TRACE(
3829  10,
3830  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3831  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3832  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3833  // Reallocate threads_data to have more elements than current array
3834  // Cannot use __kmp_thread_realloc() because threads not around for
3835  // kmp_reap_task_team( ). Note all new array entries are initialized
3836  // to zero by __kmp_allocate().
3837  new_data = (kmp_thread_data_t *)__kmp_allocate(
3838  nthreads * sizeof(kmp_thread_data_t));
3839  // copy old data to new data
3840  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3841  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3842 
3843 #ifdef BUILD_TIED_TASK_STACK
3844  // GEH: Figure out if this is the right thing to do
3845  for (i = maxthreads; i < nthreads; i++) {
3846  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3847  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3848  }
3849 #endif // BUILD_TIED_TASK_STACK
3850  // Install the new data and free the old data
3851  (*threads_data_p) = new_data;
3852  __kmp_free(old_data);
3853  } else {
3854  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3855  "threads data for task_team %p, size = %d\n",
3856  __kmp_gtid_from_thread(thread), task_team, nthreads));
3857  // Make the initial allocate for threads_data array, and zero entries
3858  // Cannot use __kmp_thread_calloc() because threads not around for
3859  // kmp_reap_task_team( ).
3860  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3861  nthreads * sizeof(kmp_thread_data_t));
3862 #ifdef BUILD_TIED_TASK_STACK
3863  // GEH: Figure out if this is the right thing to do
3864  for (i = 0; i < nthreads; i++) {
3865  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3866  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3867  }
3868 #endif // BUILD_TIED_TASK_STACK
3869  }
3870  task_team->tt.tt_max_threads = nthreads;
3871  } else {
3872  // If array has (more than) enough elements, go ahead and use it
3873  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3874  }
3875 
3876  // initialize threads_data pointers back to thread_info structures
3877  for (i = 0; i < nthreads; i++) {
3878  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3879  thread_data->td.td_thr = team->t.t_threads[i];
3880 
3881  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3882  // The last stolen field survives across teams / barrier, and the number
3883  // of threads may have changed. It's possible (likely?) that a new
3884  // parallel region will exhibit the same behavior as the previous region.
3885  thread_data->td.td_deque_last_stolen = -1;
3886  }
3887  }
3888 
3889  KMP_MB();
3890  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3891  }
3892 
3893  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3894  return is_init_thread;
3895 }
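
// Illustrative sketch (not part of the runtime): the function above follows
// the check / lock / re-check pattern so only the first thread pays for the
// (re)allocation. A simplified standalone version, with std::atomic/std::mutex
// standing in for tt_found_tasks and tt_threads_lock, could look like this:
#if 0
#include <atomic>
#include <mutex>

static std::atomic<bool> initialized{false};
static std::mutex init_lock;

// Returns true only for the single caller that performed the initialization.
static bool init_once(void (*do_init)()) {
  if (initialized.load(std::memory_order_acquire))
    return false; // cheap unlocked check, like TCR_4(tt_found_tasks)
  std::lock_guard<std::mutex> g(init_lock);
  if (initialized.load(std::memory_order_relaxed))
    return false; // someone else won the race while we waited for the lock
  do_init();
  initialized.store(true, std::memory_order_release); // cf. KMP_MB + TCW_SYNC_4
  return true;
}
#endif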
3896 
3897 // __kmp_free_task_threads_data:
3898 // Deallocates a threads_data array for a task team, including any attached
3899 // tasking deques. Only occurs at library shutdown.
3900 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3901  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3902  if (task_team->tt.tt_threads_data != NULL) {
3903  int i;
3904  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3905  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3906  }
3907  __kmp_free(task_team->tt.tt_threads_data);
3908  task_team->tt.tt_threads_data = NULL;
3909  }
3910  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3911 }
3912 
3913 // __kmp_free_task_pri_list:
3914 // Deallocates tasking deques used for priority tasks.
3915 // Only occurs at library shutdown.
3916 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3917  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3918  if (task_team->tt.tt_task_pri_list != NULL) {
3919  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3920  while (list != NULL) {
3921  kmp_task_pri_t *next = list->next;
3922  __kmp_free_task_deque(&list->td);
3923  __kmp_free(list);
3924  list = next;
3925  }
3926  task_team->tt.tt_task_pri_list = NULL;
3927  }
3928  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3929 }
3930 
3931 // __kmp_allocate_task_team:
3932 // Allocates a task team associated with a specific team, taking it from
3933 // the global task team free list if possible. Also initializes data
3934 // structures.
3935 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3936  kmp_team_t *team) {
3937  kmp_task_team_t *task_team = NULL;
3938  int nthreads;
3939 
3940  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3941  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3942 
3943  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3944  // Take a task team from the task team pool
3945  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3946  if (__kmp_free_task_teams != NULL) {
3947  task_team = __kmp_free_task_teams;
3948  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3949  task_team->tt.tt_next = NULL;
3950  }
3951  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3952  }
3953 
3954  if (task_team == NULL) {
3955  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3956  "task team for team %p\n",
3957  __kmp_gtid_from_thread(thread), team));
3958  // Allocate a new task team if one is not available. Cannot use
3959  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3960  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3961  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3962  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3963 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3964  // Suppress race condition detection on synchronization flags in debug
3965  // mode; this helps analyze library internals by eliminating false positives.
3966  __itt_suppress_mark_range(
3967  __itt_suppress_range, __itt_suppress_threading_errors,
3968  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3969  __itt_suppress_mark_range(__itt_suppress_range,
3970  __itt_suppress_threading_errors,
3971  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3972  sizeof(task_team->tt.tt_active));
3973 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3974  // Note: __kmp_allocate zeroes the returned memory, otherwise we would need:
3975  // task_team->tt.tt_threads_data = NULL;
3976  // task_team->tt.tt_max_threads = 0;
3977  // task_team->tt.tt_next = NULL;
3978  }
3979 
3980  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3981  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3982  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3983  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3984 
3985  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3986  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3987  TCW_4(task_team->tt.tt_active, TRUE);
3988 
3989  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3990  "unfinished_threads init'd to %d\n",
3991  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3992  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3993  return task_team;
3994 }
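
// Illustrative sketch (not part of the runtime): the allocation above is an
// intrusive free list guarded by a lock, so task teams are recycled instead of
// hitting the allocator at every barrier. A minimal version of the same idea:
#if 0
#include <mutex>

struct Node {
  Node *next = nullptr;
  // ... payload, analogous to the kmp_task_team_t fields ...
};

static Node *free_list = nullptr;
static std::mutex free_list_lock;

static Node *acquire_node() {
  {
    std::lock_guard<std::mutex> g(free_list_lock);
    if (free_list) {
      Node *n = free_list;
      free_list = n->next; // pop the head of the free list
      n->next = nullptr;
      return n;
    }
  }
  return new Node(); // nothing to recycle; allocate fresh
}

static void release_node(Node *n) {
  std::lock_guard<std::mutex> g(free_list_lock);
  n->next = free_list; // push back onto the free list for reuse
  free_list = n;
}
#endif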
3995 
3996 // __kmp_free_task_team:
3997 // Frees the task team associated with a specific thread, and adds it
3998 // to the global task team free list.
3999 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
4000  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
4001  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
4002 
4003  // Put task team back on free list
4004  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4005 
4006  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
4007  task_team->tt.tt_next = __kmp_free_task_teams;
4008  TCW_PTR(__kmp_free_task_teams, task_team);
4009 
4010  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4011 }
4012 
4013 // __kmp_reap_task_teams:
4014 // Free all the task teams on the task team free list.
4015 // Should only be done during library shutdown.
4016 // Cannot do anything that needs a thread structure or gtid since they are
4017 // already gone.
4018 void __kmp_reap_task_teams(void) {
4019  kmp_task_team_t *task_team;
4020 
4021  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
4022  // Free all task_teams on the free list
4023  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4024  while ((task_team = __kmp_free_task_teams) != NULL) {
4025  __kmp_free_task_teams = task_team->tt.tt_next;
4026  task_team->tt.tt_next = NULL;
4027 
4028  // Free threads_data if necessary
4029  if (task_team->tt.tt_threads_data != NULL) {
4030  __kmp_free_task_threads_data(task_team);
4031  }
4032  if (task_team->tt.tt_task_pri_list != NULL) {
4033  __kmp_free_task_pri_list(task_team);
4034  }
4035  __kmp_free(task_team);
4036  }
4037  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4038  }
4039 }
4040 
4041 // __kmp_wait_to_unref_task_teams:
4042 // Some threads could still be in the fork barrier release code, possibly
4043 // trying to steal tasks. Wait for each thread to unreference its task team.
4044 void __kmp_wait_to_unref_task_teams(void) {
4045  kmp_info_t *thread;
4046  kmp_uint32 spins;
4047  kmp_uint64 time;
4048  int done;
4049 
4050  KMP_INIT_YIELD(spins);
4051  KMP_INIT_BACKOFF(time);
4052 
4053  for (;;) {
4054  done = TRUE;
4055 
4056  // TODO: GEH - this may be wrong because some sync would be necessary
4057  // in case threads are added to the pool during the traversal. Need to
4058  // verify that the thread pool lock is held when calling this routine.
4059  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
4060  thread = thread->th.th_next_pool) {
4061 #if KMP_OS_WINDOWS
4062  DWORD exit_val;
4063 #endif
4064  if (TCR_PTR(thread->th.th_task_team) == NULL) {
4065  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
4066  __kmp_gtid_from_thread(thread)));
4067  continue;
4068  }
4069 #if KMP_OS_WINDOWS
4070  // TODO: GEH - add this check for Linux* OS / OS X* as well?
4071  if (!__kmp_is_thread_alive(thread, &exit_val)) {
4072  thread->th.th_task_team = NULL;
4073  continue;
4074  }
4075 #endif
4076 
4077  done = FALSE; // Because th_task_team pointer is not NULL for this thread
4078 
4079  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
4080  "unreference task_team\n",
4081  __kmp_gtid_from_thread(thread)));
4082 
4083  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
4084  void *sleep_loc;
4085  // If the thread is sleeping, awaken it.
4086  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
4087  NULL) {
4088  KA_TRACE(
4089  10,
4090  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
4091  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
4092  __kmp_null_resume_wrapper(thread);
4093  }
4094  }
4095  }
4096  if (done) {
4097  break;
4098  }
4099 
4100  // If oversubscribed or have waited a bit, yield.
4101  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
4102  }
4103 }
4104 
4105 void __kmp_shift_task_state_stack(kmp_info_t *this_thr, kmp_uint8 value) {
4106  // Shift values from th_task_state_top+1 to task_state_stack_sz
4107  if (this_thr->th.th_task_state_top + 1 >=
4108  this_thr->th.th_task_state_stack_sz) { // increase size
4109  kmp_uint32 new_size = 2 * this_thr->th.th_task_state_stack_sz;
4110  kmp_uint8 *old_stack, *new_stack;
4111  kmp_uint32 i;
4112  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
4113  for (i = 0; i <= this_thr->th.th_task_state_top; ++i) {
4114  new_stack[i] = this_thr->th.th_task_state_memo_stack[i];
4115  }
4116  // If we need to reallocate do the shift at the same time.
4117  for (; i < this_thr->th.th_task_state_stack_sz; ++i) {
4118  new_stack[i + 1] = this_thr->th.th_task_state_memo_stack[i];
4119  }
4120  for (i = this_thr->th.th_task_state_stack_sz; i < new_size;
4121  ++i) { // zero-init rest of stack
4122  new_stack[i] = 0;
4123  }
4124  old_stack = this_thr->th.th_task_state_memo_stack;
4125  this_thr->th.th_task_state_memo_stack = new_stack;
4126  this_thr->th.th_task_state_stack_sz = new_size;
4127  __kmp_free(old_stack);
4128  } else {
4129  kmp_uint8 *end;
4130  kmp_uint32 i;
4131 
4132  end = &this_thr->th
4133  .th_task_state_memo_stack[this_thr->th.th_task_state_stack_sz];
4134 
4135  for (i = this_thr->th.th_task_state_stack_sz - 1;
4136  i > this_thr->th.th_task_state_top; i--, end--)
4137  end[0] = end[-1];
4138  }
4139  this_thr->th.th_task_state_memo_stack[this_thr->th.th_task_state_top + 1] =
4140  value;
4141 }
4142 
4143 // __kmp_task_team_setup: Create a task_team for the current team, but use
4144 // an already created, unused one if it already exists.
4145 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
4146  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4147 
4148  // If this task_team hasn't been created yet, allocate it. It will be used in
4149  // the region after the next.
4150  // If it exists, it is the current task team and shouldn't be touched yet as
4151  // it may still be in use.
4152  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
4153  (always || team->t.t_nproc > 1)) {
4154  team->t.t_task_team[this_thr->th.th_task_state] =
4155  __kmp_allocate_task_team(this_thr, team);
4156  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4157  " for team %d at parity=%d\n",
4158  __kmp_gtid_from_thread(this_thr),
4159  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
4160  this_thr->th.th_task_state));
4161  }
4162  if (this_thr->th.th_task_state == 1 && always && team->t.t_nproc == 1) {
4163  // fix task state stack to adjust for proxy and helper tasks
4164  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d needs to shift stack"
4165  " for team %d at parity=%d\n",
4166  __kmp_gtid_from_thread(this_thr), team->t.t_id,
4167  this_thr->th.th_task_state));
4168  __kmp_shift_task_state_stack(this_thr, this_thr->th.th_task_state);
4169  }
4170 
4171  // After threads exit the release, they will call sync, and then point to this
4172  // other task_team; make sure it is allocated and properly initialized. As
4173  // threads spin in the barrier release phase, they will continue to use the
4174  // previous task_team struct(above), until they receive the signal to stop
4175  // checking for tasks (they can't safely reference the kmp_team_t struct,
4176  // which could be reallocated by the primary thread). No task teams are formed
4177  // for serialized teams.
4178  if (team->t.t_nproc > 1) {
4179  int other_team = 1 - this_thr->th.th_task_state;
4180  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
4181  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
4182  team->t.t_task_team[other_team] =
4183  __kmp_allocate_task_team(this_thr, team);
4184  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
4185  "task_team %p for team %d at parity=%d\n",
4186  __kmp_gtid_from_thread(this_thr),
4187  team->t.t_task_team[other_team], team->t.t_id, other_team));
4188  } else { // Leave the old task team struct in place for the upcoming region;
4189  // adjust as needed
4190  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
4191  if (!task_team->tt.tt_active ||
4192  team->t.t_nproc != task_team->tt.tt_nproc) {
4193  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
4194  TCW_4(task_team->tt.tt_found_tasks, FALSE);
4195  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4196  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4197  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
4198  team->t.t_nproc);
4199  TCW_4(task_team->tt.tt_active, TRUE);
4200  }
4201  // if team size has changed, the first thread to enable tasking will
4202  // realloc threads_data if necessary
4203  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4204  "%p for team %d at parity=%d\n",
4205  __kmp_gtid_from_thread(this_thr),
4206  team->t.t_task_team[other_team], team->t.t_id, other_team));
4207  }
4208  }
4209 
4210  // For a regular thread, task enabling should be called when the task is
4211  // going to be pushed to a deque. However, for the hidden helper thread, we
4212  // need it ahead of time so that some operations can be performed without
4213  // race conditions.
4214  if (this_thr == __kmp_hidden_helper_main_thread) {
4215  for (int i = 0; i < 2; ++i) {
4216  kmp_task_team_t *task_team = team->t.t_task_team[i];
4217  if (KMP_TASKING_ENABLED(task_team)) {
4218  continue;
4219  }
4220  __kmp_enable_tasking(task_team, this_thr);
4221  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4222  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4223  if (thread_data->td.td_deque == NULL) {
4224  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4225  }
4226  }
4227  }
4228  }
4229 }
4230 
4231 // __kmp_task_team_sync: Propagation of task team data from team to threads
4232 // which happens just after the release phase of a team barrier. This may be
4233 // called by any thread, but only for teams with # threads > 1.
4234 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4235  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4236 
4237  // Toggle the th_task_state field, to switch which task_team this thread
4238  // refers to
4239  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4240 
4241  // It is now safe to propagate the task team pointer from the team struct to
4242  // the current thread.
4243  TCW_PTR(this_thr->th.th_task_team,
4244  team->t.t_task_team[this_thr->th.th_task_state]);
4245  KA_TRACE(20,
4246  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4247  "%p from Team #%d (parity=%d)\n",
4248  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4249  team->t.t_id, this_thr->th.th_task_state));
4250 }
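
// Illustrative sketch (not part of the runtime): th_task_state is a 0/1 parity
// bit, so consecutive barriers ping-pong between t_task_team[0] and
// t_task_team[1]; the previous team can still be drained while the next one is
// being set up. A standalone toy showing just the toggle:
#if 0
#include <cstdio>

int main() {
  int team_a = 0, team_b = 0;
  void *task_team[2] = {&team_a, &team_b}; // stand-ins for the two task teams
  unsigned state = 0;
  for (int barrier = 0; barrier < 4; ++barrier) {
    state = 1 - state; // same toggle as __kmp_task_team_sync
    std::printf("after barrier %d use task_team[%u] = %p\n", barrier, state,
                task_team[state]);
  }
  return 0;
}
#endif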
4251 
4252 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4253 // barrier gather phase. Only called by primary thread if #threads in team > 1
4254 // or if proxy tasks were created.
4255 //
4256 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4257 // by passing in 0 optionally as the last argument. When wait is zero, primary
4258 // thread does not wait for unfinished_threads to reach 0.
4259 void __kmp_task_team_wait(
4260  kmp_info_t *this_thr,
4261  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4262  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4263 
4264  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4265  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4266 
4267  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4268  if (wait) {
4269  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4270  "(for unfinished_threads to reach 0) on task_team = %p\n",
4271  __kmp_gtid_from_thread(this_thr), task_team));
4272  // Worker threads may have dropped through to release phase, but could
4273  // still be executing tasks. Wait here for tasks to complete. To avoid
4274  // memory contention, only primary thread checks termination condition.
4275  kmp_flag_32<false, false> flag(
4276  RCAST(std::atomic<kmp_uint32> *,
4277  &task_team->tt.tt_unfinished_threads),
4278  0U);
4279  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4280  }
4281  // Deactivate the old task team, so that the worker threads will stop
4282  // referencing it while spinning.
4283  KA_TRACE(
4284  20,
4285  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4286  "setting active to false, setting local and team's pointer to NULL\n",
4287  __kmp_gtid_from_thread(this_thr), task_team));
4288  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
4289  task_team->tt.tt_found_proxy_tasks == TRUE ||
4290  task_team->tt.tt_hidden_helper_task_encountered == TRUE);
4291  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4292  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4293  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4294  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4295  KMP_MB();
4296 
4297  TCW_PTR(this_thr->th.th_task_team, NULL);
4298  }
4299 }
4300 
4301 // __kmp_tasking_barrier:
4302 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4303 // Internal function to execute all tasks prior to a regular barrier or a join
4304 // barrier. It is a full barrier itself, which unfortunately turns regular
4305 // barriers into double barriers and join barriers into 1 1/2 barriers.
4306 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4307  std::atomic<kmp_uint32> *spin = RCAST(
4308  std::atomic<kmp_uint32> *,
4309  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4310  int flag = FALSE;
4311  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4312 
4313 #if USE_ITT_BUILD
4314  KMP_FSYNC_SPIN_INIT(spin, NULL);
4315 #endif /* USE_ITT_BUILD */
4316  kmp_flag_32<false, false> spin_flag(spin, 0U);
4317  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4318  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4319 #if USE_ITT_BUILD
4320  // TODO: What about itt_sync_obj??
4321  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4322 #endif /* USE_ITT_BUILD */
4323 
4324  if (TCR_4(__kmp_global.g.g_done)) {
4325  if (__kmp_global.g.g_abort)
4326  __kmp_abort_thread();
4327  break;
4328  }
4329  KMP_YIELD(TRUE);
4330  }
4331 #if USE_ITT_BUILD
4332  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4333 #endif /* USE_ITT_BUILD */
4334 }
4335 
4336 // __kmp_give_task puts a task into a given thread queue if:
4337 // - the queue for that thread was created
4338 // - there's space in that queue
4339 // Because of this, __kmp_push_task needs to check if there's space after
4340 // getting the lock
4341 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4342  kmp_int32 pass) {
4343  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4344  kmp_task_team_t *task_team = taskdata->td_task_team;
4345 
4346  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4347  taskdata, tid));
4348 
4349  // If task_team is NULL, something has gone badly wrong...
4350  KMP_DEBUG_ASSERT(task_team != NULL);
4351 
4352  bool result = false;
4353  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4354 
4355  if (thread_data->td.td_deque == NULL) {
4356  // There's no queue in this thread, go find another one
4357  // We're guaranteed that at least one thread has a queue
4358  KA_TRACE(30,
4359  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4360  tid, taskdata));
4361  return result;
4362  }
4363 
4364  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4365  TASK_DEQUE_SIZE(thread_data->td)) {
4366  KA_TRACE(
4367  30,
4368  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4369  taskdata, tid));
4370 
4371  // if this deque has already grown beyond the pass ratio, give another
4372  // thread a chance
4373  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4374  return result;
4375 
4376  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4377  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4378  TASK_DEQUE_SIZE(thread_data->td)) {
4379  // expand deque to push the task which is not allowed to execute
4380  __kmp_realloc_task_deque(thread, thread_data);
4381  }
4382 
4383  } else {
4384 
4385  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4386 
4387  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4388  TASK_DEQUE_SIZE(thread_data->td)) {
4389  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4390  "thread %d.\n",
4391  taskdata, tid));
4392 
4393  // if this deque has already grown beyond the pass ratio, give another
4394  // thread a chance
4395  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4396  goto release_and_exit;
4397 
4398  __kmp_realloc_task_deque(thread, thread_data);
4399  }
4400  }
4401 
4402  // lock is held here, and there is space in the deque
4403 
4404  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4405  // Wrap index.
4406  thread_data->td.td_deque_tail =
4407  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4408  TCW_4(thread_data->td.td_deque_ntasks,
4409  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4410 
4411  result = true;
4412  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4413  taskdata, tid));
4414 
4415 release_and_exit:
4416  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4417 
4418  return result;
4419 }
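// Worked example of the pass ratio above: a full deque still at
// INITIAL_TASK_DEQUE_SIZE has TASK_DEQUE_SIZE/INITIAL_TASK_DEQUE_SIZE == 1, so
// at pass == 1 the task is refused and the caller moves on to the next thread.
// Only once the caller has cycled through every thread and doubled pass (see
// __kmpc_give_task below) is a still-full initial-size deque reallocated; a
// deque that has already doubled is again skipped until pass reaches 4, and so
// on. This throttles deque growth while the task is offered around the team.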
4420 
4421 #define PROXY_TASK_FLAG 0x40000000
4422 /* The finish of the proxy tasks is divided in two pieces:
4423  - the top half is the one that can be done from a thread outside the team
4424  - the bottom half must be run from a thread within the team
4425 
4426  In order to run the bottom half the task gets queued back into one of the
4427  threads of the team. Once the td_incomplete_child_tasks counter of the parent
4428  is decremented, the threads can leave the barriers. So, the bottom half needs
4429  to be queued before the counter is decremented. The top half is therefore
4430  divided in two parts:
4431  - things that can be run before queuing the bottom half
4432  - things that must be run after queuing the bottom half
4433 
4434  This creates a second race as the bottom half can free the task before the
4435  second top half is executed. To avoid this we use the
4436  td_incomplete_child_tasks counter of the proxy task to synchronize the top
4437  and bottom halves. */
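/* Illustrative ordering sketch (it mirrors __kmpc_proxy_task_completed_ooo
   further below; it is not an additional code path):

     __kmp_first_top_half_finish_proxy(taskdata);  // mark complete, set PROXY_TASK_FLAG
     __kmpc_give_task(ptask);                      // queue the bottom half into the team
     __kmp_second_top_half_finish_proxy(taskdata); // decrement parent, clear the flag

   The bottom half spins until PROXY_TASK_FLAG is cleared, so even if it is
   picked up immediately it cannot free the task before the second top half
   has run. */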
4438 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4439  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4440  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4441  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4442  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4443 
4444  taskdata->td_flags.complete = 1; // mark the task as completed
4445 #if OMPX_TASKGRAPH
4446  taskdata->td_flags.onced = 1;
4447 #endif
4448 
4449  if (taskdata->td_taskgroup)
4450  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4451 
4452  // Create an imaginary child for this task so the bottom half cannot
4453  // release the task before we have completed the second top half
4454  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4455 }
4456 
4457 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4458 #if KMP_DEBUG
4459  kmp_int32 children = 0;
4460  // Predecrement simulated by "- 1" calculation
4461  children = -1 +
4462 #endif
4463  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4464  KMP_DEBUG_ASSERT(children >= 0);
4465 
4466  // Remove the imaginary child
4467  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4468 }
4469 
4470 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4471  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4472  kmp_info_t *thread = __kmp_threads[gtid];
4473 
4474  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4475  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4476  1); // top half must run before bottom half
4477 
4478  // We need to wait to make sure the top half is finished
4479  // Spinning here should be ok as this should happen quickly
4480  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4481  PROXY_TASK_FLAG) > 0)
4482  ;
4483 
4484  __kmp_release_deps(gtid, taskdata);
4485  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4486 }
4487 
4496 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4497  KMP_DEBUG_ASSERT(ptask != NULL);
4498  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4499  KA_TRACE(
4500  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4501  gtid, taskdata));
4502  __kmp_assert_valid_gtid(gtid);
4503  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4504 
4505  __kmp_first_top_half_finish_proxy(taskdata);
4506  __kmp_second_top_half_finish_proxy(taskdata);
4507  __kmp_bottom_half_finish_proxy(gtid, ptask);
4508 
4509  KA_TRACE(10,
4510  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4511  gtid, taskdata));
4512 }
4513 
4514 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4515  KMP_DEBUG_ASSERT(ptask != NULL);
4516  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4517 
4518  // Enqueue task to complete bottom half completion from a thread within the
4519  // corresponding team
4520  kmp_team_t *team = taskdata->td_team;
4521  kmp_int32 nthreads = team->t.t_nproc;
4522  kmp_info_t *thread;
4523 
4524  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4525  // but we cannot use __kmp_get_random here
4526  kmp_int32 start_k = start % nthreads;
4527  kmp_int32 pass = 1;
4528  kmp_int32 k = start_k;
4529 
4530  do {
4531  // For now we're just linearly trying to find a thread
4532  thread = team->t.t_threads[k];
4533  k = (k + 1) % nthreads;
4534 
4535  // we did a full pass through all the threads
4536  if (k == start_k)
4537  pass = pass << 1;
4538 
4539  } while (!__kmp_give_task(thread, k, ptask, pass));
4540 
4541  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4542  // wake at least one thread to execute the given task
4543  for (int i = 0; i < nthreads; ++i) {
4544  thread = team->t.t_threads[i];
4545  if (thread->th.th_sleep_loc != NULL) {
4546  __kmp_null_resume_wrapper(thread);
4547  break;
4548  }
4549  }
4550  }
4551 }
4552 
4560 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4561  KMP_DEBUG_ASSERT(ptask != NULL);
4562  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4563 
4564  KA_TRACE(
4565  10,
4566  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4567  taskdata));
4568 
4569  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4570 
4571  __kmp_first_top_half_finish_proxy(taskdata);
4572 
4573  __kmpc_give_task(ptask);
4574 
4575  __kmp_second_top_half_finish_proxy(taskdata);
4576 
4577  KA_TRACE(
4578  10,
4579  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4580  taskdata));
4581 }
4582 
4583 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4584  kmp_task_t *task) {
4585  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4586  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4587  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4588  td->td_allow_completion_event.ed.task = task;
4589  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4590  }
4591  return &td->td_allow_completion_event;
4592 }
4593 
4594 void __kmp_fulfill_event(kmp_event_t *event) {
4595  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4596  kmp_task_t *ptask = event->ed.task;
4597  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4598  bool detached = false;
4599  int gtid = __kmp_get_gtid();
4600 
4601  // The associated task might have completed or could be completing at this
4602  // point.
4603  // We need to take the lock to avoid races
4604  __kmp_acquire_tas_lock(&event->lock, gtid);
4605  if (taskdata->td_flags.proxy == TASK_PROXY) {
4606  detached = true;
4607  } else {
4608 #if OMPT_SUPPORT
4609  // The OMPT event must occur under mutual exclusion,
4610  // otherwise the tool might access ptask after free
4611  if (UNLIKELY(ompt_enabled.enabled))
4612  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4613 #endif
4614  }
4615  event->type = KMP_EVENT_UNINITIALIZED;
4616  __kmp_release_tas_lock(&event->lock, gtid);
4617 
4618  if (detached) {
4619 #if OMPT_SUPPORT
4620  // We free ptask afterwards and know the task is finished,
4621  // so locking is not necessary
4622  if (UNLIKELY(ompt_enabled.enabled))
4623  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4624 #endif
4625  // If the task was detached, complete the proxy task
4626  if (gtid >= 0) {
4627  kmp_team_t *team = taskdata->td_team;
4628  kmp_info_t *thread = __kmp_get_thread();
4629  if (thread->th.th_team == team) {
4630  __kmpc_proxy_task_completed(gtid, ptask);
4631  return;
4632  }
4633  }
4634 
4635  // fallback
4636  __kmpc_proxy_task_completed_ooo(ptask);
4637  }
4638  }
4639 }
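// For illustration, a detachable task at the user level, assuming the usual
// OpenMP 5.x lowering (start_async_work() below is hypothetical):
//
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   {
//     start_async_work(ev);   // the async library eventually calls
//   }                         // omp_fulfill_event(ev) when the work is done
//
// The detach(ev) clause is expected to be lowered to a call to
// __kmpc_task_allow_completion_event above, and omp_fulfill_event is expected
// to reach __kmp_fulfill_event; the exact entry points are compiler/ABI
// details.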
4640 
4641 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4642 // for taskloop
4643 //
4644 // thread: allocating thread
4645 // task_src: pointer to source task to be duplicated
4646 // taskloop_recur: used only when dealing with taskgraph,
4647 // indicating whether we need to update task->td_task_id
4648 // returns: a pointer to the allocated kmp_task_t structure (task).
4649 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4650 #if OMPX_TASKGRAPH
4651  , int taskloop_recur
4652 #endif
4653 ) {
4654  kmp_task_t *task;
4655  kmp_taskdata_t *taskdata;
4656  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4657  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4658  size_t shareds_offset;
4659  size_t task_size;
4660 
4661  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4662  task_src));
4663  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4664  TASK_FULL); // it should not be proxy task
4665  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4666  task_size = taskdata_src->td_size_alloc;
4667 
4668  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4669  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4670  task_size));
4671 #if USE_FAST_MEMORY
4672  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4673 #else
4674  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4675 #endif /* USE_FAST_MEMORY */
4676  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4677 
4678  task = KMP_TASKDATA_TO_TASK(taskdata);
4679 
4680  // Initialize new task (only specific fields not affected by memcpy)
4681 #if OMPX_TASKGRAPH
4682  if (!taskdata->is_taskgraph || taskloop_recur)
4683  taskdata->td_task_id = KMP_GEN_TASK_ID();
4684  else if (taskdata->is_taskgraph &&
4685  __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4686  taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4687 #else
4688  taskdata->td_task_id = KMP_GEN_TASK_ID();
4689 #endif
4690  if (task->shareds != NULL) { // need to set up the shareds pointer
4691  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4692  task->shareds = &((char *)taskdata)[shareds_offset];
4693  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4694  0);
4695  }
4696  taskdata->td_alloc_thread = thread;
4697  taskdata->td_parent = parent_task;
4698  // task inherits the taskgroup from the parent task
4699  taskdata->td_taskgroup = parent_task->td_taskgroup;
4700  // tied task needs to initialize the td_last_tied at creation,
4701  // untied one does this when it is scheduled for execution
4702  if (taskdata->td_flags.tiedness == TASK_TIED)
4703  taskdata->td_last_tied = taskdata;
4704 
4705  // Only need to keep track of child task counts if team parallel and tasking
4706  // not serialized
4707  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4708  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4709  if (parent_task->td_taskgroup)
4710  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4711  // Only need to keep track of allocated child tasks for explicit tasks since
4712  // implicit tasks are not deallocated
4713  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4714  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4715  }
4716 
4717  KA_TRACE(20,
4718  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4719  thread, taskdata, taskdata->td_parent));
4720 #if OMPT_SUPPORT
4721  if (UNLIKELY(ompt_enabled.enabled))
4722  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4723 #endif
4724  return task;
4725 }
4726 
4727 // Routine optionally generated by the compiler for setting the lastprivate flag
4728 // and calling needed constructors for private/firstprivate objects
4729 // (used to form taskloop tasks from pattern task)
4730 // Parameters: dest task, src task, lastprivate flag.
4731 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4732 
4733 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4734 
4735 // Class to encapsulate manipulation of the loop bounds in a taskloop task.
4736 // This abstracts away the Intel vs GOMP taskloop interface for setting/getting
4737 // the loop bound variables.
4738 class kmp_taskloop_bounds_t {
4739  kmp_task_t *task;
4740  const kmp_taskdata_t *taskdata;
4741  size_t lower_offset;
4742  size_t upper_offset;
4743 
4744 public:
4745  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4746  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4747  lower_offset((char *)lb - (char *)task),
4748  upper_offset((char *)ub - (char *)task) {
4749  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4750  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4751  }
4752  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4753  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4754  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4755  size_t get_lower_offset() const { return lower_offset; }
4756  size_t get_upper_offset() const { return upper_offset; }
4757  kmp_uint64 get_lb() const {
4758  kmp_int64 retval;
4759 #if defined(KMP_GOMP_COMPAT)
4760  // Intel task just returns the lower bound normally
4761  if (!taskdata->td_flags.native) {
4762  retval = *(kmp_int64 *)((char *)task + lower_offset);
4763  } else {
4764  // GOMP task has to take into account the sizeof(long)
4765  if (taskdata->td_size_loop_bounds == 4) {
4766  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4767  retval = (kmp_int64)*lb;
4768  } else {
4769  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4770  retval = (kmp_int64)*lb;
4771  }
4772  }
4773 #else
4774  (void)taskdata;
4775  retval = *(kmp_int64 *)((char *)task + lower_offset);
4776 #endif // defined(KMP_GOMP_COMPAT)
4777  return retval;
4778  }
4779  kmp_uint64 get_ub() const {
4780  kmp_int64 retval;
4781 #if defined(KMP_GOMP_COMPAT)
4782  // Intel task just returns the upper bound normally
4783  if (!taskdata->td_flags.native) {
4784  retval = *(kmp_int64 *)((char *)task + upper_offset);
4785  } else {
4786  // GOMP task has to take into account the sizeof(long)
4787  if (taskdata->td_size_loop_bounds == 4) {
4788  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4789  retval = (kmp_int64)*ub;
4790  } else {
4791  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4792  retval = (kmp_int64)*ub;
4793  }
4794  }
4795 #else
4796  retval = *(kmp_int64 *)((char *)task + upper_offset);
4797 #endif // defined(KMP_GOMP_COMPAT)
4798  return retval;
4799  }
4800  void set_lb(kmp_uint64 lb) {
4801 #if defined(KMP_GOMP_COMPAT)
4802  // Intel task just sets the lower bound normally
4803  if (!taskdata->td_flags.native) {
4804  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4805  } else {
4806  // GOMP task has to take into account the sizeof(long)
4807  if (taskdata->td_size_loop_bounds == 4) {
4808  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4809  *lower = (kmp_uint32)lb;
4810  } else {
4811  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4812  *lower = (kmp_uint64)lb;
4813  }
4814  }
4815 #else
4816  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4817 #endif // defined(KMP_GOMP_COMPAT)
4818  }
4819  void set_ub(kmp_uint64 ub) {
4820 #if defined(KMP_GOMP_COMPAT)
4821  // Intel task just sets the upper bound normally
4822  if (!taskdata->td_flags.native) {
4823  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4824  } else {
4825  // GOMP task has to take into account the sizeof(long)
4826  if (taskdata->td_size_loop_bounds == 4) {
4827  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4828  *upper = (kmp_uint32)ub;
4829  } else {
4830  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4831  *upper = (kmp_uint64)ub;
4832  }
4833  }
4834 #else
4835  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4836 #endif // defined(KMP_GOMP_COMPAT)
4837  }
4838 };
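// Layout sketch for the two interfaces handled above: an Intel-style task
// keeps the bounds inside the kmp_task_t itself,
//   lb: *(kmp_uint64 *)((char *)task + lower_offset)
//   ub: *(kmp_uint64 *)((char *)task + upper_offset)
// while a GOMP (native) task keeps them as the first two elements of
// task->shareds, as 32-bit or 64-bit values depending on
// td_size_loop_bounds, i.e. the target's sizeof(long).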
4839 
4840 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4841 //
4842 // loc Source location information
4843 // gtid Global thread ID
4844 // task Pattern task, exposes the loop iteration range
4845 // lb Pointer to loop lower bound in task structure
4846 // ub Pointer to loop upper bound in task structure
4847 // st Loop stride
4848 // ub_glob Global upper bound (used for lastprivate check)
4849 // num_tasks Number of tasks to execute
4850 // grainsize Number of loop iterations per task
4851 // extras Number of chunks with grainsize+1 iterations
4852 // last_chunk Reduction of grainsize for last task
4853 // tc Iterations count
4854 // task_dup Tasks duplication routine
4855 // codeptr_ra Return address for OMPT events
4856 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4857  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4858  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4859  kmp_uint64 grainsize, kmp_uint64 extras,
4860  kmp_int64 last_chunk, kmp_uint64 tc,
4861 #if OMPT_SUPPORT
4862  void *codeptr_ra,
4863 #endif
4864  void *task_dup) {
4865  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4866  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4867  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4868  // compiler provides global bounds here
4869  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4870  kmp_uint64 lower = task_bounds.get_lb();
4871  kmp_uint64 upper = task_bounds.get_ub();
4872  kmp_uint64 i;
4873  kmp_info_t *thread = __kmp_threads[gtid];
4874  kmp_taskdata_t *current_task = thread->th.th_current_task;
4875  kmp_task_t *next_task;
4876  kmp_int32 lastpriv = 0;
4877 
4878  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4879  (last_chunk < 0 ? last_chunk : extras));
4880  KMP_DEBUG_ASSERT(num_tasks > extras);
4881  KMP_DEBUG_ASSERT(num_tasks > 0);
4882  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4883  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4884  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4885  ub_glob, st, task_dup));
4886 
4887  // Launch num_tasks tasks, assigning grainsize iterations to each task
4888  for (i = 0; i < num_tasks; ++i) {
4889  kmp_uint64 chunk_minus_1;
4890  if (extras == 0) {
4891  chunk_minus_1 = grainsize - 1;
4892  } else {
4893  chunk_minus_1 = grainsize;
4894  --extras; // the first 'extras' tasks get a bigger chunk (grainsize+1)
4895  }
4896  upper = lower + st * chunk_minus_1;
4897  if (upper > *ub) {
4898  upper = *ub;
4899  }
4900  if (i == num_tasks - 1) {
4901  // schedule the last task, set lastprivate flag if needed
4902  if (st == 1) { // most common case
4903  KMP_DEBUG_ASSERT(upper == *ub);
4904  if (upper == ub_glob)
4905  lastpriv = 1;
4906  } else if (st > 0) { // positive loop stride
4907  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4908  if ((kmp_uint64)st > ub_glob - upper)
4909  lastpriv = 1;
4910  } else { // negative loop stride
4911  KMP_DEBUG_ASSERT(upper + st < *ub);
4912  if (upper - ub_glob < (kmp_uint64)(-st))
4913  lastpriv = 1;
4914  }
4915  }
4916 
4917 #if OMPX_TASKGRAPH
4918  next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4919 #else
4920  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4921 #endif
4922 
4923  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4924  kmp_taskloop_bounds_t next_task_bounds =
4925  kmp_taskloop_bounds_t(next_task, task_bounds);
4926 
4927  // adjust task-specific bounds
4928  next_task_bounds.set_lb(lower);
4929  if (next_taskdata->td_flags.native) {
4930  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4931  } else {
4932  next_task_bounds.set_ub(upper);
4933  }
4934  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4935  // etc.
4936  ptask_dup(next_task, task, lastpriv);
4937  KA_TRACE(40,
4938  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4939  "upper %lld stride %lld, (offsets %p %p)\n",
4940  gtid, i, next_task, lower, upper, st,
4941  next_task_bounds.get_lower_offset(),
4942  next_task_bounds.get_upper_offset()));
4943 #if OMPT_SUPPORT
4944  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4945  codeptr_ra); // schedule new task
4946 #if OMPT_OPTIONAL
4947  if (ompt_enabled.ompt_callback_dispatch) {
4948  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4949  lower, upper, st);
4950  }
4951 #endif // OMPT_OPTIONAL
4952 #else
4953  __kmp_omp_task(gtid, next_task, true); // schedule new task
4954 #endif
4955  lower = upper + st; // adjust lower bound for the next iteration
4956  }
4957  // free the pattern task and exit
4958  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4959  // do not execute the pattern task, just do internal bookkeeping
4960  __kmp_task_finish<false>(gtid, task, current_task);
4961 }
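// Worked example of the distribution above: for tc = 10 iterations split into
// num_tasks = 3 with grainsize = 3 and extras = 1 (10 == 3*3 + 1), the chunks
// are 4, 3 and 3 iterations: the first 'extras' tasks use
// chunk_minus_1 == grainsize (grainsize+1 iterations each), the remaining
// tasks use grainsize-1 (grainsize iterations each). With a strict grainsize
// modifier, last_chunk < 0 and only the final task is shortened, via the
// clipping of upper to *ub.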
4962 
4963 // Structure to keep taskloop parameters for auxiliary task
4964 // kept in the shareds of the task structure.
4965 typedef struct __taskloop_params {
4966  kmp_task_t *task;
4967  kmp_uint64 *lb;
4968  kmp_uint64 *ub;
4969  void *task_dup;
4970  kmp_int64 st;
4971  kmp_uint64 ub_glob;
4972  kmp_uint64 num_tasks;
4973  kmp_uint64 grainsize;
4974  kmp_uint64 extras;
4975  kmp_int64 last_chunk;
4976  kmp_uint64 tc;
4977  kmp_uint64 num_t_min;
4978 #if OMPT_SUPPORT
4979  void *codeptr_ra;
4980 #endif
4981 } __taskloop_params_t;
4982 
4983 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4984  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4985  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4986  kmp_uint64,
4987 #if OMPT_SUPPORT
4988  void *,
4989 #endif
4990  void *);
4991 
4992 // Execute part of the taskloop submitted as a task.
4993 int __kmp_taskloop_task(int gtid, void *ptask) {
4994  __taskloop_params_t *p =
4995  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4996  kmp_task_t *task = p->task;
4997  kmp_uint64 *lb = p->lb;
4998  kmp_uint64 *ub = p->ub;
4999  void *task_dup = p->task_dup;
5000  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5001  kmp_int64 st = p->st;
5002  kmp_uint64 ub_glob = p->ub_glob;
5003  kmp_uint64 num_tasks = p->num_tasks;
5004  kmp_uint64 grainsize = p->grainsize;
5005  kmp_uint64 extras = p->extras;
5006  kmp_int64 last_chunk = p->last_chunk;
5007  kmp_uint64 tc = p->tc;
5008  kmp_uint64 num_t_min = p->num_t_min;
5009 #if OMPT_SUPPORT
5010  void *codeptr_ra = p->codeptr_ra;
5011 #endif
5012 #if KMP_DEBUG
5013  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5014  KMP_DEBUG_ASSERT(task != NULL);
5015  KA_TRACE(20,
5016  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
5017  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5018  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5019  st, task_dup));
5020 #endif
5021  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
5022  if (num_tasks > num_t_min)
5023  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5024  grainsize, extras, last_chunk, tc, num_t_min,
5025 #if OMPT_SUPPORT
5026  codeptr_ra,
5027 #endif
5028  task_dup);
5029  else
5030  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5031  grainsize, extras, last_chunk, tc,
5032 #if OMPT_SUPPORT
5033  codeptr_ra,
5034 #endif
5035  task_dup);
5036 
5037  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
5038  return 0;
5039 }
5040 
5041 // Schedule part of the taskloop as a task,
5042 // execute the rest of the taskloop.
5043 //
5044 // loc Source location information
5045 // gtid Global thread ID
5046 // task Pattern task, exposes the loop iteration range
5047 // lb Pointer to loop lower bound in task structure
5048 // ub Pointer to loop upper bound in task structure
5049 // st Loop stride
5050 // ub_glob Global upper bound (used for lastprivate check)
5051 // num_tasks Number of tasks to execute
5052 // grainsize Number of loop iterations per task
5053 // extras Number of chunks with grainsize+1 iterations
5054 // last_chunk Reduction of grainsize for last task
5055 // tc Iterations count
5056 // num_t_min Threshold to launch tasks recursively
5057 // task_dup Tasks duplication routine
5058 // codeptr_ra Return address for OMPT events
5059 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
5060  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5061  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
5062  kmp_uint64 grainsize, kmp_uint64 extras,
5063  kmp_int64 last_chunk, kmp_uint64 tc,
5064  kmp_uint64 num_t_min,
5065 #if OMPT_SUPPORT
5066  void *codeptr_ra,
5067 #endif
5068  void *task_dup) {
5069  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5070  KMP_DEBUG_ASSERT(task != NULL);
5071  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
5072  KA_TRACE(20,
5073  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
5074  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5075  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5076  st, task_dup));
5077  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5078  kmp_uint64 lower = *lb;
5079  kmp_info_t *thread = __kmp_threads[gtid];
5080  // kmp_taskdata_t *current_task = thread->th.th_current_task;
5081  kmp_task_t *next_task;
5082  size_t lower_offset =
5083  (char *)lb - (char *)task; // remember offset of lb in the task structure
5084  size_t upper_offset =
5085  (char *)ub - (char *)task; // remember offset of ub in the task structure
5086 
5087  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5088  (last_chunk < 0 ? last_chunk : extras));
5089  KMP_DEBUG_ASSERT(num_tasks > extras);
5090  KMP_DEBUG_ASSERT(num_tasks > 0);
5091 
5092  // split the loop into two halves
5093  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
5094  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
5095  kmp_uint64 gr_size0 = grainsize;
5096  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
5097  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
5098  if (last_chunk < 0) {
5099  ext0 = ext1 = 0;
5100  last_chunk1 = last_chunk;
5101  tc0 = grainsize * n_tsk0;
5102  tc1 = tc - tc0;
5103  } else if (n_tsk0 <= extras) {
5104  gr_size0++; // integrate extras into grainsize
5105  ext0 = 0; // no extra iters in 1st half
5106  ext1 = extras - n_tsk0; // remaining extras
5107  tc0 = gr_size0 * n_tsk0;
5108  tc1 = tc - tc0;
5109  } else { // n_tsk0 > extras
5110  ext1 = 0; // no extra iters in 2nd half
5111  ext0 = extras;
5112  tc1 = grainsize * n_tsk1;
5113  tc0 = tc - tc1;
5114  }
5115  ub0 = lower + st * (tc0 - 1);
5116  lb1 = ub0 + st;
5117 
5118  // create pattern task for 2nd half of the loop
5119 #if OMPX_TASKGRAPH
5120  next_task = __kmp_task_dup_alloc(thread, task,
5121  /* taskloop_recur */ 1);
5122 #else
5123  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
5124 #endif
5125  // adjust lower bound (upper bound is not changed) for the 2nd half
5126  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
5127  if (ptask_dup != NULL) // construct firstprivates, etc.
5128  ptask_dup(next_task, task, 0);
5129  *ub = ub0; // adjust upper bound for the 1st half
5130 
5131  // create auxiliary task for 2nd half of the loop
5132  // make sure new task has same parent task as the pattern task
5133  kmp_taskdata_t *current_task = thread->th.th_current_task;
5134  thread->th.th_current_task = taskdata->td_parent;
5135  kmp_task_t *new_task =
5136  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
5137  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
5138  // restore current task
5139  thread->th.th_current_task = current_task;
5140  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
5141  p->task = next_task;
5142  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
5143  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
5144  p->task_dup = task_dup;
5145  p->st = st;
5146  p->ub_glob = ub_glob;
5147  p->num_tasks = n_tsk1;
5148  p->grainsize = grainsize;
5149  p->extras = ext1;
5150  p->last_chunk = last_chunk1;
5151  p->tc = tc1;
5152  p->num_t_min = num_t_min;
5153 #if OMPT_SUPPORT
5154  p->codeptr_ra = codeptr_ra;
5155 #endif
5156 
5157 #if OMPX_TASKGRAPH
5158  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
5159  new_task_data->tdg = taskdata->tdg;
5160  new_task_data->is_taskgraph = 0;
5161 #endif
5162 
5163 #if OMPT_SUPPORT
5164  // schedule new task with correct return address for OMPT events
5165  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
5166 #else
5167  __kmp_omp_task(gtid, new_task, true); // schedule new task
5168 #endif
5169 
5170  // execute the 1st half of current subrange
5171  if (n_tsk0 > num_t_min)
5172  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
5173  ext0, last_chunk0, tc0, num_t_min,
5174 #if OMPT_SUPPORT
5175  codeptr_ra,
5176 #endif
5177  task_dup);
5178  else
5179  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
5180  gr_size0, ext0, last_chunk0, tc0,
5181 #if OMPT_SUPPORT
5182  codeptr_ra,
5183 #endif
5184  task_dup);
5185 
5186  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
5187 }
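// Worked example of the split above: with num_tasks = 7, grainsize = 3,
// extras = 1, tc = 22 (22 == 7*3 + 1) and no strict modifier, the
// n_tsk0 > extras branch is taken: n_tsk0 = 3, n_tsk1 = 4, ext0 = 1, ext1 = 0,
// tc1 = 3*4 = 12 and tc0 = 22 - 12 = 10. The second half (4 tasks, 12
// iterations) is packaged into the auxiliary task; the first half (3 tasks,
// one of them one iteration longer, 10 iterations) is processed by this call.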
5188 
5189 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5190  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5191  int nogroup, int sched, kmp_uint64 grainsize,
5192  int modifier, void *task_dup) {
5193  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5194  KMP_DEBUG_ASSERT(task != NULL);
5195  if (nogroup == 0) {
5196 #if OMPT_SUPPORT && OMPT_OPTIONAL
5197  OMPT_STORE_RETURN_ADDRESS(gtid);
5198 #endif
5199  __kmpc_taskgroup(loc, gtid);
5200  }
5201 
5202 #if OMPX_TASKGRAPH
5203  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
5204 #endif
5205  // =========================================================================
5206  // calculate loop parameters
5207  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5208  kmp_uint64 tc;
5209  // compiler provides global bounds here
5210  kmp_uint64 lower = task_bounds.get_lb();
5211  kmp_uint64 upper = task_bounds.get_ub();
5212  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5213  kmp_uint64 num_tasks = 0, extras = 0;
5214  kmp_int64 last_chunk =
5215  0; // reduce grainsize of last task by last_chunk in strict mode
5216  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5217  kmp_info_t *thread = __kmp_threads[gtid];
5218  kmp_taskdata_t *current_task = thread->th.th_current_task;
5219 
5220  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5221  "grain %llu(%d, %d), dup %p\n",
5222  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5223  task_dup));
5224 
5225  // compute trip count
5226  if (st == 1) { // most common case
5227  tc = upper - lower + 1;
5228  } else if (st < 0) {
5229  tc = (lower - upper) / (-st) + 1;
5230  } else { // st > 0
5231  tc = (upper - lower) / st + 1;
5232  }
5233  if (tc == 0) {
5234  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5235  // free the pattern task and exit
5236  __kmp_task_start(gtid, task, current_task);
5237  // do not execute anything for zero-trip loop
5238  __kmp_task_finish<false>(gtid, task, current_task);
5239  return;
5240  }
5241 
5242 #if OMPT_SUPPORT && OMPT_OPTIONAL
5243  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5244  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5245  if (ompt_enabled.ompt_callback_work) {
5246  ompt_callbacks.ompt_callback(ompt_callback_work)(
5247  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5248  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5249  }
5250 #endif
5251 
5252  if (num_tasks_min == 0)
5253  // TODO: can we choose a better default heuristic?
5254  num_tasks_min =
5255  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5256 
5257  // compute num_tasks/grainsize based on the input provided
5258  switch (sched) {
5259  case 0: // no schedule clause specified, we can choose the default
5260  // let's try to schedule (team_size*10) tasks
5261  grainsize = thread->th.th_team_nproc * 10;
5262  KMP_FALLTHROUGH();
5263  case 2: // num_tasks provided
5264  if (grainsize > tc) {
5265  num_tasks = tc; // too big num_tasks requested, adjust values
5266  grainsize = 1;
5267  extras = 0;
5268  } else {
5269  num_tasks = grainsize;
5270  grainsize = tc / num_tasks;
5271  extras = tc % num_tasks;
5272  }
5273  break;
5274  case 1: // grainsize provided
5275  if (grainsize > tc) {
5276  num_tasks = 1;
5277  grainsize = tc; // too big grainsize requested, adjust values
5278  extras = 0;
5279  } else {
5280  if (modifier) {
5281  num_tasks = (tc + grainsize - 1) / grainsize;
5282  last_chunk = tc - (num_tasks * grainsize);
5283  extras = 0;
5284  } else {
5285  num_tasks = tc / grainsize;
5286  // adjust grainsize for balanced distribution of iterations
5287  grainsize = tc / num_tasks;
5288  extras = tc % num_tasks;
5289  }
5290  }
5291  break;
5292  default:
5293  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5294  }
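  // Worked example for the grainsize clause: with tc = 10 and grainsize = 4,
  // the non-strict path gives num_tasks = 10/4 = 2, then rebalances to
  // grainsize = 10/2 = 5, extras = 0 (two tasks of 5 iterations). With the
  // strict modifier, num_tasks = (10+4-1)/4 = 3 and last_chunk = 10 - 3*4 = -2,
  // i.e. tasks of 4, 4 and 2 iterations, which satisfies the assertion below:
  // 10 == 3*4 + (-2).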
5295 
5296  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5297  (last_chunk < 0 ? last_chunk : extras));
5298  KMP_DEBUG_ASSERT(num_tasks > extras);
5299  KMP_DEBUG_ASSERT(num_tasks > 0);
5300  // =========================================================================
5301 
5302  // check the if-clause value first
5303  // Also require GOMP_taskloop (taskdata->td_flags.native) to reduce to linear
5304  if (if_val == 0) { // if(0) specified, mark task as serial
5305  taskdata->td_flags.task_serial = 1;
5306  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5307  // always start serial tasks linearly
5308  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5309  grainsize, extras, last_chunk, tc,
5310 #if OMPT_SUPPORT
5311  OMPT_GET_RETURN_ADDRESS(0),
5312 #endif
5313  task_dup);
5314  // !taskdata->td_flags.native => currently force linear spawning of tasks
5315  // for GOMP_taskloop
5316  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5317  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5318  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5319  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5320  last_chunk));
5321  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5322  grainsize, extras, last_chunk, tc, num_tasks_min,
5323 #if OMPT_SUPPORT
5324  OMPT_GET_RETURN_ADDRESS(0),
5325 #endif
5326  task_dup);
5327  } else {
5328  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5329  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5330  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5331  last_chunk));
5332  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5333  grainsize, extras, last_chunk, tc,
5334 #if OMPT_SUPPORT
5335  OMPT_GET_RETURN_ADDRESS(0),
5336 #endif
5337  task_dup);
5338  }
5339 
5340 #if OMPT_SUPPORT && OMPT_OPTIONAL
5341  if (ompt_enabled.ompt_callback_work) {
5342  ompt_callbacks.ompt_callback(ompt_callback_work)(
5343  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5344  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5345  }
5346 #endif
5347 
5348  if (nogroup == 0) {
5349 #if OMPT_SUPPORT && OMPT_OPTIONAL
5350  OMPT_STORE_RETURN_ADDRESS(gtid);
5351 #endif
5352  __kmpc_end_taskgroup(loc, gtid);
5353  }
5354  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5355 }
5356 
5373 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5374  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5375  int sched, kmp_uint64 grainsize, void *task_dup) {
5376  __kmp_assert_valid_gtid(gtid);
5377  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5378  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5379  0, task_dup);
5380  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5381 }
5382 
5400 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5401  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5402  int nogroup, int sched, kmp_uint64 grainsize,
5403  int modifier, void *task_dup) {
5404  __kmp_assert_valid_gtid(gtid);
5405  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5406  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5407  modifier, task_dup);
5408  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5409 }
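// For illustration, assuming typical compiler lowering (body() and the bound
// variables are placeholders): a construct such as
//
//   #pragma omp taskloop grainsize(strict: 4)
//   for (int i = 0; i < n; ++i)
//     body(i);
//
// would reach the runtime as a pattern task plus a call resembling
//
//   __kmpc_taskloop_5(loc, gtid, task, if_val, &lb, &ub, st, /*nogroup=*/0,
//                     /*sched=*/1, /*grainsize=*/4, /*modifier=*/1, task_dup);
//
// with sched == 1 and modifier == 1 selecting the strict-grainsize path in
// __kmp_taskloop above; the exact arguments are compiler-dependent.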
5410 
5419 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5420  if (gtid == KMP_GTID_DNE)
5421  return NULL;
5422 
5423  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5424  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5425 
5426  if (!taskdata)
5427  return NULL;
5428 
5429  return &taskdata->td_target_data.async_handle;
5430 }
5431 
5440 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5441  if (gtid == KMP_GTID_DNE)
5442  return FALSE;
5443 
5444  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5445  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5446 
5447  if (!taskdata)
5448  return FALSE;
5449 
5450  return taskdata->td_task_team != NULL;
5451 }
5452 
5453 #if OMPX_TASKGRAPH
5454 // __kmp_find_tdg: identify a TDG through its ID
5455 // (lazily allocates the global TDG array on first use)
5456 // tdg_id: ID of the TDG
5457 // returns: If a TDG corresponding to this ID is found and is not in its
5458 // initial state, return a pointer to it; otherwise return nullptr
5459 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5460  kmp_tdg_info_t *res = nullptr;
5461  if (__kmp_max_tdgs == 0)
5462  return res;
5463 
5464  if (__kmp_global_tdgs == NULL)
5465  __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5466  sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5467 
5468  if ((__kmp_global_tdgs[tdg_id]) &&
5469  (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5470  res = __kmp_global_tdgs[tdg_id];
5471  return res;
5472 }
5473 
5474 // __kmp_print_tdg_dot: prints the TDG to a dot file
5476 // tdg: Pointer to the TDG to print
5476 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg) {
5477  kmp_int32 tdg_id = tdg->tdg_id;
5478  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): tdg_id=%d\n", tdg_id));
5479 
5480  char file_name[20];
5481  sprintf(file_name, "tdg_%d.dot", tdg_id);
5482  kmp_safe_raii_file_t tdg_file(file_name, "w");
5483 
5484  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5485  fprintf(tdg_file,
5486  "digraph TDG {\n"
5487  " compound=true\n"
5488  " subgraph cluster {\n"
5489  " label=TDG_%d\n",
5490  tdg_id);
5491  for (kmp_int32 i = 0; i < num_tasks; i++) {
5492  fprintf(tdg_file, " %d[style=bold]\n", i);
5493  }
5494  fprintf(tdg_file, " }\n");
5495  for (kmp_int32 i = 0; i < num_tasks; i++) {
5496  kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5497  kmp_int32 *successors = tdg->record_map[i].successors;
5498  if (nsuccessors > 0) {
5499  for (kmp_int32 j = 0; j < nsuccessors; j++)
5500  fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5501  }
5502  }
5503  fprintf(tdg_file, "}");
5504  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): tdg_id=%d\n", tdg_id));
5505 }
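// For a TDG with id 0, two tasks and a single edge 0 -> 1, the routine above
// writes (modulo whitespace):
//
//   digraph TDG {
//    compound=true
//    subgraph cluster {
//    label=TDG_0
//    0[style=bold]
//    1[style=bold]
//    }
//    0 -> 1
//   }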
5506 
5507 // __kmp_exec_tdg: launch the execution of a previously
5508 // recorded TDG
5509 // gtid: Global Thread ID
5510 // tdg: Pointer to the TDG to execute
5511 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5512  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5513  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5514  tdg->tdg_id, tdg->num_roots));
5515  kmp_node_info_t *this_record_map = tdg->record_map;
5516  kmp_int32 *this_root_tasks = tdg->root_tasks;
5517  kmp_int32 this_num_roots = tdg->num_roots;
5518  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5519 
5520  kmp_info_t *thread = __kmp_threads[gtid];
5521  kmp_taskdata_t *parent_task = thread->th.th_current_task;
5522 
5523  if (tdg->rec_taskred_data) {
5524  __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5525  }
5526 
5527  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5528  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5529 
5530  td->td_parent = parent_task;
5531  this_record_map[j].parent_task = parent_task;
5532 
5533  kmp_taskgroup_t *parent_taskgroup =
5534  this_record_map[j].parent_task->td_taskgroup;
5535 
5536  KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5537  this_record_map[j].npredecessors);
5538  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5539 
5540  if (parent_taskgroup) {
5541  KMP_ATOMIC_INC(&parent_taskgroup->count);
5542  // The taskgroup is different so we must update it
5543  td->td_taskgroup = parent_taskgroup;
5544  } else if (td->td_taskgroup != nullptr) {
5545  // If the parent doesn't have a taskgroup, remove it from the task
5546  td->td_taskgroup = nullptr;
5547  }
5548  if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5549  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5550  }
5551 
5552  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5553  __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5554  }
5555  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5556  tdg->tdg_id, tdg->num_roots));
5557 }
5558 
5559 // __kmp_start_record: set up a TDG structure and turn the
5560 // recording flag to true
5561 // gtid: Global Thread ID of the encountering thread
5562 // flags: Flags associated with the TDG
5563 // tdg_id: ID of the TDG to record
5564 static inline void __kmp_start_record(kmp_int32 gtid,
5565  kmp_taskgraph_flags_t *flags,
5566  kmp_int32 tdg_id) {
5567  kmp_tdg_info_t *tdg =
5568  (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5569  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5570  // Initializing the TDG structure
5571  tdg->tdg_id = tdg_id;
5572  tdg->map_size = INIT_MAPSIZE;
5573  tdg->num_roots = -1;
5574  tdg->root_tasks = nullptr;
5575  tdg->tdg_status = KMP_TDG_RECORDING;
5576  tdg->rec_num_taskred = 0;
5577  tdg->rec_taskred_data = nullptr;
5578  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5579 
5580  // Initializing the list of nodes in this TDG
5581  kmp_node_info_t *this_record_map =
5582  (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5583  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5584  kmp_int32 *successorsList =
5585  (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5586  this_record_map[i].task = nullptr;
5587  this_record_map[i].successors = successorsList;
5588  this_record_map[i].nsuccessors = 0;
5589  this_record_map[i].npredecessors = 0;
5590  this_record_map[i].successors_size = __kmp_successors_size;
5591  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5592  }
5593 
5594  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5595 }
5596 
5597 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5598 // the beginning of the record process of a task region
5599 // loc_ref: Location of TDG, not used yet
5600 // gtid: Global Thread ID of the encountering thread
5601 // input_flags: Flags associated with the TDG
5602 // tdg_id: ID of the TDG to record; for now, an incremental integer
5603 // returns: 1 if the caller executes (and possibly records) the region, 0 if a recorded TDG is replayed
5604 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5605  kmp_int32 input_flags, kmp_int32 tdg_id) {
5606 
5607  kmp_int32 res;
5608  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5609  KA_TRACE(10,
5610  ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5611  gtid, loc_ref, input_flags, tdg_id));
5612 
5613  if (__kmp_max_tdgs == 0) {
5614  KA_TRACE(
5615  10,
5616  ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5617  "__kmp_max_tdgs = 0\n",
5618  gtid, loc_ref, input_flags, tdg_id));
5619  return 1;
5620  }
5621 
5622  __kmpc_taskgroup(loc_ref, gtid);
5623  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5624  // TODO: use re_record flag
5625  __kmp_exec_tdg(gtid, tdg);
5626  res = 0;
5627  } else {
5628  __kmp_curr_tdg_idx = tdg_id;
5629  KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5630  __kmp_start_record(gtid, flags, tdg_id);
5631  __kmp_num_tdg++;
5632  res = 1;
5633  }
5634  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5635  gtid, tdg_id, res ? "record" : "execute"));
5636  return res;
5637 }
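// For illustration, assuming the usual record-and-replay contract (the exact
// codegen is outside this file): a caller is expected to bracket the region as
//
//   if (__kmpc_start_record_task(loc, gtid, flags, tdg_id)) {
//     // ... create the tasks of the region; they are recorded into the TDG
//   }
//   __kmpc_end_record_task(loc, gtid, flags, tdg_id);
//
// so the first execution records the graph and subsequent executions replay it
// through __kmp_exec_tdg from within __kmpc_start_record_task.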
5638 
5639 // __kmp_end_record: set up a TDG after recording it
5640 // gtid: Global thread ID
5641 // tdg: Pointer to the TDG
5642 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5643  // Store roots
5644  kmp_node_info_t *this_record_map = tdg->record_map;
5645  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5646  kmp_int32 *this_root_tasks =
5647  (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5648  kmp_int32 this_map_size = tdg->map_size;
5649  kmp_int32 this_num_roots = 0;
5650  kmp_info_t *thread = __kmp_threads[gtid];
5651 
5652  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5653  if (this_record_map[i].npredecessors == 0) {
5654  this_root_tasks[this_num_roots++] = i;
5655  }
5656  }
5657 
5658  // Update with roots info and mapsize
5659  tdg->map_size = this_map_size;
5660  tdg->num_roots = this_num_roots;
5661  tdg->root_tasks = this_root_tasks;
5662  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5663  tdg->tdg_status = KMP_TDG_READY;
5664 
5665  if (thread->th.th_current_task->td_dephash) {
5666  __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5667  thread->th.th_current_task->td_dephash = NULL;
5668  }
5669 
5670  // Reset predecessor counter
5671  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5672  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5673  this_record_map[i].npredecessors);
5674  }
5675  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5676 
5677  if (__kmp_tdg_dot)
5678  __kmp_print_tdg_dot(tdg);
5679 }
5680 
5681 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5682 // the end of recording phase
5683 //
5684 // loc_ref: Source location information
5685 // gtid: Global thread ID
5686 // input_flags: Flags attached to the graph
5687 // tdg_id: ID of the TDG just finished recording
5688 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5689  kmp_int32 input_flags, kmp_int32 tdg_id) {
5690  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5691 
5692  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5693  " tdg=%d with flags=%d\n",
5694  gtid, loc_ref, tdg_id, input_flags));
5695  if (__kmp_max_tdgs) {
5696  // TODO: use input_flags->nowait
5697  __kmpc_end_taskgroup(loc_ref, gtid);
5698  if (__kmp_tdg_is_recording(tdg->tdg_status))
5699  __kmp_end_record(gtid, tdg);
5700  }
5701  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5702  " tdg=%d, its status is now READY\n",
5703  gtid, loc_ref, tdg_id));
5704 }
5705 #endif