LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #if ENABLE_LIBOMPTARGET
25 static void (*tgt_target_nowait_query)(void **);
26 
27 void __kmp_init_target_task() {
28  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
29 }
30 #endif
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34  kmp_info_t *this_thr);
35 static void __kmp_alloc_task_deque(kmp_info_t *thread,
36  kmp_thread_data_t *thread_data);
37 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38  kmp_task_team_t *task_team);
39 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40 #if OMPX_TASKGRAPH
41 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42 int __kmp_taskloop_task(int gtid, void *ptask);
43 #endif
44 
45 #ifdef BUILD_TIED_TASK_STACK
46 
47 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
48 // from top to bottom
49 //
50 // gtid: global thread identifier for thread containing stack
51 // thread_data: thread data for task team thread containing stack
52 // threshold: value above which the trace statement triggers
53 // location: string identifying call site of this function (for trace)
54 static void __kmp_trace_task_stack(kmp_int32 gtid,
55  kmp_thread_data_t *thread_data,
56  int threshold, char *location) {
57  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
58  kmp_taskdata_t **stack_top = task_stack->ts_top;
59  kmp_int32 entries = task_stack->ts_entries;
60  kmp_taskdata_t *tied_task;
61 
62  KA_TRACE(
63  threshold,
64  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
65  "first_block = %p, stack_top = %p \n",
66  location, gtid, entries, task_stack->ts_first_block, stack_top));
67 
68  KMP_DEBUG_ASSERT(stack_top != NULL);
69  KMP_DEBUG_ASSERT(entries > 0);
70 
71  while (entries != 0) {
72  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
73  // fix up ts_top if we need to pop from previous block
74  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
75  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
76 
77  stack_block = stack_block->sb_prev;
78  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
79  }
80 
81  // finish bookkeeping
82  stack_top--;
83  entries--;
84 
85  tied_task = *stack_top;
86 
87  KMP_DEBUG_ASSERT(tied_task != NULL);
88  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
89 
90  KA_TRACE(threshold,
91  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
92  "stack_top=%p, tied_task=%p\n",
93  location, gtid, entries, stack_top, tied_task));
94  }
95  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
96 
97  KA_TRACE(threshold,
98  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
99  location, gtid));
100 }
101 
102 // __kmp_init_task_stack: initialize the task stack for the first time
103 // after a thread_data structure is created.
104 // It should not be necessary to do this again (assuming the stack works).
105 //
106 // gtid: global thread identifier of calling thread
107 // thread_data: thread data for task team thread containing stack
108 static void __kmp_init_task_stack(kmp_int32 gtid,
109  kmp_thread_data_t *thread_data) {
110  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
111  kmp_stack_block_t *first_block;
112 
113  // set up the first block of the stack
114  first_block = &task_stack->ts_first_block;
115  task_stack->ts_top = (kmp_taskdata_t **)first_block;
116  memset((void *)first_block, '\0',
117  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
118 
119  // initialize the stack to be empty
120  task_stack->ts_entries = TASK_STACK_EMPTY;
121  first_block->sb_next = NULL;
122  first_block->sb_prev = NULL;
123 }
124 
125 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 // gtid: global thread identifier for calling thread
128 // thread_data: thread info for thread containing stack
129 static void __kmp_free_task_stack(kmp_int32 gtid,
130  kmp_thread_data_t *thread_data) {
131  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
132  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
133 
134  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
135  // free from the second block of the stack
136  while (stack_block != NULL) {
137  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
138 
139  stack_block->sb_next = NULL;
140  stack_block->sb_prev = NULL;
141  if (stack_block != &task_stack->ts_first_block) {
142  __kmp_thread_free(__kmp_thread_from_gtid(gtid),
143  stack_block); // free the block, if not the first
144  }
145  stack_block = next_block;
146  }
147  // initialize the stack to be empty
148  task_stack->ts_entries = 0;
149  task_stack->ts_top = NULL;
150 }
151 
152 // __kmp_push_task_stack: Push the tied task onto the task stack.
153 // Grow the stack if necessary by allocating another block.
154 //
155 // gtid: global thread identifier for calling thread
156 // thread: thread info for thread containing stack
157 // tied_task: the task to push on the stack
158 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
159  kmp_taskdata_t *tied_task) {
160  // GEH - need to consider what to do if tt_threads_data not allocated yet
161  kmp_thread_data_t *thread_data =
162  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
163  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
164 
165  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
166  return; // Don't push anything on stack if team or team tasks are serialized
167  }
168 
169  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
170  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
171 
172  KA_TRACE(20,
173  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
174  gtid, thread, tied_task));
175  // Store entry
176  *(task_stack->ts_top) = tied_task;
177 
178  // Do bookkeeping for next push
179  task_stack->ts_top++;
180  task_stack->ts_entries++;
181 
182  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
183  // Find beginning of this task block
184  kmp_stack_block_t *stack_block =
185  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
186 
187  // Check if we already have a block
188  if (stack_block->sb_next !=
189  NULL) { // reset ts_top to beginning of next block
190  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
191  } else { // Alloc new block and link it up
192  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
193  thread, sizeof(kmp_stack_block_t));
194 
195  task_stack->ts_top = &new_block->sb_block[0];
196  stack_block->sb_next = new_block;
197  new_block->sb_prev = stack_block;
198  new_block->sb_next = NULL;
199 
200  KA_TRACE(
201  30,
202  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
203  gtid, tied_task, new_block));
204  }
205  }
206  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207  tied_task));
208 }
209 
210 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
211 // the task, just check to make sure it matches the ending task passed in.
212 //
213 // gtid: global thread identifier for the calling thread
214 // thread: thread info structure containing stack
215 // ending_task: the task that is ending (should match the task popped off
216 // the stack)
217 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
218  kmp_taskdata_t *ending_task) {
219  // GEH - need to consider what to do if tt_threads_data not allocated yet
220  kmp_thread_data_t *thread_data =
221  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
222  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
223  kmp_taskdata_t *tied_task;
224 
225  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
226  // Don't pop anything from stack if team or team tasks are serialized
227  return;
228  }
229 
230  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
231  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
232 
233  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
234  thread));
235 
236  // fix up ts_top if we need to pop from previous block
237  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
238  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
239 
240  stack_block = stack_block->sb_prev;
241  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
242  }
243 
244  // finish bookkeeping
245  task_stack->ts_top--;
246  task_stack->ts_entries--;
247 
248  tied_task = *(task_stack->ts_top);
249 
250  KMP_DEBUG_ASSERT(tied_task != NULL);
251  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
252  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
253 
254  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
255  tied_task));
256  return;
257 }
258 #endif /* BUILD_TIED_TASK_STACK */
259 
260 // returns 1 if new task is allowed to execute, 0 otherwise
261 // checks Task Scheduling constraint (if requested) and
262 // mutexinoutset dependencies if any
263 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
264  const kmp_taskdata_t *tasknew,
265  const kmp_taskdata_t *taskcurr) {
266  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
267  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
268  // only descendant of all deferred tied tasks can be scheduled, checking
269  // the last one is enough, as it in turn is the descendant of all others
270  kmp_taskdata_t *current = taskcurr->td_last_tied;
271  KMP_DEBUG_ASSERT(current != NULL);
272  // check if the task is not suspended on barrier
273  if (current->td_flags.tasktype == TASK_EXPLICIT ||
274  current->td_taskwait_thread > 0) { // <= 0 on barrier
275  kmp_int32 level = current->td_level;
276  kmp_taskdata_t *parent = tasknew->td_parent;
277  while (parent != current && parent->td_level > level) {
278  // check generation up to the level of the current task
279  parent = parent->td_parent;
280  KMP_DEBUG_ASSERT(parent != NULL);
281  }
282  if (parent != current)
283  return false;
284  }
285  }
286  // Check mutexinoutset dependencies, acquire locks
287  kmp_depnode_t *node = tasknew->td_depnode;
288 #if OMPX_TASKGRAPH
289  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
290 #else
291  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
292 #endif
293  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
294  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
295  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
296  continue;
297  // could not get the lock, release previous locks
298  for (int j = i - 1; j >= 0; --j)
299  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
300  return false;
301  }
302  // negative num_locks means all locks acquired successfully
303  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
304  }
305  return true;
306 }
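/* Illustrative sketch (not part of the runtime): the Task Scheduling
   Constraint check above only needs to walk the new task's ancestry up to
   the level of the last deferred tied task. A minimal standalone version of
   that walk, assuming a hypothetical node type with just parent/level fields:

     struct tsc_node { tsc_node *parent; int level; };

     static bool tsc_is_descendant(const tsc_node *tasknew,
                                   const tsc_node *current) {
       const tsc_node *p = tasknew->parent;
       while (p != current && p->level > current->level)
         p = p->parent; // climb one generation per iteration
       return p == current;
     }
*/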
307 
308 // __kmp_realloc_task_deque:
309 // Re-allocates a task deque for a particular thread, copies the content from
310 // the old deque and adjusts the necessary data structures relating to the
311 // deque. This operation must be done with the deque_lock being held
312 static void __kmp_realloc_task_deque(kmp_info_t *thread,
313  kmp_thread_data_t *thread_data) {
314  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
315  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
316  kmp_int32 new_size = 2 * size;
317 
318  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
319  "%d] for thread_data %p\n",
320  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
321 
322  kmp_taskdata_t **new_deque =
323  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
324 
325  int i, j;
326  for (i = thread_data->td.td_deque_head, j = 0; j < size;
327  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
328  new_deque[j] = thread_data->td.td_deque[i];
329 
330  __kmp_free(thread_data->td.td_deque);
331 
332  thread_data->td.td_deque_head = 0;
333  thread_data->td.td_deque_tail = size;
334  thread_data->td.td_deque = new_deque;
335  thread_data->td.td_deque_size = new_size;
336 }
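/* Illustrative sketch (not part of the runtime): the deque is a power-of-two
   ring buffer, so TASK_DEQUE_MASK(td) is size-1 and the loop above rebases
   the `size` live entries onto indices 0..size-1 of the doubled array,
   leaving head == 0 and tail == size. The same step on a hypothetical
   stand-alone ring of void* entries:

     #include <cstdlib>

     struct ring { void **buf; int size, head, tail; };

     static void ring_grow(ring *r) { // caller holds the ring's lock; ring is full
       int new_size = 2 * r->size;
       void **nb = (void **)calloc(new_size, sizeof(void *));
       for (int i = r->head, j = 0; j < r->size;
            i = (i + 1) & (r->size - 1), j++)
         nb[j] = r->buf[i]; // oldest entry lands at index 0
       free(r->buf);
       r->buf = nb;
       r->head = 0;
       r->tail = r->size; // the old `size` live entries now occupy [0, size)
       r->size = new_size;
     }
*/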
337 
338 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
339  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
340  kmp_thread_data_t *thread_data = &l->td;
341  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
342  thread_data->td.td_deque_last_stolen = -1;
343  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
344  "for thread_data %p\n",
345  __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
346  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
347  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
348  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
349  return l;
350 }
351 
352 // The function finds the deque of priority tasks with given priority, or
353 // allocates a new deque and puts it into the sorted (high -> low) list of deques.
354 // Deques of non-default priority tasks are shared between all threads in team,
355 // as opposed to per-thread deques of tasks with default priority.
356 // The function is called under the lock task_team->tt.tt_task_pri_lock.
357 static kmp_thread_data_t *
358 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
359  kmp_thread_data_t *thread_data;
360  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
361  if (lst->priority == pri) {
362  // Found queue of tasks with given priority.
363  thread_data = &lst->td;
364  } else if (lst->priority < pri) {
365  // All current priority queues contain tasks with lower priority.
366  // Allocate new one for given priority tasks.
367  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
368  thread_data = &list->td;
369  list->priority = pri;
370  list->next = lst;
371  task_team->tt.tt_task_pri_list = list;
372  } else { // task_team->tt.tt_task_pri_list->priority > pri
373  kmp_task_pri_t *next_queue = lst->next;
374  while (next_queue && next_queue->priority > pri) {
375  lst = next_queue;
376  next_queue = lst->next;
377  }
378  // lst->priority > pri && (next == NULL || pri >= next->priority)
379  if (next_queue == NULL) {
380  // No queue with pri priority, need to allocate new one.
381  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
382  thread_data = &list->td;
383  list->priority = pri;
384  list->next = NULL;
385  lst->next = list;
386  } else if (next_queue->priority == pri) {
387  // Found queue of tasks with given priority.
388  thread_data = &next_queue->td;
389  } else { // lst->priority > pri > next->priority
390  // insert newly allocated between existed queues
391  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
392  thread_data = &list->td;
393  list->priority = pri;
394  list->next = next_queue;
395  lst->next = list;
396  }
397  }
398  return thread_data;
399 }
400 
401 // __kmp_push_priority_task: Add a task to the team's priority task deque
402 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
403  kmp_taskdata_t *taskdata,
404  kmp_task_team_t *task_team,
405  kmp_int32 pri) {
406  kmp_thread_data_t *thread_data = NULL;
407  KA_TRACE(20,
408  ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
409  gtid, taskdata, pri));
410 
411  // Find task queue specific to priority value
412  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
413  if (UNLIKELY(lst == NULL)) {
414  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
415  if (task_team->tt.tt_task_pri_list == NULL) {
416  // List of queues is still empty, allocate one.
417  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
418  thread_data = &list->td;
419  list->priority = pri;
420  list->next = NULL;
421  task_team->tt.tt_task_pri_list = list;
422  } else {
423  // Another thread initialized a queue. Check if it fits and get thread_data.
424  thread_data = __kmp_get_priority_deque_data(task_team, pri);
425  }
426  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
427  } else {
428  if (lst->priority == pri) {
429  // Found queue of tasks with given priority.
430  thread_data = &lst->td;
431  } else {
432  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
433  thread_data = __kmp_get_priority_deque_data(task_team, pri);
434  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
435  }
436  }
437  KMP_DEBUG_ASSERT(thread_data);
438 
439  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
440  // Check if deque is full
441  if (TCR_4(thread_data->td.td_deque_ntasks) >=
442  TASK_DEQUE_SIZE(thread_data->td)) {
443  if (__kmp_enable_task_throttling &&
444  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
445  thread->th.th_current_task)) {
446  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
447  KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
448  "TASK_NOT_PUSHED for task %p\n",
449  gtid, taskdata));
450  return TASK_NOT_PUSHED;
451  } else {
452  // expand deque to push the task which is not allowed to execute
453  __kmp_realloc_task_deque(thread, thread_data);
454  }
455  }
456  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
457  TASK_DEQUE_SIZE(thread_data->td));
458  // Push taskdata.
459  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
460  // Wrap index.
461  thread_data->td.td_deque_tail =
462  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
463  TCW_4(thread_data->td.td_deque_ntasks,
464  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
465  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
466  KMP_FSYNC_RELEASING(taskdata); // releasing child
467  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
468  "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
469  gtid, taskdata, thread_data->td.td_deque_ntasks,
470  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
471  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
472  task_team->tt.tt_num_task_pri++; // atomic inc
473  return TASK_SUCCESSFULLY_PUSHED;
474 }
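/* For reference, tasks reach this priority path only when the program uses
   the OpenMP priority() clause and a non-zero priority ceiling is configured
   (e.g. OMP_MAX_TASK_PRIORITY=10, which is reflected in
   __kmp_max_task_priority). A minimal user-side example, assuming such an
   environment setting:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp parallel
       #pragma omp single
       for (int i = 0; i < 8; ++i) {
         #pragma omp task priority(i) // hint: larger value = more urgent
         printf("task %d (priority ceiling %d)\n", i,
                omp_get_max_task_priority());
       }
       return 0;
     }
*/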
475 
476 // __kmp_push_task: Add a task to the thread's deque
477 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
478  kmp_info_t *thread = __kmp_threads[gtid];
479  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
480 
481  // If we encounter a hidden helper task, and the current thread is not a
482  // hidden helper thread, we have to give the task to any hidden helper thread
483  // starting from its shadow one.
484  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
485  !KMP_HIDDEN_HELPER_THREAD(gtid))) {
486  kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
487  __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
488  // Signal the hidden helper threads.
489  __kmp_hidden_helper_worker_thread_signal();
490  return TASK_SUCCESSFULLY_PUSHED;
491  }
492 
493  kmp_task_team_t *task_team = thread->th.th_task_team;
494  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
495  kmp_thread_data_t *thread_data;
496 
497  KA_TRACE(20,
498  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
499 
500  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
501  // untied task needs to increment counter so that the task structure is not
502  // freed prematurely
503  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
504  KMP_DEBUG_USE_VAR(counter);
505  KA_TRACE(
506  20,
507  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
508  gtid, counter, taskdata));
509  }
510 
511  // The first check avoids building task_team thread data if serialized
512  if (UNLIKELY(taskdata->td_flags.task_serial)) {
513  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
514  "TASK_NOT_PUSHED for task %p\n",
515  gtid, taskdata));
516  return TASK_NOT_PUSHED;
517  }
518 
519  // Now that serialized tasks have returned, we can assume that we are not in
520  // immediate exec mode
521  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
522  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
523  __kmp_enable_tasking(task_team, thread);
524  }
525  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
526  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
527 
528  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
529  __kmp_max_task_priority > 0) {
530  int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
531  return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
532  }
533 
534  // Find tasking deque specific to encountering thread
535  thread_data = &task_team->tt.tt_threads_data[tid];
536 
537  // No lock needed since only owner can allocate. If the task is hidden_helper,
538  // we don't need it either because we have initialized the deque for hidden
539  // helper thread data.
540  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
541  __kmp_alloc_task_deque(thread, thread_data);
542  }
543 
544  int locked = 0;
545  // Check if deque is full
546  if (TCR_4(thread_data->td.td_deque_ntasks) >=
547  TASK_DEQUE_SIZE(thread_data->td)) {
548  if (__kmp_enable_task_throttling &&
549  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
550  thread->th.th_current_task)) {
551  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
552  "TASK_NOT_PUSHED for task %p\n",
553  gtid, taskdata));
554  return TASK_NOT_PUSHED;
555  } else {
556  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
557  locked = 1;
558  if (TCR_4(thread_data->td.td_deque_ntasks) >=
559  TASK_DEQUE_SIZE(thread_data->td)) {
560  // expand deque to push the task which is not allowed to execute
561  __kmp_realloc_task_deque(thread, thread_data);
562  }
563  }
564  }
565  // Lock the deque for the task push operation
566  if (!locked) {
567  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
568  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
569  if (TCR_4(thread_data->td.td_deque_ntasks) >=
570  TASK_DEQUE_SIZE(thread_data->td)) {
571  if (__kmp_enable_task_throttling &&
572  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
573  thread->th.th_current_task)) {
574  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
575  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
576  "returning TASK_NOT_PUSHED for task %p\n",
577  gtid, taskdata));
578  return TASK_NOT_PUSHED;
579  } else {
580  // expand deque to push the task which is not allowed to execute
581  __kmp_realloc_task_deque(thread, thread_data);
582  }
583  }
584  }
585  // Must have room since no thread but the calling thread can add tasks
586  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
587  TASK_DEQUE_SIZE(thread_data->td));
588 
589  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
590  taskdata; // Push taskdata
591  // Wrap index.
592  thread_data->td.td_deque_tail =
593  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
594  TCW_4(thread_data->td.td_deque_ntasks,
595  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
596  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
597  KMP_FSYNC_RELEASING(taskdata); // releasing child
598  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
599  "task=%p ntasks=%d head=%u tail=%u\n",
600  gtid, taskdata, thread_data->td.td_deque_ntasks,
601  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
602 
603  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
604 
605  return TASK_SUCCESSFULLY_PUSHED;
606 }
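/* For reference, a deferred task created in user code, e.g.

     #include <stdio.h>

     int main(void) {
       #pragma omp parallel
       #pragma omp single
       {
         #pragma omp task
         printf("hello from a deferred task\n");
       }
       return 0;
     }

   is typically lowered by the compiler into calls to __kmpc_omp_task_alloc()
   and __kmpc_omp_task(), and the latter normally ends up here to enqueue the
   task on the encountering thread's deque (or, as handled above, on a hidden
   helper thread's deque or a shared priority deque). */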
607 
608 // __kmp_pop_current_task_from_thread: restore the given thread's current task
609 // to its parent when a team ends
610 //
611 // this_thr: thread structure to set current_task in.
612 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
613  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
614  "this_thread=%p, curtask=%p, "
615  "curtask_parent=%p\n",
616  0, this_thr, this_thr->th.th_current_task,
617  this_thr->th.th_current_task->td_parent));
618 
619  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
620 
621  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
622  "this_thread=%p, curtask=%p, "
623  "curtask_parent=%p\n",
624  0, this_thr, this_thr->th.th_current_task,
625  this_thr->th.th_current_task->td_parent));
626 }
627 
628 // __kmp_push_current_task_to_thread: set up the current task in the given
629 // thread for a new team
630 //
631 // this_thr: thread structure to set up
632 // team: team for implicit task data
633 // tid: thread within team to set up
634 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
635  int tid) {
636  // The thread's current task becomes the parent of the newly created
637  // implicit tasks of the new team
638  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
639  "curtask=%p "
640  "parent_task=%p\n",
641  tid, this_thr, this_thr->th.th_current_task,
642  team->t.t_implicit_task_taskdata[tid].td_parent));
643 
644  KMP_DEBUG_ASSERT(this_thr != NULL);
645 
646  if (tid == 0) {
647  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
648  team->t.t_implicit_task_taskdata[0].td_parent =
649  this_thr->th.th_current_task;
650  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
651  }
652  } else {
653  team->t.t_implicit_task_taskdata[tid].td_parent =
654  team->t.t_implicit_task_taskdata[0].td_parent;
655  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
656  }
657 
658  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
659  "curtask=%p "
660  "parent_task=%p\n",
661  tid, this_thr, this_thr->th.th_current_task,
662  team->t.t_implicit_task_taskdata[tid].td_parent));
663 }
664 
665 // __kmp_task_start: bookkeeping for a task starting execution
666 //
667 // GTID: global thread id of calling thread
668 // task: task starting execution
669 // current_task: task suspending
670 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
671  kmp_taskdata_t *current_task) {
672  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
673  kmp_info_t *thread = __kmp_threads[gtid];
674 
675  KA_TRACE(10,
676  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
677  gtid, taskdata, current_task));
678 
679  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
680 
681  // mark currently executing task as suspended
682  // TODO: GEH - make sure root team implicit task is initialized properly.
683  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
684  current_task->td_flags.executing = 0;
685 
686 // Add task to stack if tied
687 #ifdef BUILD_TIED_TASK_STACK
688  if (taskdata->td_flags.tiedness == TASK_TIED) {
689  __kmp_push_task_stack(gtid, thread, taskdata);
690  }
691 #endif /* BUILD_TIED_TASK_STACK */
692 
693  // mark starting task as executing and as current task
694  thread->th.th_current_task = taskdata;
695 
696  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
697  taskdata->td_flags.tiedness == TASK_UNTIED);
698  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
699  taskdata->td_flags.tiedness == TASK_UNTIED);
700  taskdata->td_flags.started = 1;
701  taskdata->td_flags.executing = 1;
702  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
703  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
704 
705  // GEH TODO: shouldn't we pass some sort of location identifier here?
706  // APT: yes, we will pass location here.
707  // need to store current thread state (in a thread or taskdata structure)
708  // before setting work_state, otherwise wrong state is set after end of task
709 
710  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
711 
712  return;
713 }
714 
715 #if OMPT_SUPPORT
716 //------------------------------------------------------------------------------
717 // __ompt_task_init:
718 // Initialize OMPT fields maintained by a task. This will only be called after
719 // ompt_start_tool, so we already know whether ompt is enabled or not.
720 
721 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
722  // The calls to __ompt_task_init already have the ompt_enabled condition.
723  task->ompt_task_info.task_data.value = 0;
724  task->ompt_task_info.frame.exit_frame = ompt_data_none;
725  task->ompt_task_info.frame.enter_frame = ompt_data_none;
726  task->ompt_task_info.frame.exit_frame_flags =
727  ompt_frame_runtime | ompt_frame_framepointer;
728  task->ompt_task_info.frame.enter_frame_flags =
729  ompt_frame_runtime | ompt_frame_framepointer;
730  task->ompt_task_info.dispatch_chunk.start = 0;
731  task->ompt_task_info.dispatch_chunk.iterations = 0;
732 }
733 
734 // __ompt_task_start:
735 // Build and trigger task-begin event
736 static inline void __ompt_task_start(kmp_task_t *task,
737  kmp_taskdata_t *current_task,
738  kmp_int32 gtid) {
739  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
740  ompt_task_status_t status = ompt_task_switch;
741  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
742  status = ompt_task_yield;
743  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
744  }
745  /* let OMPT know that we're about to run this task */
746  if (ompt_enabled.ompt_callback_task_schedule) {
747  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
748  &(current_task->ompt_task_info.task_data), status,
749  &(taskdata->ompt_task_info.task_data));
750  }
751  taskdata->ompt_task_info.scheduling_parent = current_task;
752 }
753 
754 // __ompt_task_finish:
755 // Build and trigger final task-schedule event
756 static inline void __ompt_task_finish(kmp_task_t *task,
757  kmp_taskdata_t *resumed_task,
758  ompt_task_status_t status) {
759  if (ompt_enabled.ompt_callback_task_schedule) {
760  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
761  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
762  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
763  status = ompt_task_cancel;
764  }
765 
766  /* let OMPT know that we're returning to the callee task */
767  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
768  &(taskdata->ompt_task_info.task_data), status,
769  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
770  }
771 }
772 #endif
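/* Illustrative sketch (not part of the runtime): a first-party OMPT tool that
   observes the task-schedule events emitted by __ompt_task_start and
   __ompt_task_finish above. Names such as on_task_schedule/tool_initialize
   are hypothetical; the OMPT entry points and types come from omp-tools.h.

     #include <omp-tools.h>
     #include <stdio.h>

     static void on_task_schedule(ompt_data_t *prior_task_data,
                                  ompt_task_status_t prior_task_status,
                                  ompt_data_t *next_task_data) {
       printf("task switch: status=%d\n", (int)prior_task_status);
     }

     static int tool_initialize(ompt_function_lookup_t lookup,
                                int initial_device_num,
                                ompt_data_t *tool_data) {
       ompt_set_callback_t set_cb =
           (ompt_set_callback_t)lookup("ompt_set_callback");
       set_cb(ompt_callback_task_schedule, (ompt_callback_t)on_task_schedule);
       return 1; // non-zero keeps the tool active
     }

     static void tool_finalize(ompt_data_t *tool_data) {}

     ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                               const char *runtime_version) {
       static ompt_start_tool_result_t result = {tool_initialize, tool_finalize,
                                                 {0}};
       return &result;
     }
*/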
773 
774 template <bool ompt>
775 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
776  kmp_task_t *task,
777  void *frame_address,
778  void *return_address) {
779  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
780  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
781 
782  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
783  "current_task=%p\n",
784  gtid, loc_ref, taskdata, current_task));
785 
786  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
787  // untied task needs to increment counter so that the task structure is not
788  // freed prematurely
789  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
790  KMP_DEBUG_USE_VAR(counter);
791  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
792  "incremented for task %p\n",
793  gtid, counter, taskdata));
794  }
795 
796  taskdata->td_flags.task_serial =
797  1; // Execute this task immediately, not deferred.
798  __kmp_task_start(gtid, task, current_task);
799 
800 #if OMPT_SUPPORT
801  if (ompt) {
802  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
803  current_task->ompt_task_info.frame.enter_frame.ptr =
804  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
805  current_task->ompt_task_info.frame.enter_frame_flags =
806  taskdata->ompt_task_info.frame.exit_frame_flags =
807  ompt_frame_application | ompt_frame_framepointer;
808  }
809  if (ompt_enabled.ompt_callback_task_create) {
810  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
811  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
812  &(parent_info->task_data), &(parent_info->frame),
813  &(taskdata->ompt_task_info.task_data),
814  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
815  return_address);
816  }
817  __ompt_task_start(task, current_task, gtid);
818  }
819 #endif // OMPT_SUPPORT
820 
821  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
822  loc_ref, taskdata));
823 }
824 
825 #if OMPT_SUPPORT
826 OMPT_NOINLINE
827 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
828  kmp_task_t *task,
829  void *frame_address,
830  void *return_address) {
831  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
832  return_address);
833 }
834 #endif // OMPT_SUPPORT
835 
836 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
837 // execution
838 //
839 // loc_ref: source location information; points to beginning of task block.
840 // gtid: global thread number.
841 // task: task thunk for the started task.
842 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
843  kmp_task_t *task) {
844 #if OMPT_SUPPORT
845  if (UNLIKELY(ompt_enabled.enabled)) {
846  OMPT_STORE_RETURN_ADDRESS(gtid);
847  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
848  OMPT_GET_FRAME_ADDRESS(1),
849  OMPT_LOAD_RETURN_ADDRESS(gtid));
850  return;
851  }
852 #endif
853  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
854 }
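/* For reference, this entry point brackets an undeferred task. Code such as

     #include <stdio.h>

     int main(void) {
       #pragma omp parallel
       #pragma omp single
       {
         int x = 42;
         #pragma omp task if(0) // undeferred: executed immediately
         printf("x = %d\n", x);
       }
       return 0;
     }

   is typically lowered so that the task thunk is invoked inline between
   __kmpc_omp_task_begin_if0() and __kmpc_omp_task_complete_if0() (defined
   later in this file), rather than being pushed onto a deque. */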
855 
856 #ifdef TASK_UNUSED
857 // __kmpc_omp_task_begin: report that a given task has started execution
858 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
859 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
860  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
861 
862  KA_TRACE(
863  10,
864  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
865  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
866 
867  __kmp_task_start(gtid, task, current_task);
868 
869  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
870  loc_ref, KMP_TASK_TO_TASKDATA(task)));
871  return;
872 }
873 #endif // TASK_UNUSED
874 
875 // __kmp_free_task: free the current task space and the space for shareds
876 //
877 // gtid: Global thread ID of calling thread
878 // taskdata: task to free
879 // thread: thread data structure of caller
880 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
881  kmp_info_t *thread) {
882  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
883  taskdata));
884 
885  // Check to make sure all flags and counters have the correct values
886  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
887  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
888  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
889  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
890  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
891  taskdata->td_flags.task_serial == 1);
892  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
893  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
894  // Clear data so it is not re-used later by mistake.
895  task->data1.destructors = NULL;
896  task->data2.priority = 0;
897 
898  taskdata->td_flags.freed = 1;
899 #if OMPX_TASKGRAPH
900  // do not free tasks in taskgraph
901  if (!taskdata->is_taskgraph) {
902 #endif
903 // deallocate the taskdata and shared variable blocks associated with this task
904 #if USE_FAST_MEMORY
905  __kmp_fast_free(thread, taskdata);
906 #else /* ! USE_FAST_MEMORY */
907  __kmp_thread_free(thread, taskdata);
908 #endif
909 #if OMPX_TASKGRAPH
910  } else {
911  taskdata->td_flags.complete = 0;
912  taskdata->td_flags.started = 0;
913  taskdata->td_flags.freed = 0;
914  taskdata->td_flags.executing = 0;
915  taskdata->td_flags.task_serial =
916  (taskdata->td_parent->td_flags.final ||
917  taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
918 
919  // taskdata->td_allow_completion_event.pending_events_count = 1;
920  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
921  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
922  // start at one because it counts the current task and its children
923  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
924  }
925 #endif
926 
927  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
928 }
929 
930 // __kmp_free_task_and_ancestors: free the current task and any ancestors that
931 // no longer have children
932 //
933 // gtid: Global thread ID of calling thread
934 // taskdata: task to free
935 // thread: thread data structure of caller
936 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
937  kmp_taskdata_t *taskdata,
938  kmp_info_t *thread) {
939  // Proxy tasks must always be allowed to free their parents
940  // because they can be run in background even in serial mode.
941  kmp_int32 team_serial =
942  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
943  !taskdata->td_flags.proxy;
944  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
945 
946  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
947  KMP_DEBUG_ASSERT(children >= 0);
948 
949  // Now, go up the ancestor tree to see if any ancestors can now be freed.
950  while (children == 0) {
951  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
952 
953  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
954  "and freeing itself\n",
955  gtid, taskdata));
956 
957  // --- Deallocate my ancestor task ---
958  __kmp_free_task(gtid, taskdata, thread);
959 
960  taskdata = parent_taskdata;
961 
962  if (team_serial)
963  return;
964  // Stop checking ancestors at implicit task instead of walking up ancestor
965  // tree to avoid premature deallocation of ancestors.
966  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
967  if (taskdata->td_dephash) { // do we need to cleanup dephash?
968  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
969  kmp_tasking_flags_t flags_old = taskdata->td_flags;
970  if (children == 0 && flags_old.complete == 1) {
971  kmp_tasking_flags_t flags_new = flags_old;
972  flags_new.complete = 0;
973  if (KMP_COMPARE_AND_STORE_ACQ32(
974  RCAST(kmp_int32 *, &taskdata->td_flags),
975  *RCAST(kmp_int32 *, &flags_old),
976  *RCAST(kmp_int32 *, &flags_new))) {
977  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
978  "dephash of implicit task %p\n",
979  gtid, taskdata));
980  // cleanup dephash of finished implicit task
981  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
982  }
983  }
984  }
985  return;
986  }
987  // Predecrement simulated by "- 1" calculation
988  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
989  KMP_DEBUG_ASSERT(children >= 0);
990  }
991 
992  KA_TRACE(
993  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
994  "not freeing it yet\n",
995  gtid, taskdata, children));
996 }
997 
998 // Only need to keep track of child task counts if any of the following:
999 // 1. team parallel and tasking not serialized;
1000 // 2. it is a proxy or detachable or hidden helper task
1001 // 3. the children counter of its parent task is greater than 0.
1002 // The reason for the 3rd case: consider a serialized team that encountered a
1003 // detached or hidden helper task T. The execution of T is still deferred, and
1004 // a regular task may also depend on T. In this case, if we don't track the
1005 // children, task synchronization will be broken.
1006 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
1007  kmp_tasking_flags_t flags = taskdata->td_flags;
1008  bool ret = !(flags.team_serial || flags.tasking_ser);
1009  ret = ret || flags.proxy == TASK_PROXY ||
1010  flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
1011  ret = ret ||
1012  KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
1013 #if OMPX_TASKGRAPH
1014  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
1015  ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
1016 #endif
1017  return ret;
1018 }
1019 
1020 // __kmp_task_finish: bookkeeping to do when a task finishes execution
1021 //
1022 // gtid: global thread ID for calling thread
1023 // task: task to be finished
1024 // resumed_task: task to be resumed. (may be NULL if task is serialized)
1025 //
1026 // template<ompt>: effectively ompt_enabled.enabled!=0
1027 // the version with ompt=false is inlined, allowing the compiler to optimize
1028 // away all OMPT code in this case
1029 template <bool ompt>
1030 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
1031  kmp_taskdata_t *resumed_task) {
1032  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1033  kmp_info_t *thread = __kmp_threads[gtid];
1034  kmp_task_team_t *task_team =
1035  thread->th.th_task_team; // might be NULL for serial teams...
1036 #if OMPX_TASKGRAPH
1037  // cached to avoid accessing taskdata->td_flags after the task is freed (e.g. with a vanilla taskloop)
1038  bool is_taskgraph;
1039 #endif
1040 #if KMP_DEBUG
1041  kmp_int32 children = 0;
1042 #endif
1043  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
1044  "task %p\n",
1045  gtid, taskdata, resumed_task));
1046 
1047  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1048 
1049 #if OMPX_TASKGRAPH
1050  is_taskgraph = taskdata->is_taskgraph;
1051 #endif
1052 
1053 // Pop task from stack if tied
1054 #ifdef BUILD_TIED_TASK_STACK
1055  if (taskdata->td_flags.tiedness == TASK_TIED) {
1056  __kmp_pop_task_stack(gtid, thread, taskdata);
1057  }
1058 #endif /* BUILD_TIED_TASK_STACK */
1059 
1060  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1061  // untied task needs to check the counter so that the task structure is not
1062  // freed prematurely
1063  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1064  KA_TRACE(
1065  20,
1066  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1067  gtid, counter, taskdata));
1068  if (counter > 0) {
1069  // untied task is not done, to be continued possibly by other thread, do
1070  // not free it now
1071  if (resumed_task == NULL) {
1072  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1073  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1074  // task is the parent
1075  }
1076  thread->th.th_current_task = resumed_task; // restore current_task
1077  resumed_task->td_flags.executing = 1; // resume previous task
1078  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1079  "resuming task %p\n",
1080  gtid, taskdata, resumed_task));
1081  return;
1082  }
1083  }
1084 
1085  // bookkeeping for resuming task:
1086  // GEH - note tasking_ser => task_serial
1087  KMP_DEBUG_ASSERT(
1088  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1089  taskdata->td_flags.task_serial);
1090  if (taskdata->td_flags.task_serial) {
1091  if (resumed_task == NULL) {
1092  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1093  // task is the parent
1094  }
1095  } else {
1096  KMP_DEBUG_ASSERT(resumed_task !=
1097  NULL); // verify that resumed task is passed as argument
1098  }
1099 
1100  /* If the task's destructor thunk flag has been set, we need to invoke the
1101  destructor thunk that has been generated by the compiler. The code is
1102  placed here, since at this point other tasks might have been released
1103  hence overlapping the destructor invocations with some other work in the
1104  released tasks. The OpenMP spec is not specific on when the destructors
1105  are invoked, so we should be free to choose. */
1106  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1107  kmp_routine_entry_t destr_thunk = task->data1.destructors;
1108  KMP_ASSERT(destr_thunk);
1109  destr_thunk(gtid, task);
1110  }
1111 
1112  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1113  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1114  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1115 
1116  bool completed = true;
1117  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1118  if (taskdata->td_allow_completion_event.type ==
1119  KMP_EVENT_ALLOW_COMPLETION) {
1120  // event hasn't been fulfilled yet. Try to detach task.
1121  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1122  if (taskdata->td_allow_completion_event.type ==
1123  KMP_EVENT_ALLOW_COMPLETION) {
1124  // task finished execution
1125  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1126  taskdata->td_flags.executing = 0; // suspend the finishing task
1127 
1128 #if OMPT_SUPPORT
1129  // For a detached task that has not yet completed, report ompt_task_detach
1130  // here; the later omp_fulfill_event call signals completion.
1131  // Locking is necessary to avoid a race with ompt_task_late_fulfill.
1132  if (ompt)
1133  __ompt_task_finish(task, resumed_task, ompt_task_detach);
1134 #endif
1135 
1136  // no access to taskdata after this point!
1137  // __kmp_fulfill_event might free taskdata at any time from now
1138 
1139  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1140  completed = false;
1141  }
1142  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1143  }
1144  }
1145 
1146  // Tasks with valid target async handles must be re-enqueued.
1147  if (taskdata->td_target_data.async_handle != NULL) {
1148  // Note: no need to translate gtid to its shadow. If the current thread is a
1149  // hidden helper one, then the gtid is already correct. Otherwise, hidden
1150  // helper threads are disabled, and gtid refers to an OpenMP thread.
1151  __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
1152  if (KMP_HIDDEN_HELPER_THREAD(gtid))
1153  __kmp_hidden_helper_worker_thread_signal();
1154  completed = false;
1155  }
1156 
1157  if (completed) {
1158  taskdata->td_flags.complete = 1; // mark the task as completed
1159 #if OMPX_TASKGRAPH
1160  taskdata->td_flags.onced = 1; // mark the task as ran once already
1161 #endif
1162 
1163 #if OMPT_SUPPORT
1164  // This is not a detached task, we are done here
1165  if (ompt)
1166  __ompt_task_finish(task, resumed_task, ompt_task_complete);
1167 #endif
1168  // TODO: What would be the balance between the conditions in the function
1169  // and an atomic operation?
1170  if (__kmp_track_children_task(taskdata)) {
1171  __kmp_release_deps(gtid, taskdata);
1172  // Predecrement simulated by "- 1" calculation
1173 #if KMP_DEBUG
1174  children = -1 +
1175 #endif
1176  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1177  KMP_DEBUG_ASSERT(children >= 0);
1178 #if OMPX_TASKGRAPH
1179  if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
1180 #else
1181  if (taskdata->td_taskgroup)
1182 #endif
1183  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1184  } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1185  task_team->tt.tt_hidden_helper_task_encountered)) {
1186  // if we found proxy or hidden helper tasks there could exist a dependency
1187  // chain with the proxy task as origin
1188  __kmp_release_deps(gtid, taskdata);
1189  }
1190  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1191  // called. Otherwise, if a task is executed immediately from the
1192  // release_deps code, the flag will be reset to 1 again by this same
1193  // function
1194  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1195  taskdata->td_flags.executing = 0; // suspend the finishing task
1196 
1197  // Decrement the counter of hidden helper tasks to be executed.
1198  if (taskdata->td_flags.hidden_helper) {
1199  // Hidden helper tasks can only be executed by hidden helper threads.
1200  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1201  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1202  }
1203  }
1204 
1205  KA_TRACE(
1206  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1207  gtid, taskdata, children));
1208 
1209  // Free this task and then ancestor tasks if they have no children.
1210  // Restore th_current_task first as suggested by John:
1211  // johnmc: if an asynchronous inquiry peers into the runtime system
1212  // it doesn't see the freed task as the current task.
1213  thread->th.th_current_task = resumed_task;
1214  if (completed)
1215  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1216 
1217  // TODO: GEH - make sure root team implicit task is initialized properly.
1218  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1219  resumed_task->td_flags.executing = 1; // resume previous task
1220 
1221 #if OMPX_TASKGRAPH
1222  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
1223  taskdata->td_taskgroup) {
1224  // TDG: we only release taskgroup barrier here because
1225  // free_task_and_ancestors will call
1226  // __kmp_free_task, which resets all task parameters such as
1227  // taskdata->started, etc. If we release the barrier earlier, these
1228  // parameters could be read before being reset. This is not an issue for
1229  // non-TDG implementation because we never reuse a task(data) structure
1230  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1231  }
1232 #endif
1233 
1234  KA_TRACE(
1235  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1236  gtid, taskdata, resumed_task));
1237 
1238  return;
1239 }
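/* For reference, the TASK_DETACHABLE branch above implements the OpenMP
   detach clause: the task body finishes here, but completion is deferred
   until some thread calls omp_fulfill_event() on the associated event
   handle. A minimal user-side example (OpenMP 5.0 and later):

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_event_handle_t ev;
       #pragma omp parallel
       #pragma omp single
       {
         #pragma omp task detach(ev)
         printf("task body done, completion still pending\n");

         omp_fulfill_event(ev); // allow the detached task to complete
         #pragma omp taskwait   // returns only after body + fulfillment
       }
       return 0;
     }
*/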
1240 
1241 template <bool ompt>
1242 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1243  kmp_int32 gtid,
1244  kmp_task_t *task) {
1245  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1246  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1247  KMP_DEBUG_ASSERT(gtid >= 0);
1248  // this routine will provide task to resume
1249  __kmp_task_finish<ompt>(gtid, task, NULL);
1250 
1251  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1252  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1253 
1254 #if OMPT_SUPPORT
1255  if (ompt) {
1256  ompt_frame_t *ompt_frame;
1257  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1258  ompt_frame->enter_frame = ompt_data_none;
1259  ompt_frame->enter_frame_flags =
1260  ompt_frame_runtime | ompt_frame_framepointer;
1261  }
1262 #endif
1263 
1264  return;
1265 }
1266 
1267 #if OMPT_SUPPORT
1268 OMPT_NOINLINE
1269 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1270  kmp_task_t *task) {
1271  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1272 }
1273 #endif // OMPT_SUPPORT
1274 
1275 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1276 //
1277 // loc_ref: source location information; points to end of task block.
1278 // gtid: global thread number.
1279 // task: task thunk for the completed task.
1280 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1281  kmp_task_t *task) {
1282 #if OMPT_SUPPORT
1283  if (UNLIKELY(ompt_enabled.enabled)) {
1284  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1285  return;
1286  }
1287 #endif
1288  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1289 }
1290 
1291 #ifdef TASK_UNUSED
1292 // __kmpc_omp_task_complete: report that a task has completed execution
1293 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1294 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1295  kmp_task_t *task) {
1296  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1297  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1298 
1299  __kmp_task_finish<false>(gtid, task,
1300  NULL); // Not sure how to find task to resume
1301 
1302  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1303  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1304  return;
1305 }
1306 #endif // TASK_UNUSED
1307 
1308 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1309 // task for a given thread
1310 //
1311 // loc_ref: reference to source location of parallel region
1312 // this_thr: thread data structure corresponding to implicit task
1313 // team: team for this_thr
1314 // tid: thread id of given thread within team
1315 // set_curr_task: TRUE if need to push current task to thread
1316 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1317 // have already been done elsewhere.
1318 // TODO: Get better loc_ref. Value passed in may be NULL
1319 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1320  kmp_team_t *team, int tid, int set_curr_task) {
1321  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1322 
1323  KF_TRACE(
1324  10,
1325  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1326  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1327 
1328  task->td_task_id = KMP_GEN_TASK_ID();
1329  task->td_team = team;
1330  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1331  // in debugger)
1332  task->td_ident = loc_ref;
1333  task->td_taskwait_ident = NULL;
1334  task->td_taskwait_counter = 0;
1335  task->td_taskwait_thread = 0;
1336 
1337  task->td_flags.tiedness = TASK_TIED;
1338  task->td_flags.tasktype = TASK_IMPLICIT;
1339  task->td_flags.proxy = TASK_FULL;
1340 
1341  // All implicit tasks are executed immediately, not deferred
1342  task->td_flags.task_serial = 1;
1343  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1344  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1345 
1346  task->td_flags.started = 1;
1347  task->td_flags.executing = 1;
1348  task->td_flags.complete = 0;
1349  task->td_flags.freed = 0;
1350 #if OMPX_TASKGRAPH
1351  task->td_flags.onced = 0;
1352 #endif
1353 
1354  task->td_depnode = NULL;
1355  task->td_last_tied = task;
1356  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1357 
1358  if (set_curr_task) { // only do this init first time thread is created
1359  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1360  // Not used: don't need to deallocate implicit task
1361  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1362  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1363  task->td_dephash = NULL;
1364  __kmp_push_current_task_to_thread(this_thr, team, tid);
1365  } else {
1366  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1367  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1368  }
1369 
1370 #if OMPT_SUPPORT
1371  if (UNLIKELY(ompt_enabled.enabled))
1372  __ompt_task_init(task, tid);
1373 #endif
1374 
1375  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1376  team, task));
1377 }
1378 
1379 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1380 // at the end of parallel regions. Some resources are kept for reuse in the next
1381 // parallel region.
1382 //
1383 // thread: thread data structure corresponding to implicit task
1384 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1385  kmp_taskdata_t *task = thread->th.th_current_task;
1386  if (task->td_dephash) {
1387  int children;
1388  task->td_flags.complete = 1;
1389 #if OMPX_TASKGRAPH
1390  task->td_flags.onced = 1;
1391 #endif
1392  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1393  kmp_tasking_flags_t flags_old = task->td_flags;
1394  if (children == 0 && flags_old.complete == 1) {
1395  kmp_tasking_flags_t flags_new = flags_old;
1396  flags_new.complete = 0;
1397  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1398  *RCAST(kmp_int32 *, &flags_old),
1399  *RCAST(kmp_int32 *, &flags_new))) {
1400  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1401  "dephash of implicit task %p\n",
1402  thread->th.th_info.ds.ds_gtid, task));
1403  __kmp_dephash_free_entries(thread, task->td_dephash);
1404  }
1405  }
1406  }
1407 }
1408 
1409 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1410 // when the corresponding regions are destroyed
1411 //
1412 // thread: thread data structure corresponding to implicit task
1413 void __kmp_free_implicit_task(kmp_info_t *thread) {
1414  kmp_taskdata_t *task = thread->th.th_current_task;
1415  if (task && task->td_dephash) {
1416  __kmp_dephash_free(thread, task->td_dephash);
1417  task->td_dephash = NULL;
1418  }
1419 }
1420 
1421 // Round a size up to a multiple of val (val must be a power of two): used to
1422 // insert padding between structures co-allocated using a single malloc() call
1423 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1424  if (size & (val - 1)) {
1425  size &= ~(val - 1);
1426  if (size <= KMP_SIZE_T_MAX - val) {
1427  size += val; // Round up if there is no overflow.
1428  }
1429  }
1430  return size;
1431 } // __kmp_round_up_to_val
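// Illustrative arithmetic for the helper above (not executed anywhere; val is
// assumed to be a power of two, as it is at the sizeof(void *) call site below):
//   __kmp_round_up_to_val(57, 8): 57 & 7 == 1 -> (57 & ~7) + 8 == 56 + 8 == 64
//   __kmp_round_up_to_val(56, 8): 56 & 7 == 0 -> returned unchanged (56)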
1432 
1433 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1434 //
1435 // loc_ref: source location information
1436 // gtid: global thread number.
1437 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1438 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1439 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1440 // private vars accessed in task.
1441 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1442 // in task.
1443 // task_entry: Pointer to task code entry point generated by compiler.
1444 // returns: a pointer to the allocated kmp_task_t structure (task).
1445 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1446  kmp_tasking_flags_t *flags,
1447  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1448  kmp_routine_entry_t task_entry) {
1449  kmp_task_t *task;
1450  kmp_taskdata_t *taskdata;
1451  kmp_info_t *thread = __kmp_threads[gtid];
1452  kmp_team_t *team = thread->th.th_team;
1453  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1454  size_t shareds_offset;
1455 
1456  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1457  __kmp_middle_initialize();
1458 
1459  if (flags->hidden_helper) {
1460  if (__kmp_enable_hidden_helper) {
1461  if (!TCR_4(__kmp_init_hidden_helper))
1462  __kmp_hidden_helper_initialize();
1463  } else {
1464  // If the hidden helper task is not enabled, reset the flag to FALSE.
1465  flags->hidden_helper = FALSE;
1466  }
1467  }
1468 
1469  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1470  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1471  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1472  sizeof_shareds, task_entry));
1473 
1474  KMP_DEBUG_ASSERT(parent_task);
1475  if (parent_task->td_flags.final) {
1476  if (flags->merged_if0) {
1477  }
1478  flags->final = 1;
1479  }
1480 
1481  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1482  // Untied task encountered causes the TSC algorithm to check entire deque of
1483  // the victim thread. If no untied task encountered, then checking the head
1484  // of the deque should be enough.
1485  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1486  }
1487 
1488  // Detachable tasks are not proxy tasks yet but could be in the future.
1489  // Doing the tasking setup when that happens is too late, so it is done
1490  // here for detachable tasks as well.
1491  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1492  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1493  if (flags->proxy == TASK_PROXY) {
1494  flags->tiedness = TASK_UNTIED;
1495  flags->merged_if0 = 1;
1496  }
1497  /* are we running in a serialized parallel region or tskm_immediate_exec...
1498  we need tasking support enabled now */
1499  if ((thread->th.th_task_team) == NULL) {
1500  /* This should only happen if the team is serialized
1501  setup a task team and propagate it to the thread */
1502  KMP_DEBUG_ASSERT(team->t.t_serialized);
1503  KA_TRACE(30,
1504  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1505  gtid));
1506  // 1 indicates setup the current team regardless of nthreads
1507  __kmp_task_team_setup(thread, team, 1);
1508  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1509  }
1510  kmp_task_team_t *task_team = thread->th.th_task_team;
1511 
1512  /* tasking must be enabled now as the task might not be pushed */
1513  if (!KMP_TASKING_ENABLED(task_team)) {
1514  KA_TRACE(
1515  30,
1516  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1517  __kmp_enable_tasking(task_team, thread);
1518  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1519  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1520  // No lock needed since only owner can allocate
1521  if (thread_data->td.td_deque == NULL) {
1522  __kmp_alloc_task_deque(thread, thread_data);
1523  }
1524  }
1525 
1526  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1527  task_team->tt.tt_found_proxy_tasks == FALSE)
1528  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1529  if (flags->hidden_helper &&
1530  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1531  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1532  }
1533 
1534  // Calculate shared structure offset including padding after kmp_task_t struct
1535  // to align pointers in shared struct
1536  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1537  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1538 
1539  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1540  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1541  shareds_offset));
1542  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1543  sizeof_shareds));
1544 
1545  // Avoid double allocation here by combining shareds with taskdata
1546 #if USE_FAST_MEMORY
1547  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1548  sizeof_shareds);
1549 #else /* ! USE_FAST_MEMORY */
1550  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1551  sizeof_shareds);
1552 #endif /* USE_FAST_MEMORY */
1553 
1554  task = KMP_TASKDATA_TO_TASK(taskdata);
1555 
1556 // Make sure task & taskdata are aligned appropriately
1557 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1558  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1559  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1560 #else
1561  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1562  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1563 #endif
1564  if (sizeof_shareds > 0) {
1565  // Avoid double allocation here by combining shareds with taskdata
1566  task->shareds = &((char *)taskdata)[shareds_offset];
1567  // Make sure shareds struct is aligned to pointer size
1568  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1569  0);
1570  } else {
1571  task->shareds = NULL;
1572  }
1573  task->routine = task_entry;
1574  task->part_id = 0; // AC: Always start with 0 part id
1575 
1576  taskdata->td_task_id = KMP_GEN_TASK_ID();
1577  taskdata->td_team = thread->th.th_team;
1578  taskdata->td_alloc_thread = thread;
1579  taskdata->td_parent = parent_task;
1580  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1581  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1582  taskdata->td_ident = loc_ref;
1583  taskdata->td_taskwait_ident = NULL;
1584  taskdata->td_taskwait_counter = 0;
1585  taskdata->td_taskwait_thread = 0;
1586  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1587  // avoid copying icvs for proxy tasks
1588  if (flags->proxy == TASK_FULL)
1589  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1590 
1591  taskdata->td_flags = *flags;
1592  taskdata->td_task_team = thread->th.th_task_team;
1593  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1594  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1595  // If it is hidden helper task, we need to set the team and task team
1596  // correspondingly.
1597  if (flags->hidden_helper) {
1598  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1599  taskdata->td_team = shadow_thread->th.th_team;
1600  taskdata->td_task_team = shadow_thread->th.th_task_team;
1601  }
1602 
1603  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1604  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1605 
1606  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1607  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1608 
1609  // GEH - Note we serialize the task if the team is serialized to make sure
1610  // implicit parallel region tasks are not left until program termination to
1611  // execute. Also, it helps locality to execute immediately.
1612 
1613  taskdata->td_flags.task_serial =
1614  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1615  taskdata->td_flags.tasking_ser || flags->merged_if0);
1616 
1617  taskdata->td_flags.started = 0;
1618  taskdata->td_flags.executing = 0;
1619  taskdata->td_flags.complete = 0;
1620  taskdata->td_flags.freed = 0;
1621 #if OMPX_TASKGRAPH
1622  taskdata->td_flags.onced = 0;
1623 #endif
1624  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1625  // start at one because it counts the current task and its children
1626  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1627  taskdata->td_taskgroup =
1628  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1629  taskdata->td_dephash = NULL;
1630  taskdata->td_depnode = NULL;
1631  taskdata->td_target_data.async_handle = NULL;
1632  if (flags->tiedness == TASK_UNTIED)
1633  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1634  else
1635  taskdata->td_last_tied = taskdata;
1636  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1637 #if OMPT_SUPPORT
1638  if (UNLIKELY(ompt_enabled.enabled))
1639  __ompt_task_init(taskdata, gtid);
1640 #endif
1641  // TODO: What would be the balance between the conditions in the function and
1642  // an atomic operation?
1643  if (__kmp_track_children_task(taskdata)) {
1644  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1645  if (parent_task->td_taskgroup)
1646  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1647  // Only need to keep track of allocated child tasks for explicit tasks since
1648  // implicit tasks are never deallocated
1649  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1650  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1651  }
1652  if (flags->hidden_helper) {
1653  taskdata->td_flags.task_serial = FALSE;
1654  // Increment the number of hidden helper tasks to be executed
1655  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1656  }
1657  }
1658 
1659 #if OMPX_TASKGRAPH
1660  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1661  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1662  (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1663  taskdata->is_taskgraph = 1;
1664  taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1665  taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1666  }
1667 #endif
1668  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1669  gtid, taskdata, taskdata->td_parent));
1670 
1671  return task;
1672 }
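// Sketch of the single block allocated above (sizes and padding are platform
// dependent; shareds_offset is the rounded value computed earlier):
//
//   taskdata -> | kmp_taskdata_t | kmp_task_t + privates | pad | shareds |
//               ^                ^                              ^
//               |                task = KMP_TASKDATA_TO_TASK(taskdata)
//               |                task->shareds == (char *)taskdata + shareds_offset
//               +-- total size recorded in td_size_alloc == shareds_offset + sizeof_shareds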
1673 
1674 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1675  kmp_int32 flags, size_t sizeof_kmp_task_t,
1676  size_t sizeof_shareds,
1677  kmp_routine_entry_t task_entry) {
1678  kmp_task_t *retval;
1679  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1680  __kmp_assert_valid_gtid(gtid);
1681  input_flags->native = FALSE;
1682  // __kmp_task_alloc() sets up all other runtime flags
1683  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1684  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1685  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1686  input_flags->proxy ? "proxy" : "",
1687  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1688  sizeof_shareds, task_entry));
1689 
1690  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1691  sizeof_shareds, task_entry);
1692 
1693  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1694 
1695  return retval;
1696 }
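// Hedged example of a direct use of this entry point: for a plain tied task
// the compiler passes flags with only bit 0 (tiedness) set, the size of the
// kmp_task_t header plus any firstprivate storage, the size of the shareds
// block, and the outlined entry routine (task_entry and sz_privates are
// hypothetical names):
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /* flags */ 1,
//                                         sizeof(kmp_task_t) + sz_privates,
//                                         /* sizeof_shareds */ sizeof(void *),
//                                         &task_entry);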
1697 
1698 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1699  kmp_int32 flags,
1700  size_t sizeof_kmp_task_t,
1701  size_t sizeof_shareds,
1702  kmp_routine_entry_t task_entry,
1703  kmp_int64 device_id) {
1704  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1705  // a target task is untied, as defined in the specification
1706  input_flags.tiedness = TASK_UNTIED;
1707 
1708  if (__kmp_enable_hidden_helper)
1709  input_flags.hidden_helper = TRUE;
1710 
1711  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1712  sizeof_shareds, task_entry);
1713 }
1714 
1726 // __kmpc_omp_reg_task_with_affinity: register the affinity information
1727 // attached to a task with the task thunk structure (currently a stub that
1728 // reports success).
1728 kmp_int32
1729 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1730  kmp_task_t *new_task, kmp_int32 naffins,
1731  kmp_task_affinity_info_t *affin_list) {
1732  return 0;
1733 }
1734 
1735 // __kmp_invoke_task: invoke the specified task
1736 //
1737 // gtid: global thread ID of caller
1738 // task: the task to invoke
1739 // current_task: the task to resume after task invocation
1740 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1741  kmp_taskdata_t *current_task) {
1742  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1743  kmp_info_t *thread;
1744  int discard = 0 /* false */;
1745  KA_TRACE(
1746  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1747  gtid, taskdata, current_task));
1748  KMP_DEBUG_ASSERT(task);
1749  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1750  taskdata->td_flags.complete == 1)) {
1751  // This is a proxy task that was already completed but it needs to run
1752  // its bottom-half finish
1753  KA_TRACE(
1754  30,
1755  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1756  gtid, taskdata));
1757 
1758  __kmp_bottom_half_finish_proxy(gtid, task);
1759 
1760  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1761  "proxy task %p, resuming task %p\n",
1762  gtid, taskdata, current_task));
1763 
1764  return;
1765  }
1766 
1767 #if OMPT_SUPPORT
1768  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1769  // does not execute code.
1770  ompt_thread_info_t oldInfo;
1771  if (UNLIKELY(ompt_enabled.enabled)) {
1772  // Store the threads states and restore them after the task
1773  thread = __kmp_threads[gtid];
1774  oldInfo = thread->th.ompt_thread_info;
1775  thread->th.ompt_thread_info.wait_id = 0;
1776  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1777  ? ompt_state_work_serial
1778  : ompt_state_work_parallel;
1779  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1780  }
1781 #endif
1782 
1783  // Proxy tasks are not handled by the runtime
1784  if (taskdata->td_flags.proxy != TASK_PROXY) {
1785  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1786  }
1787 
1788  // TODO: cancel tasks if the parallel region has also been cancelled
1789  // TODO: check if this sequence can be hoisted above __kmp_task_start
1790  // if cancellation has been enabled for this run ...
1791  if (UNLIKELY(__kmp_omp_cancellation)) {
1792  thread = __kmp_threads[gtid];
1793  kmp_team_t *this_team = thread->th.th_team;
1794  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1795  if ((taskgroup && taskgroup->cancel_request) ||
1796  (this_team->t.t_cancel_request == cancel_parallel)) {
1797 #if OMPT_SUPPORT && OMPT_OPTIONAL
1798  ompt_data_t *task_data;
1799  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1800  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1801  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1802  task_data,
1803  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1804  : ompt_cancel_parallel) |
1805  ompt_cancel_discarded_task,
1806  NULL);
1807  }
1808 #endif
1809  KMP_COUNT_BLOCK(TASK_cancelled);
1810  // this task belongs to a task group and we need to cancel it
1811  discard = 1 /* true */;
1812  }
1813  }
1814 
1815  // Invoke the task routine and pass in relevant data.
1816  // Thunks generated by gcc take a different argument list.
1817  if (!discard) {
1818  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1819  taskdata->td_last_tied = current_task->td_last_tied;
1820  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1821  }
1822 #if KMP_STATS_ENABLED
1823  KMP_COUNT_BLOCK(TASK_executed);
1824  switch (KMP_GET_THREAD_STATE()) {
1825  case FORK_JOIN_BARRIER:
1826  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1827  break;
1828  case PLAIN_BARRIER:
1829  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1830  break;
1831  case TASKYIELD:
1832  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1833  break;
1834  case TASKWAIT:
1835  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1836  break;
1837  case TASKGROUP:
1838  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1839  break;
1840  default:
1841  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1842  break;
1843  }
1844 #endif // KMP_STATS_ENABLED
1845 
1846 // OMPT task begin
1847 #if OMPT_SUPPORT
1848  if (UNLIKELY(ompt_enabled.enabled))
1849  __ompt_task_start(task, current_task, gtid);
1850 #endif
1851 #if OMPT_SUPPORT && OMPT_OPTIONAL
1852  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1853  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1854  ompt_data_t instance = ompt_data_none;
1855  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1856  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1857  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1858  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1859  ompt_dispatch_taskloop_chunk, instance);
1860  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1861  }
1862 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1863 
1864 #if OMPD_SUPPORT
1865  if (ompd_state & OMPD_ENABLE_BP)
1866  ompd_bp_task_begin();
1867 #endif
1868 
1869 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1870  kmp_uint64 cur_time;
1871  kmp_int32 kmp_itt_count_task =
1872  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1873  current_task->td_flags.tasktype == TASK_IMPLICIT;
1874  if (kmp_itt_count_task) {
1875  thread = __kmp_threads[gtid];
1876  // Time outer level explicit task on barrier for adjusting imbalance time
1877  if (thread->th.th_bar_arrive_time)
1878  cur_time = __itt_get_timestamp();
1879  else
1880  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1881  }
1882  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1883 #endif
1884 
1885 #if ENABLE_LIBOMPTARGET
1886  if (taskdata->td_target_data.async_handle != NULL) {
1887  // If we have a valid target async handle, that means that we have already
1888  // executed the task routine once. We must query for the handle completion
1889  // instead of re-executing the routine.
1890  KMP_ASSERT(tgt_target_nowait_query);
1891  tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1892  } else
1893 #endif
1894  if (task->routine != NULL) {
1895 #ifdef KMP_GOMP_COMPAT
1896  if (taskdata->td_flags.native) {
1897  ((void (*)(void *))(*(task->routine)))(task->shareds);
1898  } else
1899 #endif /* KMP_GOMP_COMPAT */
1900  {
1901  (*(task->routine))(gtid, task);
1902  }
1903  }
1904  KMP_POP_PARTITIONED_TIMER();
1905 
1906 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1907  if (kmp_itt_count_task) {
1908  // Barrier imbalance - adjust arrive time with the task duration
1909  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1910  }
1911  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1912  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1913 #endif
1914  }
1915 
1916 #if OMPD_SUPPORT
1917  if (ompd_state & OMPD_ENABLE_BP)
1918  ompd_bp_task_end();
1919 #endif
1920 
1921  // Proxy tasks are not handled by the runtime
1922  if (taskdata->td_flags.proxy != TASK_PROXY) {
1923 #if OMPT_SUPPORT
1924  if (UNLIKELY(ompt_enabled.enabled)) {
1925  thread->th.ompt_thread_info = oldInfo;
1926  if (taskdata->td_flags.tiedness == TASK_TIED) {
1927  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1928  }
1929  __kmp_task_finish<true>(gtid, task, current_task);
1930  } else
1931 #endif
1932  __kmp_task_finish<false>(gtid, task, current_task);
1933  }
1934 
1935  KA_TRACE(
1936  30,
1937  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1938  gtid, taskdata, current_task));
1939  return;
1940 }
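// The call (*(task->routine))(gtid, task) above enters the compiler-generated
// task entry. A minimal sketch of such an entry (my_shareds_t and task_entry
// are hypothetical names) is:
//
//   kmp_int32 task_entry(kmp_int32 gtid, kmp_task_t *task) {
//     my_shareds_t *sh = (my_shareds_t *)task->shareds; // pointers to shared vars
//     /* ...task body, using sh and the privates stored after kmp_task_t... */
//     return 0;
//   }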
1941 
1942 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1943 //
1944 // loc_ref: location of original task pragma (ignored)
1945 // gtid: Global Thread ID of encountering thread
1946 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1947 // Returns:
1948 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1949 // be resumed later.
1950 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1951 // resumed later.
1952 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1953  kmp_task_t *new_task) {
1954  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1955 
1956  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1957  loc_ref, new_taskdata));
1958 
1959 #if OMPT_SUPPORT
1960  kmp_taskdata_t *parent;
1961  if (UNLIKELY(ompt_enabled.enabled)) {
1962  parent = new_taskdata->td_parent;
1963  if (ompt_enabled.ompt_callback_task_create) {
1964  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1965  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1966  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1967  OMPT_GET_RETURN_ADDRESS(0));
1968  }
1969  }
1970 #endif
1971 
1972  /* Should we execute the new task or queue it? For now, let's just always try
1973  to queue it. If the queue fills up, then we'll execute it. */
1974 
1975  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1976  { // Execute this task immediately
1977  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1978  new_taskdata->td_flags.task_serial = 1;
1979  __kmp_invoke_task(gtid, new_task, current_task);
1980  }
1981 
1982  KA_TRACE(
1983  10,
1984  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1985  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1986  gtid, loc_ref, new_taskdata));
1987 
1988 #if OMPT_SUPPORT
1989  if (UNLIKELY(ompt_enabled.enabled)) {
1990  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1991  }
1992 #endif
1993  return TASK_CURRENT_NOT_QUEUED;
1994 }
1995 
1996 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1997 //
1998 // gtid: Global Thread ID of encountering thread
1999 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
2000 // serialize_immediate: if TRUE then if the task is executed immediately its
2001 // execution will be serialized
2002 // Returns:
2003 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2004 // be resumed later.
2005 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2006 // resumed later.
2007 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
2008  bool serialize_immediate) {
2009  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2010 
2011 #if OMPX_TASKGRAPH
2012  if (new_taskdata->is_taskgraph &&
2013  __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
2014  kmp_tdg_info_t *tdg = new_taskdata->tdg;
2015  // extend the record_map if needed
2016  if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) {
2017  __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
2018  // map_size could have been updated by another thread if recursive
2019  // taskloop
2020  if (new_taskdata->td_task_id >= tdg->map_size) {
2021  kmp_uint old_size = tdg->map_size;
2022  kmp_uint new_size = old_size * 2;
2023  kmp_node_info_t *old_record = tdg->record_map;
2024  kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
2025  new_size * sizeof(kmp_node_info_t));
2026 
2027  KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
2028  tdg->record_map = new_record;
2029 
2030  __kmp_free(old_record);
2031 
2032  for (kmp_int i = old_size; i < new_size; i++) {
2033  kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
2034  __kmp_successors_size * sizeof(kmp_int32));
2035  new_record[i].task = nullptr;
2036  new_record[i].successors = successorsList;
2037  new_record[i].nsuccessors = 0;
2038  new_record[i].npredecessors = 0;
2039  new_record[i].successors_size = __kmp_successors_size;
2040  KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
2041  }
2042  // update the size at the end, so that other threads do not keep using
2043  // old_record once map_size has already been updated
2044  tdg->map_size = new_size;
2045  }
2046  __kmp_release_bootstrap_lock(&tdg->graph_lock);
2047  }
2048  // record a task
2049  if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) {
2050  tdg->record_map[new_taskdata->td_task_id].task = new_task;
2051  tdg->record_map[new_taskdata->td_task_id].parent_task =
2052  new_taskdata->td_parent;
2053  KMP_ATOMIC_INC(&tdg->num_tasks);
2054  }
2055  }
2056 #endif
2057 
2058  /* Should we execute the new task or queue it? For now, let's just always try
2059  to queue it. If the queue fills up, then we'll execute it. */
2060  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
2061  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
2062  { // Execute this task immediately
2063  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2064  if (serialize_immediate)
2065  new_taskdata->td_flags.task_serial = 1;
2066  __kmp_invoke_task(gtid, new_task, current_task);
2067  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2068  __kmp_wpolicy_passive) {
2069  kmp_info_t *this_thr = __kmp_threads[gtid];
2070  kmp_team_t *team = this_thr->th.th_team;
2071  kmp_int32 nthreads = this_thr->th.th_team_nproc;
2072  for (int i = 0; i < nthreads; ++i) {
2073  kmp_info_t *thread = team->t.t_threads[i];
2074  if (thread == this_thr)
2075  continue;
2076  if (thread->th.th_sleep_loc != NULL) {
2077  __kmp_null_resume_wrapper(thread);
2078  break; // awake one thread at a time
2079  }
2080  }
2081  }
2082  return TASK_CURRENT_NOT_QUEUED;
2083 }
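// The wake-up branch above only matters when worker threads are allowed to go
// to sleep between tasks; as a hedged example, running with something like
//
//   OMP_WAIT_POLICY=passive KMP_BLOCKTIME=0 ./app
//
// is the kind of configuration under which a freshly queued task resumes (at
// most) one sleeping teammate instead of waiting for the next barrier.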
2084 
2085 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
2086 // non-thread-switchable task from the parent thread only!
2087 //
2088 // loc_ref: location of original task pragma (ignored)
2089 // gtid: Global Thread ID of encountering thread
2090 // new_task: non-thread-switchable task thunk allocated by
2091 // __kmp_omp_task_alloc()
2092 // Returns:
2093 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2094 // be resumed later.
2095 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2096 // resumed later.
2097 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
2098  kmp_task_t *new_task) {
2099  kmp_int32 res;
2100  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2101 
2102 #if KMP_DEBUG || OMPT_SUPPORT
2103  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2104 #endif
2105  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2106  new_taskdata));
2107  __kmp_assert_valid_gtid(gtid);
2108 
2109 #if OMPT_SUPPORT
2110  kmp_taskdata_t *parent = NULL;
2111  if (UNLIKELY(ompt_enabled.enabled)) {
2112  if (!new_taskdata->td_flags.started) {
2113  OMPT_STORE_RETURN_ADDRESS(gtid);
2114  parent = new_taskdata->td_parent;
2115  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
2116  parent->ompt_task_info.frame.enter_frame.ptr =
2117  OMPT_GET_FRAME_ADDRESS(0);
2118  }
2119  if (ompt_enabled.ompt_callback_task_create) {
2120  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2121  &(parent->ompt_task_info.task_data),
2122  &(parent->ompt_task_info.frame),
2123  &(new_taskdata->ompt_task_info.task_data),
2124  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2125  OMPT_LOAD_RETURN_ADDRESS(gtid));
2126  }
2127  } else {
2128  // We are scheduling the continuation of an UNTIED task.
2129  // Scheduling back to the parent task.
2130  __ompt_task_finish(new_task,
2131  new_taskdata->ompt_task_info.scheduling_parent,
2132  ompt_task_switch);
2133  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
2134  }
2135  }
2136 #endif
2137 
2138  res = __kmp_omp_task(gtid, new_task, true);
2139 
2140  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2141  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2142  gtid, loc_ref, new_taskdata));
2143 #if OMPT_SUPPORT
2144  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2145  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2146  }
2147 #endif
2148  return res;
2149 }
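// Putting the allocation and submission entry points together, a minimal
// sketch of the sequence a compiler emits for "#pragma omp task" is:
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, 1, sz_task, sz_shareds,
//                                         &task_entry); // see example above
//   /* store firstprivate copies and shareds pointers into *t here */
//   __kmpc_omp_task(&loc, gtid, t); // queued, or run immediately if it cannot be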
2150 
2151 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
2152 // a taskloop task with the correct OMPT return address
2153 //
2154 // loc_ref: location of original task pragma (ignored)
2155 // gtid: Global Thread ID of encountering thread
2156 // new_task: non-thread-switchable task thunk allocated by
2157 // __kmp_omp_task_alloc()
2158 // codeptr_ra: return address for OMPT callback
2159 // Returns:
2160 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2161 // be resumed later.
2162 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2163 // resumed later.
2164 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2165  kmp_task_t *new_task, void *codeptr_ra) {
2166  kmp_int32 res;
2167  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2168 
2169 #if KMP_DEBUG || OMPT_SUPPORT
2170  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2171 #endif
2172  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2173  new_taskdata));
2174 
2175 #if OMPT_SUPPORT
2176  kmp_taskdata_t *parent = NULL;
2177  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2178  parent = new_taskdata->td_parent;
2179  if (!parent->ompt_task_info.frame.enter_frame.ptr)
2180  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2181  if (ompt_enabled.ompt_callback_task_create) {
2182  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2183  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2184  &(new_taskdata->ompt_task_info.task_data),
2185  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2186  codeptr_ra);
2187  }
2188  }
2189 #endif
2190 
2191  res = __kmp_omp_task(gtid, new_task, true);
2192 
2193  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
2194  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2195  gtid, loc_ref, new_taskdata));
2196 #if OMPT_SUPPORT
2197  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2198  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2199  }
2200 #endif
2201  return res;
2202 }
2203 
2204 template <bool ompt>
2205 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2206  void *frame_address,
2207  void *return_address) {
2208  kmp_taskdata_t *taskdata = nullptr;
2209  kmp_info_t *thread;
2210  int thread_finished = FALSE;
2211  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2212 
2213  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2214  KMP_DEBUG_ASSERT(gtid >= 0);
2215 
2216  if (__kmp_tasking_mode != tskm_immediate_exec) {
2217  thread = __kmp_threads[gtid];
2218  taskdata = thread->th.th_current_task;
2219 
2220 #if OMPT_SUPPORT && OMPT_OPTIONAL
2221  ompt_data_t *my_task_data;
2222  ompt_data_t *my_parallel_data;
2223 
2224  if (ompt) {
2225  my_task_data = &(taskdata->ompt_task_info.task_data);
2226  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2227 
2228  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2229 
2230  if (ompt_enabled.ompt_callback_sync_region) {
2231  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2232  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2233  my_task_data, return_address);
2234  }
2235 
2236  if (ompt_enabled.ompt_callback_sync_region_wait) {
2237  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2238  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2239  my_task_data, return_address);
2240  }
2241  }
2242 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2243 
2244 // Debugger: The taskwait is active. Store the location and the thread that
2245 // encountered the taskwait.
2246 #if USE_ITT_BUILD
2247 // Note: These values are used by ITT events as well.
2248 #endif /* USE_ITT_BUILD */
2249  taskdata->td_taskwait_counter += 1;
2250  taskdata->td_taskwait_ident = loc_ref;
2251  taskdata->td_taskwait_thread = gtid + 1;
2252 
2253 #if USE_ITT_BUILD
2254  void *itt_sync_obj = NULL;
2255 #if USE_ITT_NOTIFY
2256  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2257 #endif /* USE_ITT_NOTIFY */
2258 #endif /* USE_ITT_BUILD */
2259 
2260  bool must_wait =
2261  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2262 
2263  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2264  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2265  // If hidden helper thread is encountered, we must enable wait here.
2266  must_wait =
2267  must_wait ||
2268  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2269  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2270 
2271  if (must_wait) {
2272  kmp_flag_32<false, false> flag(
2273  RCAST(std::atomic<kmp_uint32> *,
2274  &(taskdata->td_incomplete_child_tasks)),
2275  0U);
2276  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2277  flag.execute_tasks(thread, gtid, FALSE,
2278  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2279  __kmp_task_stealing_constraint);
2280  }
2281  }
2282 #if USE_ITT_BUILD
2283  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2284  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2285 #endif /* USE_ITT_BUILD */
2286 
2287  // Debugger: The taskwait is completed. Location remains, but thread is
2288  // negated.
2289  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2290 
2291 #if OMPT_SUPPORT && OMPT_OPTIONAL
2292  if (ompt) {
2293  if (ompt_enabled.ompt_callback_sync_region_wait) {
2294  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2295  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2296  my_task_data, return_address);
2297  }
2298  if (ompt_enabled.ompt_callback_sync_region) {
2299  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2300  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2301  my_task_data, return_address);
2302  }
2303  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2304  }
2305 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2306  }
2307 
2308  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2309  "returning TASK_CURRENT_NOT_QUEUED\n",
2310  gtid, taskdata));
2311 
2312  return TASK_CURRENT_NOT_QUEUED;
2313 }
2314 
2315 #if OMPT_SUPPORT && OMPT_OPTIONAL
2316 OMPT_NOINLINE
2317 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2318  void *frame_address,
2319  void *return_address) {
2320  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2321  return_address);
2322 }
2323 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2324 
2325 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2326 // complete
2327 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2328 #if OMPT_SUPPORT && OMPT_OPTIONAL
2329  if (UNLIKELY(ompt_enabled.enabled)) {
2330  OMPT_STORE_RETURN_ADDRESS(gtid);
2331  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2332  OMPT_LOAD_RETURN_ADDRESS(gtid));
2333  }
2334 #endif
2335  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2336 }
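// A bare "#pragma omp taskwait" lowers (sketch) to a single call such as
//
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
//
// which executes other tasks until td_incomplete_child_tasks of the current
// task drops to zero, then returns TASK_CURRENT_NOT_QUEUED.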
2337 
2338 // __kmpc_omp_taskyield: switch to a different task
2339 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2340  kmp_taskdata_t *taskdata = NULL;
2341  kmp_info_t *thread;
2342  int thread_finished = FALSE;
2343 
2344  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2345  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2346 
2347  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2348  gtid, loc_ref, end_part));
2349  __kmp_assert_valid_gtid(gtid);
2350 
2351  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2352  thread = __kmp_threads[gtid];
2353  taskdata = thread->th.th_current_task;
2354 // Should we model this as a task wait or not?
2355 // Debugger: The taskwait is active. Store the location and the thread that
2356 // encountered the taskwait.
2357 #if USE_ITT_BUILD
2358 // Note: These values are used by ITT events as well.
2359 #endif /* USE_ITT_BUILD */
2360  taskdata->td_taskwait_counter += 1;
2361  taskdata->td_taskwait_ident = loc_ref;
2362  taskdata->td_taskwait_thread = gtid + 1;
2363 
2364 #if USE_ITT_BUILD
2365  void *itt_sync_obj = NULL;
2366 #if USE_ITT_NOTIFY
2367  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2368 #endif /* USE_ITT_NOTIFY */
2369 #endif /* USE_ITT_BUILD */
2370  if (!taskdata->td_flags.team_serial) {
2371  kmp_task_team_t *task_team = thread->th.th_task_team;
2372  if (task_team != NULL) {
2373  if (KMP_TASKING_ENABLED(task_team)) {
2374 #if OMPT_SUPPORT
2375  if (UNLIKELY(ompt_enabled.enabled))
2376  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2377 #endif
2378  __kmp_execute_tasks_32(
2379  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2380  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2381  __kmp_task_stealing_constraint);
2382 #if OMPT_SUPPORT
2383  if (UNLIKELY(ompt_enabled.enabled))
2384  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2385 #endif
2386  }
2387  }
2388  }
2389 #if USE_ITT_BUILD
2390  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2391 #endif /* USE_ITT_BUILD */
2392 
2393  // Debugger: The taskwait is completed. Location remains, but thread is
2394  // negated.
2395  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2396  }
2397 
2398  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2399  "returning TASK_CURRENT_NOT_QUEUED\n",
2400  gtid, taskdata));
2401 
2402  return TASK_CURRENT_NOT_QUEUED;
2403 }
2404 
2405 // Task Reduction implementation
2406 //
2407 // Note: the initial implementation did not take into account the possibility
2408 // of specifying omp_orig for the initializer of a UDR (user-defined reduction).
2409 // The corrected implementation takes the omp_orig object into account.
2410 // The compiler is free to use the old implementation if omp_orig is not specified.
2411 
2420 typedef struct kmp_taskred_flags {
2421  // 1 - use lazy allocation/initialization (e.g. big objects, #tasks < #threads)
2422  unsigned lazy_priv : 1;
2423  unsigned reserved31 : 31;
2424 } kmp_taskred_flags_t;
2425 
2427 // Internal struct for reduction data item related info set up by compiler
2428 // (old interface, without omp_orig).
2429 typedef struct kmp_task_red_input {
2430  void *reduce_shar; // shared between tasks item to reduce into
2431  size_t reduce_size; // size of data item in bytes
2432  // three compiler-generated routines (init, fini are optional):
2433  void *reduce_init; // data initialization routine (single parameter)
2434  void *reduce_fini; // data finalization routine
2435  void *reduce_comb; // data combiner routine
2436  kmp_taskred_flags_t flags; // flags for additional info from compiler
2437 } kmp_task_red_input_t;
2438 
2440 // Internal struct for reduction data item related info saved by the library.
2442 typedef struct kmp_taskred_data {
2443  void *reduce_shar; // shared between tasks item to reduce into
2444  size_t reduce_size; // size of data item
2445  kmp_taskred_flags_t flags; // flags for additional info from compiler
2446  void *reduce_priv; // array of thread-specific items
2447  void *reduce_pend; // end of private data for faster comparison op
2448  // three compiler-generated routines (init, fini are optional):
2449  void *reduce_comb; // data combiner routine
2450  void *reduce_init; // data initialization routine (two parameters)
2451  void *reduce_fini; // data finalization routine
2452  void *reduce_orig; // original item (can be used in UDR initializer)
2453 } kmp_taskred_data_t;
2454 
2458 // Internal struct for reduction data item related info set up by compiler
2459 // (new interface: reduce_orig added to support omp_orig in UDR initializers).
2460 typedef struct kmp_taskred_input {
2461  void *reduce_shar; // shared between tasks item to reduce into
2462  void *reduce_orig; // original reduction item used for initialization
2463  size_t reduce_size; // size of data item
2464  // three compiler-generated routines (init, fini are optional):
2465  void *reduce_init; // data initialization routine (two parameters)
2466  void *reduce_fini; // data finalization routine
2467  void *reduce_comb; // data combiner routine
2468  kmp_taskred_flags_t flags; // flags for additional info from compiler
2469 } kmp_taskred_input_t;
2474 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2475 template <>
2476 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2477  kmp_task_red_input_t &src) {
2478  item.reduce_orig = NULL;
2479 }
2480 template <>
2481 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2482  kmp_taskred_input_t &src) {
2483  if (src.reduce_orig != NULL) {
2484  item.reduce_orig = src.reduce_orig;
2485  } else {
2486  item.reduce_orig = src.reduce_shar;
2487  } // non-NULL reduce_orig means new interface used
2488 }
2489 
2490 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2491 template <>
2492 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2493  size_t offset) {
2494  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2495 }
2496 template <>
2497 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2498  size_t offset) {
2499  ((void (*)(void *, void *))item.reduce_init)(
2500  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2501 }
2502 
2503 template <typename T>
2504 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2505  __kmp_assert_valid_gtid(gtid);
2506  kmp_info_t *thread = __kmp_threads[gtid];
2507  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2508  kmp_uint32 nth = thread->th.th_team_nproc;
2509  kmp_taskred_data_t *arr;
2510 
2511  // check input data just in case
2512  KMP_ASSERT(tg != NULL);
2513  KMP_ASSERT(data != NULL);
2514  KMP_ASSERT(num > 0);
2515  if (nth == 1) {
2516  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2517  gtid, tg));
2518  return (void *)tg;
2519  }
2520  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2521  gtid, tg, num));
2522  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2523  thread, num * sizeof(kmp_taskred_data_t));
2524  for (int i = 0; i < num; ++i) {
2525  size_t size = data[i].reduce_size - 1;
2526  // round the size up to cache line per thread-specific item
2527  size += CACHE_LINE - size % CACHE_LINE;
2528  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2529  arr[i].reduce_shar = data[i].reduce_shar;
2530  arr[i].reduce_size = size;
2531  arr[i].flags = data[i].flags;
2532  arr[i].reduce_comb = data[i].reduce_comb;
2533  arr[i].reduce_init = data[i].reduce_init;
2534  arr[i].reduce_fini = data[i].reduce_fini;
2535  __kmp_assign_orig<T>(arr[i], data[i]);
2536  if (!arr[i].flags.lazy_priv) {
2537  // allocate cache-line aligned block and fill it with zeros
2538  arr[i].reduce_priv = __kmp_allocate(nth * size);
2539  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2540  if (arr[i].reduce_init != NULL) {
2541  // initialize all thread-specific items
2542  for (size_t j = 0; j < nth; ++j) {
2543  __kmp_call_init<T>(arr[i], j * size);
2544  }
2545  }
2546  } else {
2547  // only allocate space for pointers now,
2548  // objects will be lazily allocated/initialized if/when requested
2549  // note that __kmp_allocate zeroes the allocated memory
2550  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2551  }
2552  }
2553  tg->reduce_data = (void *)arr;
2554  tg->reduce_num_data = num;
2555  return (void *)tg;
2556 }
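// Worked example of the sizing above: for an int item (reduce_size == 4) and
// CACHE_LINE == 64, size becomes 3 and then 3 + (64 - 3 % 64) == 64, so the
// eager (non-lazy) path allocates nth * 64 bytes and thread tid later resolves
// its private copy as (char *)reduce_priv + tid * 64 (see
// __kmpc_task_reduction_get_th_data below).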
2557 
2572 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2573 #if OMPX_TASKGRAPH
2574  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2575  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2576  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2577  this_tdg->rec_taskred_data =
2578  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2579  this_tdg->rec_num_taskred = num;
2580  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2581  sizeof(kmp_task_red_input_t) * num);
2582  }
2583 #endif
2584  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2585 }
2586 
2599 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2600 #if OMPX_TASKGRAPH
2601  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2602  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2603  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2604  this_tdg->rec_taskred_data =
2605  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2606  this_tdg->rec_num_taskred = num;
2607  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2608  sizeof(kmp_task_red_input_t) * num);
2609  }
2610 #endif
2611  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2612 }
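// Hedged sketch of what a compiler places in one kmp_taskred_input_t element
// for "#pragma omp taskgroup task_reduction(+ : x)" with int x (red_init and
// red_comb are hypothetical routine names):
//
//   kmp_taskred_input_t in;
//   in.reduce_shar = &x;          // original list item
//   in.reduce_orig = &x;          // omp_orig for UDR initializers
//   in.reduce_size = sizeof(int);
//   in.reduce_init = (void *)&red_init; // red_init(priv, orig): *(int *)priv = 0
//   in.reduce_fini = NULL;              // no finalizer needed for int
//   in.reduce_comb = (void *)&red_comb; // red_comb(shar, priv): *(int *)shar += *(int *)priv
//   in.flags.lazy_priv = 0;             // eager per-thread allocation
//   void *tg = __kmpc_taskred_init(gtid, 1, &in);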
2613 
2614 // Copy task reduction data (except for shared pointers).
2615 template <typename T>
2616 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2617  kmp_taskgroup_t *tg, void *reduce_data) {
2618  kmp_taskred_data_t *arr;
2619  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2620  " from data %p\n",
2621  thr, tg, reduce_data));
2622  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2623  thr, num * sizeof(kmp_taskred_data_t));
2624  // threads will share private copies, thunk routines, sizes, flags, etc.:
2625  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2626  for (int i = 0; i < num; ++i) {
2627  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2628  }
2629  tg->reduce_data = (void *)arr;
2630  tg->reduce_num_data = num;
2631 }
2632 
2642 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2643  __kmp_assert_valid_gtid(gtid);
2644  kmp_info_t *thread = __kmp_threads[gtid];
2645  kmp_int32 nth = thread->th.th_team_nproc;
2646  if (nth == 1)
2647  return data; // nothing to do
2648 
2649  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2650  if (tg == NULL)
2651  tg = thread->th.th_current_task->td_taskgroup;
2652  KMP_ASSERT(tg != NULL);
2653  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2654  kmp_int32 num = tg->reduce_num_data;
2655  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2656 
2657 #if OMPX_TASKGRAPH
2658  if ((thread->th.th_current_task->is_taskgraph) &&
2659  (!__kmp_tdg_is_recording(
2660  __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2661  tg = thread->th.th_current_task->td_taskgroup;
2662  KMP_ASSERT(tg != NULL);
2663  KMP_ASSERT(tg->reduce_data != NULL);
2664  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2665  num = tg->reduce_num_data;
2666  }
2667 #endif
2668 
2669  KMP_ASSERT(data != NULL);
2670  while (tg != NULL) {
2671  for (int i = 0; i < num; ++i) {
2672  if (!arr[i].flags.lazy_priv) {
2673  if (data == arr[i].reduce_shar ||
2674  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2675  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2676  } else {
2677  // check shared location first
2678  void **p_priv = (void **)(arr[i].reduce_priv);
2679  if (data == arr[i].reduce_shar)
2680  goto found;
2681  // check whether a thread-specific location was passed as the parameter
2682  for (int j = 0; j < nth; ++j)
2683  if (data == p_priv[j])
2684  goto found;
2685  continue; // not found, continue search
2686  found:
2687  if (p_priv[tid] == NULL) {
2688  // allocate thread specific object lazily
2689  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2690  if (arr[i].reduce_init != NULL) {
2691  if (arr[i].reduce_orig != NULL) { // new interface
2692  ((void (*)(void *, void *))arr[i].reduce_init)(
2693  p_priv[tid], arr[i].reduce_orig);
2694  } else { // old interface (single parameter)
2695  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2696  }
2697  }
2698  }
2699  return p_priv[tid];
2700  }
2701  }
2702  tg = tg->parent;
2703  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2704  num = tg->reduce_num_data;
2705  }
2706  KMP_ASSERT2(0, "Unknown task reduction item");
2707  return NULL; // ERROR, this line never executed
2708 }
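// Inside a participating task body, compiler-generated code fetches the
// thread-local copy before updating it; a minimal sketch (x is the shared
// list item from the reduction clause) is:
//
//   int *p = (int *)__kmpc_task_reduction_get_th_data(
//       __kmpc_global_thread_num(NULL), /* tskgrp */ NULL, &x);
//   *p += partial_result; // combined into x when the taskgroup finishes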
2709 
2710 // Finalize task reduction.
2711 // Called from __kmpc_end_taskgroup()
2712 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2713  kmp_int32 nth = th->th.th_team_nproc;
2714  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2715  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2716  kmp_int32 num = tg->reduce_num_data;
2717  for (int i = 0; i < num; ++i) {
2718  void *sh_data = arr[i].reduce_shar;
2719  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2720  void (*f_comb)(void *, void *) =
2721  (void (*)(void *, void *))(arr[i].reduce_comb);
2722  if (!arr[i].flags.lazy_priv) {
2723  void *pr_data = arr[i].reduce_priv;
2724  size_t size = arr[i].reduce_size;
2725  for (int j = 0; j < nth; ++j) {
2726  void *priv_data = (char *)pr_data + j * size;
2727  f_comb(sh_data, priv_data); // combine results
2728  if (f_fini)
2729  f_fini(priv_data); // finalize if needed
2730  }
2731  } else {
2732  void **pr_data = (void **)(arr[i].reduce_priv);
2733  for (int j = 0; j < nth; ++j) {
2734  if (pr_data[j] != NULL) {
2735  f_comb(sh_data, pr_data[j]); // combine results
2736  if (f_fini)
2737  f_fini(pr_data[j]); // finalize if needed
2738  __kmp_free(pr_data[j]);
2739  }
2740  }
2741  }
2742  __kmp_free(arr[i].reduce_priv);
2743  }
2744  __kmp_thread_free(th, arr);
2745  tg->reduce_data = NULL;
2746  tg->reduce_num_data = 0;
2747 }
2748 
2749 // Cleanup task reduction data for parallel or worksharing,
2750 // do not touch task private data other threads still working with.
2751 // Called from __kmpc_end_taskgroup()
2752 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2753  __kmp_thread_free(th, tg->reduce_data);
2754  tg->reduce_data = NULL;
2755  tg->reduce_num_data = 0;
2756 }
2757 
2758 template <typename T>
2759 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2760  int num, T *data) {
2761  __kmp_assert_valid_gtid(gtid);
2762  kmp_info_t *thr = __kmp_threads[gtid];
2763  kmp_int32 nth = thr->th.th_team_nproc;
2764  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2765  if (nth == 1) {
2766  KA_TRACE(10,
2767  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2768  gtid, thr->th.th_current_task->td_taskgroup));
2769  return (void *)thr->th.th_current_task->td_taskgroup;
2770  }
2771  kmp_team_t *team = thr->th.th_team;
2772  void *reduce_data;
2773  kmp_taskgroup_t *tg;
2774  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2775  if (reduce_data == NULL &&
2776  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2777  (void *)1)) {
2778  // single thread enters this block to initialize common reduction data
2779  KMP_DEBUG_ASSERT(reduce_data == NULL);
2780  // first initialize own data, then make a copy other threads can use
2781  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2782  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2783  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2784  // fini counters should be 0 at this point
2785  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2786  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2787  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2788  } else {
2789  while (
2790  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2791  (void *)1) { // wait for task reduction initialization
2792  KMP_CPU_PAUSE();
2793  }
2794  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2795  tg = thr->th.th_current_task->td_taskgroup;
2796  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2797  }
2798  return tg;
2799 }
2800 
2817 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2818  int num, void *data) {
2819  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2820  (kmp_task_red_input_t *)data);
2821 }
2822 
2837 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2838  void *data) {
2839  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2840  (kmp_taskred_input_t *)data);
2841 }
2842 
2851 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2852  __kmpc_end_taskgroup(loc, gtid);
2853 }
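// For a "reduction(task, + : x)" clause on a parallel or worksharing construct
// the compiler brackets the region with the modifier entry points (sketch;
// "in" is a kmp_taskred_input_t filled as in the earlier example, and is_ws is
// 1 for worksharing, 0 for parallel):
//
//   void *tg = __kmpc_taskred_modifier_init(&loc, gtid, /* is_ws */ 0, 1, &in);
//   /* ...region body; tasks call __kmpc_task_reduction_get_th_data()... */
//   __kmpc_task_reduction_modifier_fini(&loc, gtid, /* is_ws */ 0);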
2854 
2855 // __kmpc_taskgroup: Start a new taskgroup
2856 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2857  __kmp_assert_valid_gtid(gtid);
2858  kmp_info_t *thread = __kmp_threads[gtid];
2859  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2860  kmp_taskgroup_t *tg_new =
2861  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2862  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2863  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2864  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2865  tg_new->parent = taskdata->td_taskgroup;
2866  tg_new->reduce_data = NULL;
2867  tg_new->reduce_num_data = 0;
2868  tg_new->gomp_data = NULL;
2869  taskdata->td_taskgroup = tg_new;
2870 
2871 #if OMPT_SUPPORT && OMPT_OPTIONAL
2872  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2873  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2874  if (!codeptr)
2875  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2876  kmp_team_t *team = thread->th.th_team;
2877  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2878  // FIXME: I think this is wrong for lwt!
2879  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2880 
2881  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2882  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2883  &(my_task_data), codeptr);
2884  }
2885 #endif
2886 }
2887 
2888 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2889 // and its descendants are complete
2890 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2891  __kmp_assert_valid_gtid(gtid);
2892  kmp_info_t *thread = __kmp_threads[gtid];
2893  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2894  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2895  int thread_finished = FALSE;
2896 
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898  kmp_team_t *team;
2899  ompt_data_t my_task_data;
2900  ompt_data_t my_parallel_data;
2901  void *codeptr = nullptr;
2902  if (UNLIKELY(ompt_enabled.enabled)) {
2903  team = thread->th.th_team;
2904  my_task_data = taskdata->ompt_task_info.task_data;
2905  // FIXME: I think this is wrong for lwt!
2906  my_parallel_data = team->t.ompt_team_info.parallel_data;
2907  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2908  if (!codeptr)
2909  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2910  }
2911 #endif
2912 
2913  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2914  KMP_DEBUG_ASSERT(taskgroup != NULL);
2915  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2916 
2917  if (__kmp_tasking_mode != tskm_immediate_exec) {
2918  // mark task as waiting not on a barrier
2919  taskdata->td_taskwait_counter += 1;
2920  taskdata->td_taskwait_ident = loc;
2921  taskdata->td_taskwait_thread = gtid + 1;
2922 #if USE_ITT_BUILD
2923  // For ITT the taskgroup wait is similar to taskwait until we need to
2924  // distinguish them
2925  void *itt_sync_obj = NULL;
2926 #if USE_ITT_NOTIFY
2927  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2928 #endif /* USE_ITT_NOTIFY */
2929 #endif /* USE_ITT_BUILD */
2930 
2931 #if OMPT_SUPPORT && OMPT_OPTIONAL
2932  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2933  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2934  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2935  &(my_task_data), codeptr);
2936  }
2937 #endif
2938 
2939  if (!taskdata->td_flags.team_serial ||
2940  (thread->th.th_task_team != NULL &&
2941  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2942  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2943  kmp_flag_32<false, false> flag(
2944  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2945  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2946  flag.execute_tasks(thread, gtid, FALSE,
2947  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2948  __kmp_task_stealing_constraint);
2949  }
2950  }
2951  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2952 
2953 #if OMPT_SUPPORT && OMPT_OPTIONAL
2954  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2955  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2956  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2957  &(my_task_data), codeptr);
2958  }
2959 #endif
2960 
2961 #if USE_ITT_BUILD
2962  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2963  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2964 #endif /* USE_ITT_BUILD */
2965  }
2966  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2967 
2968  if (taskgroup->reduce_data != NULL &&
2969  !taskgroup->gomp_data) { // need to reduce?
2970  int cnt;
2971  void *reduce_data;
2972  kmp_team_t *t = thread->th.th_team;
2973  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2974  // check if <priv> data of the first reduction variable is shared for the team
2975  void *priv0 = arr[0].reduce_priv;
2976  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2977  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2978  // finishing task reduction on parallel
2979  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2980  if (cnt == thread->th.th_team_nproc - 1) {
2981  // we are the last thread passing __kmpc_reduction_modifier_fini()
2982  // finalize task reduction:
2983  __kmp_task_reduction_fini(thread, taskgroup);
2984  // cleanup fields in the team structure:
2985  // TODO: is relaxed store enough here (whole barrier should follow)?
2986  __kmp_thread_free(thread, reduce_data);
2987  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2988  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2989  } else {
2990  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2991  // so do not finalize reduction, just clean own copy of the data
2992  __kmp_task_reduction_clean(thread, taskgroup);
2993  }
2994  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2995  NULL &&
2996  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2997  // finishing task reduction on worksharing
2998  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2999  if (cnt == thread->th.th_team_nproc - 1) {
3000  // we are the last thread passing __kmpc_reduction_modifier_fini()
3001  __kmp_task_reduction_fini(thread, taskgroup);
3002  // cleanup fields in team structure:
3003  // TODO: is relaxed store enough here (whole barrier should follow)?
3004  __kmp_thread_free(thread, reduce_data);
3005  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
3006  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
3007  } else {
3008  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3009  // so do not finalize reduction, just clean own copy of the data
3010  __kmp_task_reduction_clean(thread, taskgroup);
3011  }
3012  } else {
3013  // finishing task reduction on taskgroup
3014  __kmp_task_reduction_fini(thread, taskgroup);
3015  }
3016  }
3017  // Restore parent taskgroup for the current task
3018  taskdata->td_taskgroup = taskgroup->parent;
3019  __kmp_thread_free(thread, taskgroup);
3020 
3021  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
3022  gtid, taskdata));
3023 
3024 #if OMPT_SUPPORT && OMPT_OPTIONAL
3025  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
3026  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
3027  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
3028  &(my_task_data), codeptr);
3029  }
3030 #endif
3031 }
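// A minimal sketch (illustration only, not part of the runtime) of how a
// compiler might lower "#pragma omp taskgroup { ... }" onto this entry point
// and the matching __kmpc_taskgroup begin call defined earlier in this file;
// the outlining and the assumption that the begin call takes the same
// (loc, gtid) arguments are illustrative, not authoritative:
//
//   void lowered_taskgroup_region(ident_t *loc, int gtid) {
//     __kmpc_taskgroup(loc, gtid);     // push a new kmp_taskgroup_t
//     /* ... create child tasks via the usual task entry points ... */
//     __kmpc_end_taskgroup(loc, gtid); // wait until taskgroup->count == 0,
//                                      // finish reductions, pop the taskgroup
//   }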
3032 
3033 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
3034  kmp_task_team_t *task_team,
3035  kmp_int32 is_constrained) {
3036  kmp_task_t *task = NULL;
3037  kmp_taskdata_t *taskdata;
3038  kmp_taskdata_t *current;
3039  kmp_thread_data_t *thread_data;
3040  int ntasks = task_team->tt.tt_num_task_pri;
3041  if (ntasks == 0) {
3042  KA_TRACE(
3043  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
3044  return NULL;
3045  }
3046  do {
3047  // decrement num_tasks to "reserve" one task that we will take for execution
3048  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
3049  ntasks - 1))
3050  break;
3051  ntasks = task_team->tt.tt_num_task_pri;
3052  } while (ntasks > 0);
3053  if (ntasks == 0) {
3054  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
3055  __kmp_get_gtid()));
3056  return NULL;
3057  }
3058  // We got a "ticket" to get a "reserved" priority task
3059  int deque_ntasks;
3060  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3061  do {
3062  KMP_ASSERT(list != NULL);
3063  thread_data = &list->td;
3064  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3065  deque_ntasks = thread_data->td.td_deque_ntasks;
3066  if (deque_ntasks == 0) {
3067  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3068  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
3069  __kmp_get_gtid(), thread_data));
3070  list = list->next;
3071  }
3072  } while (deque_ntasks == 0);
3073  KMP_DEBUG_ASSERT(deque_ntasks);
3074  int target = thread_data->td.td_deque_head;
3075  current = __kmp_threads[gtid]->th.th_current_task;
3076  taskdata = thread_data->td.td_deque[target];
3077  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3078  // Bump head pointer and wrap.
3079  thread_data->td.td_deque_head =
3080  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3081  } else {
3082  if (!task_team->tt.tt_untied_task_encountered) {
3083  // The TSC does not allow stealing the victim task
3084  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3085  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
3086  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
3087  gtid, thread_data, task_team, deque_ntasks, target,
3088  thread_data->td.td_deque_tail));
3089  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3090  return NULL;
3091  }
3092  int i;
3093  // walk through the deque trying to steal any task
3094  taskdata = NULL;
3095  for (i = 1; i < deque_ntasks; ++i) {
3096  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3097  taskdata = thread_data->td.td_deque[target];
3098  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3099  break; // found task to execute
3100  } else {
3101  taskdata = NULL;
3102  }
3103  }
3104  if (taskdata == NULL) {
3105  // No appropriate candidate found to execute
3106  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3107  KA_TRACE(
3108  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
3109  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
3110  gtid, thread_data, task_team, deque_ntasks,
3111  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3112  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3113  return NULL;
3114  }
3115  int prev = target;
3116  for (i = i + 1; i < deque_ntasks; ++i) {
3117  // shift remaining tasks in the deque left by 1
3118  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3119  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
3120  prev = target;
3121  }
3122  KMP_DEBUG_ASSERT(
3123  thread_data->td.td_deque_tail ==
3124  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
3125  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
3126  }
3127  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
3128  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3129  task = KMP_TASKDATA_TO_TASK(taskdata);
3130  return task;
3131 }
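// The head/tail arithmetic above relies on the deque capacity being a power
// of two, so that "& TASK_DEQUE_MASK(...)" wraps an index cheaply. A minimal
// standalone sketch of that indexing scheme (illustrative names, fixed
// capacity, callers assumed to check ntasks before popping):
//
//   struct tiny_ring {
//     void *slot[8];                        // capacity must be a power of two
//     unsigned head = 0, tail = 0, ntasks = 0;
//     unsigned mask() const { return 8 - 1; }
//     void push(void *p) { slot[tail] = p; tail = (tail + 1) & mask(); ++ntasks; }
//     void *pop_head() {                    // thief side (cf. __kmp_steal_task)
//       void *p = slot[head]; head = (head + 1) & mask(); --ntasks; return p;
//     }
//     void *pop_tail() {                    // owner side (cf. __kmp_remove_my_task)
//       tail = (tail - 1) & mask(); --ntasks; return slot[tail];
//     }
//   };
//
// Extracting an allowed task from the middle, as in the constrained case
// above, preserves the same invariant by shifting the remaining entries
// toward the head and then moving the tail back by one.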
3132 
3133 // __kmp_remove_my_task: remove a task from my own deque
3134 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
3135  kmp_task_team_t *task_team,
3136  kmp_int32 is_constrained) {
3137  kmp_task_t *task;
3138  kmp_taskdata_t *taskdata;
3139  kmp_thread_data_t *thread_data;
3140  kmp_uint32 tail;
3141 
3142  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3143  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
3144  NULL); // Caller should check this condition
3145 
3146  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
3147 
3148  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
3149  gtid, thread_data->td.td_deque_ntasks,
3150  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3151 
3152  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3153  KA_TRACE(10,
3154  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
3155  "ntasks=%d head=%u tail=%u\n",
3156  gtid, thread_data->td.td_deque_ntasks,
3157  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3158  return NULL;
3159  }
3160 
3161  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3162 
3163  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3164  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3165  KA_TRACE(10,
3166  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
3167  "ntasks=%d head=%u tail=%u\n",
3168  gtid, thread_data->td.td_deque_ntasks,
3169  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3170  return NULL;
3171  }
3172 
3173  tail = (thread_data->td.td_deque_tail - 1) &
3174  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
3175  taskdata = thread_data->td.td_deque[tail];
3176 
3177  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
3178  thread->th.th_current_task)) {
3179  // The TSC does not allow stealing the victim task
3180  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3181  KA_TRACE(10,
3182  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3183  "ntasks=%d head=%u tail=%u\n",
3184  gtid, thread_data->td.td_deque_ntasks,
3185  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3186  return NULL;
3187  }
3188 
3189  thread_data->td.td_deque_tail = tail;
3190  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3191 
3192  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3193 
3194  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3195  "ntasks=%d head=%u tail=%u\n",
3196  gtid, taskdata, thread_data->td.td_deque_ntasks,
3197  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3198 
3199  task = KMP_TASKDATA_TO_TASK(taskdata);
3200  return task;
3201 }
3202 
3203 // __kmp_steal_task: remove a task from another thread's deque
3204  // Assumes the calling thread has already checked that the task team's
3205  // thread_data exists before calling this routine.
3206 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
3207  kmp_task_team_t *task_team,
3208  std::atomic<kmp_int32> *unfinished_threads,
3209  int *thread_finished,
3210  kmp_int32 is_constrained) {
3211  kmp_task_t *task;
3212  kmp_taskdata_t *taskdata;
3213  kmp_taskdata_t *current;
3214  kmp_thread_data_t *victim_td, *threads_data;
3215  kmp_int32 target;
3216  kmp_int32 victim_tid;
3217 
3218  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3219 
3220  threads_data = task_team->tt.tt_threads_data;
3221  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3222 
3223  victim_tid = victim_thr->th.th_info.ds.ds_tid;
3224  victim_td = &threads_data[victim_tid];
3225 
3226  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3227  "task_team=%p ntasks=%d head=%u tail=%u\n",
3228  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3229  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3230  victim_td->td.td_deque_tail));
3231 
3232  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3233  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3234  "task_team=%p ntasks=%d head=%u tail=%u\n",
3235  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3236  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3237  victim_td->td.td_deque_tail));
3238  return NULL;
3239  }
3240 
3241  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3242 
3243  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3244  // Check again after we acquire the lock
3245  if (ntasks == 0) {
3246  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3247  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3248  "task_team=%p ntasks=%d head=%u tail=%u\n",
3249  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3250  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3251  return NULL;
3252  }
3253 
3254  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3255  current = __kmp_threads[gtid]->th.th_current_task;
3256  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3257  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3258  // Bump head pointer and wrap.
3259  victim_td->td.td_deque_head =
3260  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3261  } else {
3262  if (!task_team->tt.tt_untied_task_encountered) {
3263  // The TSC does not allow stealing the victim task
3264  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3265  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3266  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3267  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3268  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3269  return NULL;
3270  }
3271  int i;
3272  // walk through victim's deque trying to steal any task
3273  target = victim_td->td.td_deque_head;
3274  taskdata = NULL;
3275  for (i = 1; i < ntasks; ++i) {
3276  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3277  taskdata = victim_td->td.td_deque[target];
3278  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3279  break; // found victim task
3280  } else {
3281  taskdata = NULL;
3282  }
3283  }
3284  if (taskdata == NULL) {
3285  // No appropriate candidate to steal found
3286  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3287  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3288  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3289  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3290  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3291  return NULL;
3292  }
3293  int prev = target;
3294  for (i = i + 1; i < ntasks; ++i) {
3295  // shift remaining tasks in the deque left by 1
3296  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3297  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3298  prev = target;
3299  }
3300  KMP_DEBUG_ASSERT(
3301  victim_td->td.td_deque_tail ==
3302  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3303  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3304  }
3305  if (*thread_finished) {
3306  // We need to un-mark this victim as a finished victim. This must be done
3307  // before releasing the lock, or else other threads (starting with the
3308  // primary thread victim) might be prematurely released from the barrier!!!
3309 #if KMP_DEBUG
3310  kmp_int32 count =
3311 #endif
3312  KMP_ATOMIC_INC(unfinished_threads);
3313  KA_TRACE(
3314  20,
3315  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3316  gtid, count + 1, task_team));
3317  *thread_finished = FALSE;
3318  }
3319  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3320 
3321  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3322 
3323  KMP_COUNT_BLOCK(TASK_stolen);
3324  KA_TRACE(10,
3325  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3326  "task_team=%p ntasks=%d head=%u tail=%u\n",
3327  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3328  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3329 
3330  task = KMP_TASKDATA_TO_TASK(taskdata);
3331  return task;
3332 }
3333 
3334 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3335  // condition is satisfied (return true) or there are none left (return false).
3336 //
3337 // final_spin is TRUE if this is the spin at the release barrier.
3338 // thread_finished indicates whether the thread is finished executing all
3339 // the tasks it has on its deque, and is at the release barrier.
3340 // spinner is the location on which to spin.
3341 // spinner == NULL means only execute a single task and return.
3342 // checker is the value to check to terminate the spin.
3343 template <class C>
3344 static inline int __kmp_execute_tasks_template(
3345  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3346  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3347  kmp_int32 is_constrained) {
3348  kmp_task_team_t *task_team = thread->th.th_task_team;
3349  kmp_thread_data_t *threads_data;
3350  kmp_task_t *task;
3351  kmp_info_t *other_thread;
3352  kmp_taskdata_t *current_task = thread->th.th_current_task;
3353  std::atomic<kmp_int32> *unfinished_threads;
3354  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3355  tid = thread->th.th_info.ds.ds_tid;
3356 
3357  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3358  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3359 
3360  if (task_team == NULL || current_task == NULL)
3361  return FALSE;
3362 
3363  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3364  "*thread_finished=%d\n",
3365  gtid, final_spin, *thread_finished));
3366 
3367  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3368  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3369 
3370  KMP_DEBUG_ASSERT(threads_data != NULL);
3371 
3372  nthreads = task_team->tt.tt_nproc;
3373  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3374  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
3375  task_team->tt.tt_hidden_helper_task_encountered);
3376  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3377 
3378  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3379  // getting tasks from target constructs
3380  while (1) { // Inner loop to find a task and execute it
3381  task = NULL;
3382  if (task_team->tt.tt_num_task_pri) { // get priority task first
3383  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3384  }
3385  if (task == NULL && use_own_tasks) { // check own queue next
3386  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3387  }
3388  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3389  int asleep = 1;
3390  use_own_tasks = 0;
3391  // Try to steal from the last place I stole from successfully.
3392  if (victim_tid == -2) { // haven't stolen anything yet
3393  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3394  if (victim_tid !=
3395  -1) // if we have a last stolen from victim, get the thread
3396  other_thread = threads_data[victim_tid].td.td_thr;
3397  }
3398  if (victim_tid != -1) { // found last victim
3399  asleep = 0;
3400  } else if (!new_victim) { // no recent steals and we haven't already
3401  // used a new victim; select a random thread
3402  do { // Find a different thread to steal work from.
3403  // Pick a random thread. Initial plan was to cycle through all the
3404  // threads, and only return if we tried to steal from every thread,
3405  // and failed. Arch says that's not such a great idea.
3406  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3407  if (victim_tid >= tid) {
3408  ++victim_tid; // Adjusts random distribution to exclude self
3409  }
3410  // Found a potential victim
3411  other_thread = threads_data[victim_tid].td.td_thr;
3412  // There is a slight chance that __kmp_enable_tasking() did not wake
3413  // up all threads waiting at the barrier. If victim is sleeping,
3414  // then wake it up. Since we were going to pay the cache miss
3415  // penalty for referencing another thread's kmp_info_t struct
3416  // anyway,
3417  // the check shouldn't cost too much performance at this point. In
3418  // extra barrier mode, tasks do not sleep at the separate tasking
3419  // barrier, so this isn't a problem.
3420  asleep = 0;
3421  if ((__kmp_tasking_mode == tskm_task_teams) &&
3422  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3423  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3424  NULL)) {
3425  asleep = 1;
3426  __kmp_null_resume_wrapper(other_thread);
3427  // A sleeping thread should not have any tasks on its queue.
3428  // There is a slight possibility that it resumes, steals a task
3429  // from another thread, which spawns more tasks, all in the time
3430  // that it takes this thread to check => don't write an assertion
3431  // that the victim's queue is empty. Try stealing from a
3432  // different thread.
3433  }
3434  } while (asleep);
3435  }
3436 
3437  if (!asleep) {
3438  // We have a victim to try to steal from
3439  task = __kmp_steal_task(other_thread, gtid, task_team,
3440  unfinished_threads, thread_finished,
3441  is_constrained);
3442  }
3443  if (task != NULL) { // set last stolen to victim
3444  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3445  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3446  // The pre-refactored code did not try more than 1 successful new
3447  // victim, unless the last one generated more local tasks;
3448  // new_victim keeps track of this
3449  new_victim = 1;
3450  }
3451  } else { // No tasks found; unset last_stolen
3452  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3453  victim_tid = -2; // no successful victim found
3454  }
3455  }
3456 
3457  if (task == NULL)
3458  break; // break out of tasking loop
3459 
3460 // Found a task; execute it
3461 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3462  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3463  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3464  // get the object reliably
3465  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3466  }
3467  __kmp_itt_task_starting(itt_sync_obj);
3468  }
3469 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3470  __kmp_invoke_task(gtid, task, current_task);
3471 #if USE_ITT_BUILD
3472  if (itt_sync_obj != NULL)
3473  __kmp_itt_task_finished(itt_sync_obj);
3474 #endif /* USE_ITT_BUILD */
3475  // If this thread is only partway through the barrier and the condition is
3476  // met, then return now, so that the barrier gather/release pattern can
3477  // proceed. If this thread is in the last spin loop in the barrier,
3478  // waiting to be released, we know that the termination condition will not
3479  // be satisfied, so don't waste any cycles checking it.
3480  if (flag == NULL || (!final_spin && flag->done_check())) {
3481  KA_TRACE(
3482  15,
3483  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3484  gtid));
3485  return TRUE;
3486  }
3487  if (thread->th.th_task_team == NULL) {
3488  break;
3489  }
3490  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3491  // If execution of a stolen task results in more tasks being placed on our
3492  // run queue, reset use_own_tasks
3493  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3494  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3495  "other tasks, restart\n",
3496  gtid));
3497  use_own_tasks = 1;
3498  new_victim = 0;
3499  }
3500  }
3501 
3502  // The task source has been exhausted. If in final spin loop of barrier,
3503  // check if termination condition is satisfied. The work queue may be empty
3504  // but there might be proxy tasks still executing.
3505  if (final_spin &&
3506  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3507  // First, decrement the #unfinished threads, if that has not already been
3508  // done. This decrement might be to the spin location, and result in the
3509  // termination condition being satisfied.
3510  if (!*thread_finished) {
3511 #if KMP_DEBUG
3512  kmp_int32 count = -1 +
3513 #endif
3514  KMP_ATOMIC_DEC(unfinished_threads);
3515  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3516  "unfinished_threads to %d task_team=%p\n",
3517  gtid, count, task_team));
3518  *thread_finished = TRUE;
3519  }
3520 
3521  // It is now unsafe to reference thread->th.th_team !!!
3522  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3523  // thread to pass through the barrier, where it might reset each thread's
3524  // th.th_team field for the next parallel region. If we can steal more
3525  // work, we know that this has not happened yet.
3526  if (flag != NULL && flag->done_check()) {
3527  KA_TRACE(
3528  15,
3529  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3530  gtid));
3531  return TRUE;
3532  }
3533  }
3534 
3535  // If this thread's task team is NULL, primary thread has recognized that
3536  // there are no more tasks; bail out
3537  if (thread->th.th_task_team == NULL) {
3538  KA_TRACE(15,
3539  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3540  return FALSE;
3541  }
3542 
3543  // Check the flag again to see if it is already done, to avoid being
3544  // trapped in an infinite loop when an if0 task depends on a hidden helper
3545  // task outside any parallel region. Detached tasks are not affected here
3546  // because the only thread executing this function has to execute the proxy
3547  // task, so it is in another code path that has the same check.
3548  if (flag == NULL || (!final_spin && flag->done_check())) {
3549  KA_TRACE(15,
3550  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3551  gtid));
3552  return TRUE;
3553  }
3554 
3555  // We could be getting tasks from target constructs; if this is the only
3556  // thread, keep trying to execute tasks from own queue
3557  if (nthreads == 1 &&
3558  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3559  use_own_tasks = 1;
3560  else {
3561  KA_TRACE(15,
3562  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3563  return FALSE;
3564  }
3565  }
3566 }
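// The victim selection above draws a uniformly random thread id other than
// the caller's: a value in [0, nthreads-2] is drawn and bumped by one when it
// lands on or above the caller's tid. A minimal sketch of just that mapping,
// with std::rand() standing in for the runtime's __kmp_get_random():
//
//   #include <cstdlib>
//   int pick_victim(int self_tid, int nthreads) { // requires nthreads > 1
//     int v = std::rand() % (nthreads - 1); // uniform over nthreads-1 slots
//     if (v >= self_tid)
//       ++v; // skip over self; result stays uniform over the other threads
//     return v;
//   }
//
// For example, with nthreads = 4 and self_tid = 1, draws 0, 1, 2 map to
// victims 0, 2, 3 respectively.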
3567 
3568 template <bool C, bool S>
3569 int __kmp_execute_tasks_32(
3570  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3571  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3572  kmp_int32 is_constrained) {
3573  return __kmp_execute_tasks_template(
3574  thread, gtid, flag, final_spin,
3575  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3576 }
3577 
3578 template <bool C, bool S>
3579 int __kmp_execute_tasks_64(
3580  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3581  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3582  kmp_int32 is_constrained) {
3583  return __kmp_execute_tasks_template(
3584  thread, gtid, flag, final_spin,
3585  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3586 }
3587 
3588 template <bool C, bool S>
3589 int __kmp_atomic_execute_tasks_64(
3590  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3591  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3592  kmp_int32 is_constrained) {
3593  return __kmp_execute_tasks_template(
3594  thread, gtid, flag, final_spin,
3595  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3596 }
3597 
3598 int __kmp_execute_tasks_oncore(
3599  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3600  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3601  kmp_int32 is_constrained) {
3602  return __kmp_execute_tasks_template(
3603  thread, gtid, flag, final_spin,
3604  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3605 }
3606 
3607 template int
3608 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3609  kmp_flag_32<false, false> *, int,
3610  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3611 
3612 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3613  kmp_flag_64<false, true> *,
3614  int,
3615  int *USE_ITT_BUILD_ARG(void *),
3616  kmp_int32);
3617 
3618 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3619  kmp_flag_64<true, false> *,
3620  int,
3621  int *USE_ITT_BUILD_ARG(void *),
3622  kmp_int32);
3623 
3624 template int __kmp_atomic_execute_tasks_64<false, true>(
3625  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3626  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3627 
3628 template int __kmp_atomic_execute_tasks_64<true, false>(
3629  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3630  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3631 
3632 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3633 // next barrier so they can assist in executing enqueued tasks.
3634  // The first thread in allocates the task team atomically.
3635 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3636  kmp_info_t *this_thr) {
3637  kmp_thread_data_t *threads_data;
3638  int nthreads, i, is_init_thread;
3639 
3640  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3641  __kmp_gtid_from_thread(this_thr)));
3642 
3643  KMP_DEBUG_ASSERT(task_team != NULL);
3644  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3645 
3646  nthreads = task_team->tt.tt_nproc;
3647  KMP_DEBUG_ASSERT(nthreads > 0);
3648  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3649 
3650  // Allocate or increase the size of threads_data if necessary
3651  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3652 
3653  if (!is_init_thread) {
3654  // Some other thread already set up the array.
3655  KA_TRACE(
3656  20,
3657  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3658  __kmp_gtid_from_thread(this_thr)));
3659  return;
3660  }
3661  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3662  KMP_DEBUG_ASSERT(threads_data != NULL);
3663 
3664  if (__kmp_tasking_mode == tskm_task_teams &&
3665  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3666  // Release any threads sleeping at the barrier, so that they can steal
3667  // tasks and execute them. In extra barrier mode, tasks do not sleep
3668  // at the separate tasking barrier, so this isn't a problem.
3669  for (i = 0; i < nthreads; i++) {
3670  void *sleep_loc;
3671  kmp_info_t *thread = threads_data[i].td.td_thr;
3672 
3673  if (i == this_thr->th.th_info.ds.ds_tid) {
3674  continue;
3675  }
3676  // Since we haven't locked the thread's suspend mutex at this
3677  // point, there is a small window where a thread might be putting
3678  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3679  // To work around this, __kmp_execute_tasks_template() periodically checks
3680  // to see if other threads are sleeping (using the same random mechanism
3681  // that is used for task stealing) and awakens them if they are.
3682  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3683  NULL) {
3684  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3685  __kmp_gtid_from_thread(this_thr),
3686  __kmp_gtid_from_thread(thread)));
3687  __kmp_null_resume_wrapper(thread);
3688  } else {
3689  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3690  __kmp_gtid_from_thread(this_thr),
3691  __kmp_gtid_from_thread(thread)));
3692  }
3693  }
3694  }
3695 
3696  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3697  __kmp_gtid_from_thread(this_thr)));
3698 }
3699 
3700 /* // TODO: Check the comment consistency
3701  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind
3702  * of like a shadow of the kmp_team_t data struct, with a different lifetime.
3703  * After a child thread checks into a barrier and calls __kmp_release() from
3704  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3705  * longer assume that the kmp_team_t structure is intact (at any moment, the
3706  * primary thread may exit the barrier code and free the team data structure,
3707  * and return the threads to the thread pool).
3708  *
3709  * This does not work with the tasking code, as the thread is still
3710  * expected to participate in the execution of any tasks that may have been
3711  * spawned by a member of the team, and the thread still needs access to
3712  * each thread in the team, so that it can steal work from it.
3713  *
3714  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3715  * counting mechanism, and is allocated by the primary thread before calling
3716  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3717  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3718  * of the kmp_task_team_t structs for consecutive barriers can overlap
3719  * (and will, unless the primary thread is the last thread to exit the barrier
3720  * release phase, which is not typical). The existence of such a struct
3721  * could also be useful outside the context of tasking.
3722  *
3723  * We currently use the existence of the threads array as an indicator that
3724  * tasks were spawned since the last barrier. If the structure is to be
3725  * useful outside the context of tasking, then this will have to change, but
3726  * not setting the field minimizes the performance impact of tasking on
3727  * barriers, when no explicit tasks were spawned (pushed, actually).
3728  */
3729 
3730 static kmp_task_team_t *__kmp_free_task_teams =
3731  NULL; // Free list for task_team data structures
3732 // Lock for task team data structures
3733 kmp_bootstrap_lock_t __kmp_task_team_lock =
3734  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3735 
3736 // __kmp_alloc_task_deque:
3737  // Allocates a task deque for a particular thread, and initializes the necessary
3738 // data structures relating to the deque. This only happens once per thread
3739 // per task team since task teams are recycled. No lock is needed during
3740 // allocation since each thread allocates its own deque.
3741 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3742  kmp_thread_data_t *thread_data) {
3743  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3744  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3745 
3746  // Initialize last stolen task field to "none"
3747  thread_data->td.td_deque_last_stolen = -1;
3748 
3749  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3750  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3751  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3752 
3753  KE_TRACE(
3754  10,
3755  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3756  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3757  // Allocate space for task deque, and zero the deque
3758  // Cannot use __kmp_thread_calloc() because threads not around for
3759  // kmp_reap_task_team( ).
3760  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3761  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3762  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3763 }
3764 
3765 // __kmp_free_task_deque:
3766 // Deallocates a task deque for a particular thread. Happens at library
3767  // deallocation, so there is no need to reset all thread data fields.
3768 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3769  if (thread_data->td.td_deque != NULL) {
3770  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3771  TCW_4(thread_data->td.td_deque_ntasks, 0);
3772  __kmp_free(thread_data->td.td_deque);
3773  thread_data->td.td_deque = NULL;
3774  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3775  }
3776 
3777 #ifdef BUILD_TIED_TASK_STACK
3778  // GEH: Figure out what to do here for td_susp_tied_tasks
3779  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3780  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3781  }
3782 #endif // BUILD_TIED_TASK_STACK
3783 }
3784 
3785 // __kmp_realloc_task_threads_data:
3786 // Allocates a threads_data array for a task team, either by allocating an
3787 // initial array or enlarging an existing array. Only the first thread to get
3788 // the lock allocs or enlarges the array and re-initializes the array elements.
3789 // That thread returns "TRUE", the rest return "FALSE".
3790 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3791 // The current size is given by task_team -> tt.tt_max_threads.
3792 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3793  kmp_task_team_t *task_team) {
3794  kmp_thread_data_t **threads_data_p;
3795  kmp_int32 nthreads, maxthreads;
3796  int is_init_thread = FALSE;
3797 
3798  if (TCR_4(task_team->tt.tt_found_tasks)) {
3799  // Already reallocated and initialized.
3800  return FALSE;
3801  }
3802 
3803  threads_data_p = &task_team->tt.tt_threads_data;
3804  nthreads = task_team->tt.tt_nproc;
3805  maxthreads = task_team->tt.tt_max_threads;
3806 
3807  // All threads must lock when they encounter the first task of the implicit
3808  // task region to make sure threads_data fields are (re)initialized before
3809  // used.
3810  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3811 
3812  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3813  // first thread to enable tasking
3814  kmp_team_t *team = thread->th.th_team;
3815  int i;
3816 
3817  is_init_thread = TRUE;
3818  if (maxthreads < nthreads) {
3819 
3820  if (*threads_data_p != NULL) {
3821  kmp_thread_data_t *old_data = *threads_data_p;
3822  kmp_thread_data_t *new_data = NULL;
3823 
3824  KE_TRACE(
3825  10,
3826  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3827  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3828  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3829  // Reallocate threads_data to have more elements than current array
3830  // Cannot use __kmp_thread_realloc() because threads not around for
3831  // kmp_reap_task_team( ). Note all new array entries are initialized
3832  // to zero by __kmp_allocate().
3833  new_data = (kmp_thread_data_t *)__kmp_allocate(
3834  nthreads * sizeof(kmp_thread_data_t));
3835  // copy old data to new data
3836  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3837  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3838 
3839 #ifdef BUILD_TIED_TASK_STACK
3840  // GEH: Figure out if this is the right thing to do
3841  for (i = maxthreads; i < nthreads; i++) {
3842  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3843  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3844  }
3845 #endif // BUILD_TIED_TASK_STACK
3846  // Install the new data and free the old data
3847  (*threads_data_p) = new_data;
3848  __kmp_free(old_data);
3849  } else {
3850  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3851  "threads data for task_team %p, size = %d\n",
3852  __kmp_gtid_from_thread(thread), task_team, nthreads));
3853  // Make the initial allocate for threads_data array, and zero entries
3854  // Cannot use __kmp_thread_calloc() because threads not around for
3855  // kmp_reap_task_team( ).
3856  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3857  nthreads * sizeof(kmp_thread_data_t));
3858 #ifdef BUILD_TIED_TASK_STACK
3859  // GEH: Figure out if this is the right thing to do
3860  for (i = 0; i < nthreads; i++) {
3861  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3862  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3863  }
3864 #endif // BUILD_TIED_TASK_STACK
3865  }
3866  task_team->tt.tt_max_threads = nthreads;
3867  } else {
3868  // If array has (more than) enough elements, go ahead and use it
3869  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3870  }
3871 
3872  // initialize threads_data pointers back to thread_info structures
3873  for (i = 0; i < nthreads; i++) {
3874  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3875  thread_data->td.td_thr = team->t.t_threads[i];
3876 
3877  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3878  // The last stolen field survives across teams / barriers, and the number
3879  // of threads may have changed. It's possible (likely?) that a new
3880  // parallel region will exhibit the same behavior as the previous region.
3881  thread_data->td.td_deque_last_stolen = -1;
3882  }
3883  }
3884 
3885  KMP_MB();
3886  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3887  }
3888 
3889  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3890  return is_init_thread;
3891 }
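// __kmp_realloc_task_threads_data follows a check / lock / re-check shape:
// tt_found_tasks is read without the lock as a fast path, and only the first
// thread that acquires tt_threads_lock while the flag is still FALSE performs
// the (re)allocation. A minimal generic sketch of that pattern with standard
// C++ primitives (illustrative names, not the runtime's accessors):
//
//   #include <atomic>
//   #include <mutex>
//
//   static std::mutex init_lock;
//   static std::atomic<bool> initialized{false};
//   static int *shared_table = nullptr;
//
//   bool init_once(int n) {           // returns true only for the init thread
//     if (initialized.load(std::memory_order_acquire))
//       return false;                 // fast path, no lock taken
//     std::lock_guard<std::mutex> g(init_lock);
//     if (initialized.load(std::memory_order_relaxed))
//       return false;                 // somebody else won the race
//     shared_table = new int[n]();    // publish the data first ...
//     initialized.store(true, std::memory_order_release); // ... then the flag
//     return true;
//   }
//
// The runtime expresses the same ordering with its bootstrap lock, the
// TCR_4/TCW_SYNC_4 accessors, and an explicit KMP_MB() before the flag store.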
3892 
3893 // __kmp_free_task_threads_data:
3894 // Deallocates a threads_data array for a task team, including any attached
3895 // tasking deques. Only occurs at library shutdown.
3896 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3897  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3898  if (task_team->tt.tt_threads_data != NULL) {
3899  int i;
3900  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3901  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3902  }
3903  __kmp_free(task_team->tt.tt_threads_data);
3904  task_team->tt.tt_threads_data = NULL;
3905  }
3906  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3907 }
3908 
3909 // __kmp_free_task_pri_list:
3910 // Deallocates tasking deques used for priority tasks.
3911 // Only occurs at library shutdown.
3912 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3913  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3914  if (task_team->tt.tt_task_pri_list != NULL) {
3915  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3916  while (list != NULL) {
3917  kmp_task_pri_t *next = list->next;
3918  __kmp_free_task_deque(&list->td);
3919  __kmp_free(list);
3920  list = next;
3921  }
3922  task_team->tt.tt_task_pri_list = NULL;
3923  }
3924  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3925 }
3926 
3927 // __kmp_allocate_task_team:
3928 // Allocates a task team associated with a specific team, taking it from
3929 // the global task team free list if possible. Also initializes data
3930 // structures.
3931 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3932  kmp_team_t *team) {
3933  kmp_task_team_t *task_team = NULL;
3934  int nthreads;
3935 
3936  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3937  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3938 
3939  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3940  // Take a task team from the task team pool
3941  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3942  if (__kmp_free_task_teams != NULL) {
3943  task_team = __kmp_free_task_teams;
3944  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3945  task_team->tt.tt_next = NULL;
3946  }
3947  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3948  }
3949 
3950  if (task_team == NULL) {
3951  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3952  "task team for team %p\n",
3953  __kmp_gtid_from_thread(thread), team));
3954  // Allocate a new task team if one is not available. Cannot use
3955  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3956  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3957  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3958  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3959 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3960  // suppress race conditions detection on synchronization flags in debug mode
3961  // this helps to analyze library internals eliminating false positives
3962  __itt_suppress_mark_range(
3963  __itt_suppress_range, __itt_suppress_threading_errors,
3964  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3965  __itt_suppress_mark_range(__itt_suppress_range,
3966  __itt_suppress_threading_errors,
3967  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3968  sizeof(task_team->tt.tt_active));
3969 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3970  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3971  // task_team->tt.tt_threads_data = NULL;
3972  // task_team->tt.tt_max_threads = 0;
3973  // task_team->tt.tt_next = NULL;
3974  }
3975 
3976  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3977  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3978  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3979  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3980 
3981  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3982  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3983  TCW_4(task_team->tt.tt_active, TRUE);
3984 
3985  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3986  "unfinished_threads init'd to %d\n",
3987  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3988  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3989  return task_team;
3990 }
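// Allocation above prefers recycling: an unsynchronized peek at
// __kmp_free_task_teams is followed by a locked pop, and only if the free
// list turns out to be empty is fresh memory obtained. A minimal sketch of
// that pooled-allocation shape (illustrative types; the runtime's unlocked
// peek uses TCR_PTR rather than a plain read):
//
//   #include <mutex>
//
//   struct node { node *next = nullptr; /* payload ... */ };
//   static std::mutex pool_lock;
//   static node *free_list = nullptr;  // guarded by pool_lock
//
//   node *pool_get() {
//     {
//       std::lock_guard<std::mutex> g(pool_lock);
//       if (free_list != nullptr) {
//         node *n = free_list;
//         free_list = n->next;
//         n->next = nullptr;
//         return n;                    // reuse a recycled node
//       }
//     }
//     return new node();               // pool empty: allocate fresh storage
//   }
//
//   void pool_put(node *n) {           // cf. __kmp_free_task_team below
//     std::lock_guard<std::mutex> g(pool_lock);
//     n->next = free_list;
//     free_list = n;
//   }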
3991 
3992 // __kmp_free_task_team:
3993 // Frees the task team associated with a specific thread, and adds it
3994 // to the global task team free list.
3995 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3996  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3997  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3998 
3999  // Put task team back on free list
4000  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4001 
4002  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
4003  task_team->tt.tt_next = __kmp_free_task_teams;
4004  TCW_PTR(__kmp_free_task_teams, task_team);
4005 
4006  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4007 }
4008 
4009 // __kmp_reap_task_teams:
4010 // Free all the task teams on the task team free list.
4011 // Should only be done during library shutdown.
4012 // Cannot do anything that needs a thread structure or gtid since they are
4013 // already gone.
4014 void __kmp_reap_task_teams(void) {
4015  kmp_task_team_t *task_team;
4016 
4017  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
4018  // Free all task_teams on the free list
4019  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4020  while ((task_team = __kmp_free_task_teams) != NULL) {
4021  __kmp_free_task_teams = task_team->tt.tt_next;
4022  task_team->tt.tt_next = NULL;
4023 
4024  // Free threads_data if necessary
4025  if (task_team->tt.tt_threads_data != NULL) {
4026  __kmp_free_task_threads_data(task_team);
4027  }
4028  if (task_team->tt.tt_task_pri_list != NULL) {
4029  __kmp_free_task_pri_list(task_team);
4030  }
4031  __kmp_free(task_team);
4032  }
4033  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4034  }
4035 }
4036 
4037 // __kmp_wait_to_unref_task_teams:
4038 // Some threads could still be in the fork barrier release code, possibly
4039 // trying to steal tasks. Wait for each thread to unreference its task team.
4040 void __kmp_wait_to_unref_task_teams(void) {
4041  kmp_info_t *thread;
4042  kmp_uint32 spins;
4043  kmp_uint64 time;
4044  int done;
4045 
4046  KMP_INIT_YIELD(spins);
4047  KMP_INIT_BACKOFF(time);
4048 
4049  for (;;) {
4050  done = TRUE;
4051 
4052  // TODO: GEH - this may be wrong because some sync would be necessary
4053  // in case threads are added to the pool during the traversal. Need to
4054  // verify that lock for thread pool is held when calling this routine.
4055  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
4056  thread = thread->th.th_next_pool) {
4057 #if KMP_OS_WINDOWS
4058  DWORD exit_val;
4059 #endif
4060  if (TCR_PTR(thread->th.th_task_team) == NULL) {
4061  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
4062  __kmp_gtid_from_thread(thread)));
4063  continue;
4064  }
4065 #if KMP_OS_WINDOWS
4066  // TODO: GEH - add this check for Linux* OS / OS X* as well?
4067  if (!__kmp_is_thread_alive(thread, &exit_val)) {
4068  thread->th.th_task_team = NULL;
4069  continue;
4070  }
4071 #endif
4072 
4073  done = FALSE; // Because th_task_team pointer is not NULL for this thread
4074 
4075  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
4076  "unreference task_team\n",
4077  __kmp_gtid_from_thread(thread)));
4078 
4079  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
4080  void *sleep_loc;
4081  // If the thread is sleeping, awaken it.
4082  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
4083  NULL) {
4084  KA_TRACE(
4085  10,
4086  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
4087  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
4088  __kmp_null_resume_wrapper(thread);
4089  }
4090  }
4091  }
4092  if (done) {
4093  break;
4094  }
4095 
4096  // If oversubscribed or have waited a bit, yield.
4097  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
4098  }
4099 }
4100 
4101 void __kmp_shift_task_state_stack(kmp_info_t *this_thr, kmp_uint8 value) {
4102  // Shift values from th_task_state_top+1 to task_state_stack_sz
4103  if (this_thr->th.th_task_state_top + 1 >=
4104  this_thr->th.th_task_state_stack_sz) { // increase size
4105  kmp_uint32 new_size = 2 * this_thr->th.th_task_state_stack_sz;
4106  kmp_uint8 *old_stack, *new_stack;
4107  kmp_uint32 i;
4108  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
4109  for (i = 0; i <= this_thr->th.th_task_state_top; ++i) {
4110  new_stack[i] = this_thr->th.th_task_state_memo_stack[i];
4111  }
4112  // If we need to reallocate do the shift at the same time.
4113  for (; i < this_thr->th.th_task_state_stack_sz; ++i) {
4114  new_stack[i + 1] = this_thr->th.th_task_state_memo_stack[i];
4115  }
4116  for (i = this_thr->th.th_task_state_stack_sz; i < new_size;
4117  ++i) { // zero-init rest of stack
4118  new_stack[i] = 0;
4119  }
4120  old_stack = this_thr->th.th_task_state_memo_stack;
4121  this_thr->th.th_task_state_memo_stack = new_stack;
4122  this_thr->th.th_task_state_stack_sz = new_size;
4123  __kmp_free(old_stack);
4124  } else {
4125  kmp_uint8 *end;
4126  kmp_uint32 i;
4127 
4128  end = &this_thr->th
4129  .th_task_state_memo_stack[this_thr->th.th_task_state_stack_sz];
4130 
4131  for (i = this_thr->th.th_task_state_stack_sz - 1;
4132  i > this_thr->th.th_task_state_top; i--, end--)
4133  end[0] = end[-1];
4134  }
4135  this_thr->th.th_task_state_memo_stack[this_thr->th.th_task_state_top + 1] =
4136  value;
4137 }
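// __kmp_shift_task_state_stack opens a one-entry gap just above
// th_task_state_top: with no headroom, the contents are copied into a buffer
// of twice the size with the gap already opened; otherwise the entries above
// the top are shifted up by one in place. A minimal sketch of the in-place
// variant on a plain array (illustrative; the caller guarantees used < cap):
//
//   // Shift a[top+1 .. used-1] up by one slot and write v into a[top+1].
//   void insert_above(unsigned char *a, unsigned used, unsigned top,
//                     unsigned char v) {
//     for (unsigned i = used; i > top + 1; --i)
//       a[i] = a[i - 1];
//     a[top + 1] = v;
//   }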
4138 
4139 // __kmp_task_team_setup: Create a task_team for the current team, but use
4140 // an already created, unused one if it already exists.
4141 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
4142  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4143 
4144  // If this task_team hasn't been created yet, allocate it. It will be used in
4145  // the region after the next.
4146  // If it exists, it is the current task team and shouldn't be touched yet as
4147  // it may still be in use.
4148  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
4149  (always || team->t.t_nproc > 1)) {
4150  team->t.t_task_team[this_thr->th.th_task_state] =
4151  __kmp_allocate_task_team(this_thr, team);
4152  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4153  " for team %d at parity=%d\n",
4154  __kmp_gtid_from_thread(this_thr),
4155  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
4156  this_thr->th.th_task_state));
4157  }
4158  if (this_thr->th.th_task_state == 1 && always && team->t.t_nproc == 1) {
4159  // fix task state stack to adjust for proxy and helper tasks
4160  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d needs to shift stack"
4161  " for team %d at parity=%d\n",
4162  __kmp_gtid_from_thread(this_thr), team->t.t_id,
4163  this_thr->th.th_task_state));
4164  __kmp_shift_task_state_stack(this_thr, this_thr->th.th_task_state);
4165  }
4166 
4167  // After threads exit the release, they will call sync, and then point to this
4168  // other task_team; make sure it is allocated and properly initialized. As
4169  // threads spin in the barrier release phase, they will continue to use the
4170  // previous task_team struct(above), until they receive the signal to stop
4171  // checking for tasks (they can't safely reference the kmp_team_t struct,
4172  // which could be reallocated by the primary thread). No task teams are formed
4173  // for serialized teams.
4174  if (team->t.t_nproc > 1) {
4175  int other_team = 1 - this_thr->th.th_task_state;
4176  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
4177  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
4178  team->t.t_task_team[other_team] =
4179  __kmp_allocate_task_team(this_thr, team);
4180  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
4181  "task_team %p for team %d at parity=%d\n",
4182  __kmp_gtid_from_thread(this_thr),
4183  team->t.t_task_team[other_team], team->t.t_id, other_team));
4184  } else { // Leave the old task team struct in place for the upcoming region;
4185  // adjust as needed
4186  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
4187  if (!task_team->tt.tt_active ||
4188  team->t.t_nproc != task_team->tt.tt_nproc) {
4189  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
4190  TCW_4(task_team->tt.tt_found_tasks, FALSE);
4191  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4192  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4193  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
4194  team->t.t_nproc);
4195  TCW_4(task_team->tt.tt_active, TRUE);
4196  }
4197  // if team size has changed, the first thread to enable tasking will
4198  // realloc threads_data if necessary
4199  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4200  "%p for team %d at parity=%d\n",
4201  __kmp_gtid_from_thread(this_thr),
4202  team->t.t_task_team[other_team], team->t.t_id, other_team));
4203  }
4204  }
4205 
4206  // For a regular thread, task enabling should be called when the task is
4207  // going to be pushed to a deque. However, for the hidden helper thread, we
4208  // need it ahead of time so that some operations can be performed without
4209  // race conditions.
4210  if (this_thr == __kmp_hidden_helper_main_thread) {
4211  for (int i = 0; i < 2; ++i) {
4212  kmp_task_team_t *task_team = team->t.t_task_team[i];
4213  if (KMP_TASKING_ENABLED(task_team)) {
4214  continue;
4215  }
4216  __kmp_enable_tasking(task_team, this_thr);
4217  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4218  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4219  if (thread_data->td.td_deque == NULL) {
4220  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4221  }
4222  }
4223  }
4224  }
4225 }
4226 
4227 // __kmp_task_team_sync: Propagation of task team data from team to threads
4228 // which happens just after the release phase of a team barrier. This may be
4229 // called by any thread, but only for teams with # threads > 1.
4230 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4231  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4232 
4233  // Toggle the th_task_state field, to switch which task_team this thread
4234  // refers to
4235  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4236 
4237  // It is now safe to propagate the task team pointer from the team struct to
4238  // the current thread.
4239  TCW_PTR(this_thr->th.th_task_team,
4240  team->t.t_task_team[this_thr->th.th_task_state]);
4241  KA_TRACE(20,
4242  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4243  "%p from Team #%d (parity=%d)\n",
4244  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4245  team->t.t_id, this_thr->th.th_task_state));
4246 }
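// Task teams are double-buffered: team->t.t_task_team[] has two slots indexed
// by the thread's one-bit th_task_state, and each barrier flips that bit so
// one slot can drain while the other is being prepared. A minimal sketch of
// the parity scheme (illustrative names, not the runtime's):
//
//   struct team_like { void *task_team[2]; };
//
//   void *sync_to_next(team_like *team, unsigned char *state) {
//     *state = (unsigned char)(1 - *state); // toggle 0 <-> 1 at the barrier
//     return team->task_team[*state];       // adopt the slot prepared for the
//   }                                       // upcoming region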
4247 
4248 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4249 // barrier gather phase. Only called by primary thread if #threads in team > 1
4250 // or if proxy tasks were created.
4251 //
4252  // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned
4253  // off by optionally passing 0 as the last argument. When wait is zero, the
4254  // primary thread does not wait for unfinished_threads to reach 0.
4255 void __kmp_task_team_wait(
4256  kmp_info_t *this_thr,
4257  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4258  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4259 
4260  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4261  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4262 
4263  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4264  if (wait) {
4265  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4266  "(for unfinished_threads to reach 0) on task_team = %p\n",
4267  __kmp_gtid_from_thread(this_thr), task_team));
4268  // Worker threads may have dropped through to release phase, but could
4269  // still be executing tasks. Wait here for tasks to complete. To avoid
4270  // memory contention, only primary thread checks termination condition.
4271  kmp_flag_32<false, false> flag(
4272  RCAST(std::atomic<kmp_uint32> *,
4273  &task_team->tt.tt_unfinished_threads),
4274  0U);
4275  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4276  }
4277  // Deactivate the old task team, so that the worker threads will stop
4278  // referencing it while spinning.
4279  KA_TRACE(
4280  20,
4281  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4282  "setting active to false, setting local and team's pointer to NULL\n",
4283  __kmp_gtid_from_thread(this_thr), task_team));
4284  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
4285  task_team->tt.tt_found_proxy_tasks == TRUE ||
4286  task_team->tt.tt_hidden_helper_task_encountered == TRUE);
4287  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4288  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4289  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4290  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4291  KMP_MB();
4292 
4293  TCW_PTR(this_thr->th.th_task_team, NULL);
4294  }
4295 }
4296 
4297 // __kmp_tasking_barrier:
4298 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4299 // Internal function to execute all tasks prior to a regular barrier or a join
4300 // barrier. It is a full barrier itself, which unfortunately turns regular
4301 // barriers into double barriers and join barriers into 1 1/2 barriers.
4302 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4303  std::atomic<kmp_uint32> *spin = RCAST(
4304  std::atomic<kmp_uint32> *,
4305  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4306  int flag = FALSE;
4307  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4308 
4309 #if USE_ITT_BUILD
4310  KMP_FSYNC_SPIN_INIT(spin, NULL);
4311 #endif /* USE_ITT_BUILD */
4312  kmp_flag_32<false, false> spin_flag(spin, 0U);
4313  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4314  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4315 #if USE_ITT_BUILD
4316  // TODO: What about itt_sync_obj??
4317  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4318 #endif /* USE_ITT_BUILD */
4319 
4320  if (TCR_4(__kmp_global.g.g_done)) {
4321  if (__kmp_global.g.g_abort)
4322  __kmp_abort_thread();
4323  break;
4324  }
4325  KMP_YIELD(TRUE);
4326  }
4327 #if USE_ITT_BUILD
4328  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4329 #endif /* USE_ITT_BUILD */
4330 }
4331 
4332 // __kmp_give_task puts a task into a given thread queue if:
4333 // - the queue for that thread was created
4334 // - there's space in that queue
4335 // Because of this, __kmp_push_task needs to check if there's space after
4336 // getting the lock
4337 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4338  kmp_int32 pass) {
4339  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4340  kmp_task_team_t *task_team = taskdata->td_task_team;
4341 
4342  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4343  taskdata, tid));
4344 
4345  // If task_team is NULL, something has gone badly wrong...
4346  KMP_DEBUG_ASSERT(task_team != NULL);
4347 
4348  bool result = false;
4349  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4350 
4351  if (thread_data->td.td_deque == NULL) {
4352  // There's no queue in this thread, go find another one
4353  // We're guaranteed that at least one thread has a queue
4354  KA_TRACE(30,
4355  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4356  tid, taskdata));
4357  return result;
4358  }
4359 
4360  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4361  TASK_DEQUE_SIZE(thread_data->td)) {
4362  KA_TRACE(
4363  30,
4364  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4365  taskdata, tid));
4366 
4367  // if this deque is bigger than the pass ratio give a chance to another
4368  // thread
4369  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4370  return result;
4371 
4372  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4373  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4374  TASK_DEQUE_SIZE(thread_data->td)) {
4375  // expand deque to push the task which is not allowed to execute
4376  __kmp_realloc_task_deque(thread, thread_data);
4377  }
4378 
4379  } else {
4380 
4381  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4382 
4383  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4384  TASK_DEQUE_SIZE(thread_data->td)) {
4385  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4386  "thread %d.\n",
4387  taskdata, tid));
4388 
4389  // if this deque is bigger than the pass ratio give a chance to another
4390  // thread
4391  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4392  goto release_and_exit;
4393 
4394  __kmp_realloc_task_deque(thread, thread_data);
4395  }
4396  }
4397 
4398  // lock is held here, and there is space in the deque
4399 
4400  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4401  // Wrap index.
4402  thread_data->td.td_deque_tail =
4403  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4404  TCW_4(thread_data->td.td_deque_ntasks,
4405  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4406 
4407  result = true;
4408  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4409  taskdata, tid));
4410 
4411 release_and_exit:
4412  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4413 
4414  return result;
4415 }
4416 
4417 #define PROXY_TASK_FLAG 0x40000000
4418 /* The finish of the proxy tasks is divided into two pieces:
4419  - the top half is the one that can be done from a thread outside the team
4420  - the bottom half must be run from a thread within the team
4421 
4422  In order to run the bottom half, the task gets queued back into one of the
4423  threads of the team. Once the td_incomplete_child_tasks counter of the parent
4424  is decremented, the threads can leave the barriers. So, the bottom half needs
4425  to be queued before the counter is decremented. The top half is therefore
4426  divided into two parts:
4427  - things that can be run before queuing the bottom half
4428  - things that must be run after queuing the bottom half
4429 
4430  This creates a second race, as the bottom half can free the task before the
4431  second top half is executed. To avoid this, we use the
4432  td_incomplete_child_tasks counter of the proxy task to synchronize the top
4433  and bottom halves. */
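// Resulting ordering for an out-of-team completion (illustrative; see
// __kmpc_proxy_task_completed_ooo below). PROXY_TASK_FLAG acts as the
// "imaginary child" that keeps the bottom half from freeing the task early:
//
//   first top half:  mark the task complete, decrement the taskgroup count,
//                    OR PROXY_TASK_FLAG into td_incomplete_child_tasks
//   give task:       queue the bottom half into some team thread's deque
//   second top half: decrement the parent's td_incomplete_child_tasks,
//                    AND PROXY_TASK_FLAG back out
//   bottom half:     spin until PROXY_TASK_FLAG is cleared, then release
//                    dependences and free the task and its ancestors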
4434 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4435  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4436  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4437  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4438  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4439 
4440  taskdata->td_flags.complete = 1; // mark the task as completed
4441 #if OMPX_TASKGRAPH
4442  taskdata->td_flags.onced = 1;
4443 #endif
4444 
4445  if (taskdata->td_taskgroup)
4446  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4447 
4448  // Create an imaginary child for this task so the bottom half cannot
4449  // release the task before we have completed the second top half
4450  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4451 }
4452 
4453 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4454 #if KMP_DEBUG
4455  kmp_int32 children = 0;
4456  // Predecrement simulated by "- 1" calculation
4457  children = -1 +
4458 #endif
4459  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4460  KMP_DEBUG_ASSERT(children >= 0);
4461 
4462  // Remove the imaginary child
4463  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4464 }
4465 
4466 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4467  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4468  kmp_info_t *thread = __kmp_threads[gtid];
4469 
4470  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4471  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4472  1); // top half must run before bottom half
4473 
4474  // We need to wait to make sure the top half is finished
4475  // Spinning here should be ok as this should happen quickly
4476  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4477  PROXY_TASK_FLAG) > 0)
4478  ;
4479 
4480  __kmp_release_deps(gtid, taskdata);
4481  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4482 }
4483 
4492 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4493  KMP_DEBUG_ASSERT(ptask != NULL);
4494  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4495  KA_TRACE(
4496  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4497  gtid, taskdata));
4498  __kmp_assert_valid_gtid(gtid);
4499  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4500 
4501  __kmp_first_top_half_finish_proxy(taskdata);
4502  __kmp_second_top_half_finish_proxy(taskdata);
4503  __kmp_bottom_half_finish_proxy(gtid, ptask);
4504 
4505  KA_TRACE(10,
4506  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4507  gtid, taskdata));
4508 }
4509 
4510 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4511  KMP_DEBUG_ASSERT(ptask != NULL);
4512  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4513 
4514  // Enqueue task to complete bottom half completion from a thread within the
4515  // corresponding team
4516  kmp_team_t *team = taskdata->td_team;
4517  kmp_int32 nthreads = team->t.t_nproc;
4518  kmp_info_t *thread;
4519 
4520  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4521  // but we cannot use __kmp_get_random here
4522  kmp_int32 start_k = start % nthreads;
4523  kmp_int32 pass = 1;
4524  kmp_int32 k = start_k;
4525 
4526  do {
4527  // For now we're just linearly trying to find a thread
4528  thread = team->t.t_threads[k];
4529  k = (k + 1) % nthreads;
4530 
4531  // we did a full pass through all the threads
4532  if (k == start_k)
4533  pass = pass << 1;
4534 
4535  } while (!__kmp_give_task(thread, k, ptask, pass));
4536 
4537  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4538  // awake at least one thread to execute given task
4539  for (int i = 0; i < nthreads; ++i) {
4540  thread = team->t.t_threads[i];
4541  if (thread->th.th_sleep_loc != NULL) {
4542  __kmp_null_resume_wrapper(thread);
4543  break;
4544  }
4545  }
4546  }
4547 }
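// Worked example of the "pass" heuristic (illustrative): pass starts at 1 and
// doubles after every full sweep over the team. __kmp_give_task only grows a
// full deque when TASK_DEQUE_SIZE(td) / INITIAL_TASK_DEQUE_SIZE < pass, so the
// first sweep may grow only deques still at their initial size, the second
// sweep also deques up to twice that size, and so on. This spreads bottom
// halves across threads before repeatedly enlarging any single deque.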
4548 
4556 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4557  KMP_DEBUG_ASSERT(ptask != NULL);
4558  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4559 
4560  KA_TRACE(
4561  10,
4562  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4563  taskdata));
4564 
4565  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4566 
4567  __kmp_first_top_half_finish_proxy(taskdata);
4568 
4569  __kmpc_give_task(ptask);
4570 
4571  __kmp_second_top_half_finish_proxy(taskdata);
4572 
4573  KA_TRACE(
4574  10,
4575  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4576  taskdata));
4577 }
4578 
4579 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4580  kmp_task_t *task) {
4581  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4582  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4583  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4584  td->td_allow_completion_event.ed.task = task;
4585  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4586  }
4587  return &td->td_allow_completion_event;
4588 }
4589 
4590 void __kmp_fulfill_event(kmp_event_t *event) {
4591  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4592  kmp_task_t *ptask = event->ed.task;
4593  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4594  bool detached = false;
4595  int gtid = __kmp_get_gtid();
4596 
4597  // The associated task might have completed or could be completing at this
4598  // point.
4599  // We need to take the lock to avoid races
4600  __kmp_acquire_tas_lock(&event->lock, gtid);
4601  if (taskdata->td_flags.proxy == TASK_PROXY) {
4602  detached = true;
4603  } else {
4604 #if OMPT_SUPPORT
4605  // The OMPT event must occur under mutual exclusion,
4606  // otherwise the tool might access ptask after free
4607  if (UNLIKELY(ompt_enabled.enabled))
4608  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4609 #endif
4610  }
4611  event->type = KMP_EVENT_UNINITIALIZED;
4612  __kmp_release_tas_lock(&event->lock, gtid);
4613 
4614  if (detached) {
4615 #if OMPT_SUPPORT
4616  // We free ptask afterwards and know the task is finished,
4617  // so locking is not necessary
4618  if (UNLIKELY(ompt_enabled.enabled))
4619  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4620 #endif
4621  // If the task detached complete the proxy task
4622  if (gtid >= 0) {
4623  kmp_team_t *team = taskdata->td_team;
4624  kmp_info_t *thread = __kmp_get_thread();
4625  if (thread->th.th_team == team) {
4626  __kmpc_proxy_task_completed(gtid, ptask);
4627  return;
4628  }
4629  }
4630 
4631  // fallback
4632  __kmpc_proxy_task_completed_ooo(ptask);
4633  }
4634  }
4635 }
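// User-level view of this path (illustrative sketch; the exact lowering is
// compiler-dependent): a task created with the detach clause obtains its event
// through __kmpc_task_allow_completion_event above, and omp_fulfill_event
// eventually reaches __kmp_fulfill_event. The names producer and
// on_async_completion below are hypothetical:
//
//   #include <omp.h>
//
//   omp_event_handle_t g_evt; // written when the detached task is created
//
//   void producer(void) {
//   #pragma omp task detach(g_evt)
//     {
//       // start asynchronous work here; do not block waiting for it
//     }
//   }
//
//   void on_async_completion(void) { // invoked when the async work finishes
//     omp_fulfill_event(g_evt);      // lets the detached task complete
//   }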
4636 
4637 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4638 // for taskloop
4639 //
4640 // thread: allocating thread
4641 // task_src: pointer to source task to be duplicated
4642 // taskloop_recur: used only when dealing with taskgraph,
4643 // indicating whether we need to update task->td_task_id
4644 // returns: a pointer to the allocated kmp_task_t structure (task).
4645 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4646 #if OMPX_TASKGRAPH
4647  , int taskloop_recur
4648 #endif
4649 ) {
4650  kmp_task_t *task;
4651  kmp_taskdata_t *taskdata;
4652  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4653  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4654  size_t shareds_offset;
4655  size_t task_size;
4656 
4657  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4658  task_src));
4659  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4660  TASK_FULL); // it should not be proxy task
4661  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4662  task_size = taskdata_src->td_size_alloc;
4663 
4664  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4665  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4666  task_size));
4667 #if USE_FAST_MEMORY
4668  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4669 #else
4670  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4671 #endif /* USE_FAST_MEMORY */
4672  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4673 
4674  task = KMP_TASKDATA_TO_TASK(taskdata);
4675 
4676  // Initialize new task (only specific fields not affected by memcpy)
4677 #if OMPX_TASKGRAPH
4678  if (!taskdata->is_taskgraph || taskloop_recur)
4679  taskdata->td_task_id = KMP_GEN_TASK_ID();
4680  else if (taskdata->is_taskgraph &&
4681  __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4682  taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4683 #else
4684  taskdata->td_task_id = KMP_GEN_TASK_ID();
4685 #endif
4686  if (task->shareds != NULL) { // need setup shareds pointer
4687  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4688  task->shareds = &((char *)taskdata)[shareds_offset];
4689  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4690  0);
4691  }
4692  taskdata->td_alloc_thread = thread;
4693  taskdata->td_parent = parent_task;
4694  // task inherits the taskgroup from the parent task
4695  taskdata->td_taskgroup = parent_task->td_taskgroup;
4696  // tied task needs to initialize the td_last_tied at creation,
4697  // untied one does this when it is scheduled for execution
4698  if (taskdata->td_flags.tiedness == TASK_TIED)
4699  taskdata->td_last_tied = taskdata;
4700 
4701  // Only need to keep track of child task counts if team parallel and tasking
4702  // not serialized
4703  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4704  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4705  if (parent_task->td_taskgroup)
4706  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4707  // Only need to keep track of allocated child tasks for explicit tasks since
4708  // implicit tasks are not deallocated
4709  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4710  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4711  }
4712 
4713  KA_TRACE(20,
4714  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4715  thread, taskdata, taskdata->td_parent));
4716 #if OMPT_SUPPORT
4717  if (UNLIKELY(ompt_enabled.enabled))
4718  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4719 #endif
4720  return task;
4721 }
4722 
4723 // Routine optionally generated by the compiler for setting the lastprivate flag
4724 // and calling needed constructors for private/firstprivate objects
4725 // (used to form taskloop tasks from pattern task)
4726 // Parameters: dest task, src task, lastprivate flag.
4727 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4728 
4729 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4730 
4731 // class to encapsulate manipulating loop bounds in a taskloop task.
4732 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4733 // the loop bound variables.
4734 class kmp_taskloop_bounds_t {
4735  kmp_task_t *task;
4736  const kmp_taskdata_t *taskdata;
4737  size_t lower_offset;
4738  size_t upper_offset;
4739 
4740 public:
4741  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4742  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4743  lower_offset((char *)lb - (char *)task),
4744  upper_offset((char *)ub - (char *)task) {
4745  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4746  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4747  }
4748  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4749  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4750  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4751  size_t get_lower_offset() const { return lower_offset; }
4752  size_t get_upper_offset() const { return upper_offset; }
4753  kmp_uint64 get_lb() const {
4754  kmp_int64 retval;
4755 #if defined(KMP_GOMP_COMPAT)
4756  // Intel task just returns the lower bound normally
4757  if (!taskdata->td_flags.native) {
4758  retval = *(kmp_int64 *)((char *)task + lower_offset);
4759  } else {
4760  // GOMP task has to take into account the sizeof(long)
4761  if (taskdata->td_size_loop_bounds == 4) {
4762  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4763  retval = (kmp_int64)*lb;
4764  } else {
4765  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4766  retval = (kmp_int64)*lb;
4767  }
4768  }
4769 #else
4770  (void)taskdata;
4771  retval = *(kmp_int64 *)((char *)task + lower_offset);
4772 #endif // defined(KMP_GOMP_COMPAT)
4773  return retval;
4774  }
4775  kmp_uint64 get_ub() const {
4776  kmp_int64 retval;
4777 #if defined(KMP_GOMP_COMPAT)
4778  // Intel task just returns the upper bound normally
4779  if (!taskdata->td_flags.native) {
4780  retval = *(kmp_int64 *)((char *)task + upper_offset);
4781  } else {
4782  // GOMP task has to take into account the sizeof(long)
4783  if (taskdata->td_size_loop_bounds == 4) {
4784  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4785  retval = (kmp_int64)*ub;
4786  } else {
4787  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4788  retval = (kmp_int64)*ub;
4789  }
4790  }
4791 #else
4792  retval = *(kmp_int64 *)((char *)task + upper_offset);
4793 #endif // defined(KMP_GOMP_COMPAT)
4794  return retval;
4795  }
4796  void set_lb(kmp_uint64 lb) {
4797 #if defined(KMP_GOMP_COMPAT)
4798  // Intel task just sets the lower bound normally
4799  if (!taskdata->td_flags.native) {
4800  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4801  } else {
4802  // GOMP task has to take into account the sizeof(long)
4803  if (taskdata->td_size_loop_bounds == 4) {
4804  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4805  *lower = (kmp_uint32)lb;
4806  } else {
4807  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4808  *lower = (kmp_uint64)lb;
4809  }
4810  }
4811 #else
4812  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4813 #endif // defined(KMP_GOMP_COMPAT)
4814  }
4815  void set_ub(kmp_uint64 ub) {
4816 #if defined(KMP_GOMP_COMPAT)
4817  // Intel task just sets the upper bound normally
4818  if (!taskdata->td_flags.native) {
4819  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4820  } else {
4821  // GOMP task has to take into account the sizeof(long)
4822  if (taskdata->td_size_loop_bounds == 4) {
4823  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4824  *upper = (kmp_uint32)ub;
4825  } else {
4826  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4827  *upper = (kmp_uint64)ub;
4828  }
4829  }
4830 #else
4831  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4832 #endif // defined(KMP_GOMP_COMPAT)
4833  }
4834 };
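// Summary of the two layouts handled above (derived from the accessors):
//
//   Intel/clang tasks (td_flags.native == 0): lb and ub live at fixed byte
//   offsets inside the task structure and are always accessed as 64-bit
//   values at task + lower_offset and task + upper_offset.
//
//   GOMP tasks (td_flags.native == 1): lb and ub are the first two elements
//   of task->shareds and have the host's sizeof(long), so
//   td_size_loop_bounds selects between 32-bit and 64-bit accesses.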
4835 
4836 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4837 //
4838 // loc Source location information
4839 // gtid Global thread ID
4840 // task Pattern task, exposes the loop iteration range
4841 // lb Pointer to loop lower bound in task structure
4842 // ub Pointer to loop upper bound in task structure
4843 // st Loop stride
4844 // ub_glob Global upper bound (used for lastprivate check)
4845 // num_tasks Number of tasks to execute
4846 // grainsize Number of loop iterations per task
4847 // extras Number of chunks with grainsize+1 iterations
4848 // last_chunk Reduction of grainsize for last task
4849 // tc Iterations count
4850 // task_dup Tasks duplication routine
4851 // codeptr_ra Return address for OMPT events
4852 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4853  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4854  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4855  kmp_uint64 grainsize, kmp_uint64 extras,
4856  kmp_int64 last_chunk, kmp_uint64 tc,
4857 #if OMPT_SUPPORT
4858  void *codeptr_ra,
4859 #endif
4860  void *task_dup) {
4861  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4862  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4863  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4864  // compiler provides global bounds here
4865  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4866  kmp_uint64 lower = task_bounds.get_lb();
4867  kmp_uint64 upper = task_bounds.get_ub();
4868  kmp_uint64 i;
4869  kmp_info_t *thread = __kmp_threads[gtid];
4870  kmp_taskdata_t *current_task = thread->th.th_current_task;
4871  kmp_task_t *next_task;
4872  kmp_int32 lastpriv = 0;
4873 
4874  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4875  (last_chunk < 0 ? last_chunk : extras));
4876  KMP_DEBUG_ASSERT(num_tasks > extras);
4877  KMP_DEBUG_ASSERT(num_tasks > 0);
4878  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4879  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4880  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4881  ub_glob, st, task_dup));
4882 
4883  // Launch num_tasks tasks, assigning grainsize iterations to each task
4884  for (i = 0; i < num_tasks; ++i) {
4885  kmp_uint64 chunk_minus_1;
4886  if (extras == 0) {
4887  chunk_minus_1 = grainsize - 1;
4888  } else {
4889  chunk_minus_1 = grainsize;
4890  --extras; // first extras iterations get bigger chunk (grainsize+1)
4891  }
4892  upper = lower + st * chunk_minus_1;
4893  if (upper > *ub) {
4894  upper = *ub;
4895  }
4896  if (i == num_tasks - 1) {
4897  // schedule the last task, set lastprivate flag if needed
4898  if (st == 1) { // most common case
4899  KMP_DEBUG_ASSERT(upper == *ub);
4900  if (upper == ub_glob)
4901  lastpriv = 1;
4902  } else if (st > 0) { // positive loop stride
4903  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4904  if ((kmp_uint64)st > ub_glob - upper)
4905  lastpriv = 1;
4906  } else { // negative loop stride
4907  KMP_DEBUG_ASSERT(upper + st < *ub);
4908  if (upper - ub_glob < (kmp_uint64)(-st))
4909  lastpriv = 1;
4910  }
4911  }
4912 
4913 #if OMPX_TASKGRAPH
4914  next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4915 #else
4916  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4917 #endif
4918 
4919  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4920  kmp_taskloop_bounds_t next_task_bounds =
4921  kmp_taskloop_bounds_t(next_task, task_bounds);
4922 
4923  // adjust task-specific bounds
4924  next_task_bounds.set_lb(lower);
4925  if (next_taskdata->td_flags.native) {
4926  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4927  } else {
4928  next_task_bounds.set_ub(upper);
4929  }
4930  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4931  // etc.
4932  ptask_dup(next_task, task, lastpriv);
4933  KA_TRACE(40,
4934  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4935  "upper %lld stride %lld, (offsets %p %p)\n",
4936  gtid, i, next_task, lower, upper, st,
4937  next_task_bounds.get_lower_offset(),
4938  next_task_bounds.get_upper_offset()));
4939 #if OMPT_SUPPORT
4940  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4941  codeptr_ra); // schedule new task
4942 #if OMPT_OPTIONAL
4943  if (ompt_enabled.ompt_callback_dispatch) {
4944  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4945  lower, upper, st);
4946  }
4947 #endif // OMPT_OPTIONAL
4948 #else
4949  __kmp_omp_task(gtid, next_task, true); // schedule new task
4950 #endif
4951  lower = upper + st; // adjust lower bound for the next iteration
4952  }
4953  // free the pattern task and exit
4954  __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
4955  // do not execute the pattern task, just do internal bookkeeping
4956  __kmp_task_finish<false>(gtid, task, current_task);
4957 }
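// Worked example of the chunk distribution above (illustrative): with tc = 10
// iterations split into num_tasks = 4, the caller passes
// grainsize = 10 / 4 = 2 and extras = 10 % 4 = 2, so the generated tasks cover
// 3, 3, 2 and 2 iterations - the first `extras` tasks take grainsize + 1.
// With the strict grainsize modifier the caller instead passes a negative
// last_chunk and only the final task is shortened.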
4958 
4959 // Structure to keep taskloop parameters for auxiliary task
4960 // kept in the shareds of the task structure.
4961 typedef struct __taskloop_params {
4962  kmp_task_t *task;
4963  kmp_uint64 *lb;
4964  kmp_uint64 *ub;
4965  void *task_dup;
4966  kmp_int64 st;
4967  kmp_uint64 ub_glob;
4968  kmp_uint64 num_tasks;
4969  kmp_uint64 grainsize;
4970  kmp_uint64 extras;
4971  kmp_int64 last_chunk;
4972  kmp_uint64 tc;
4973  kmp_uint64 num_t_min;
4974 #if OMPT_SUPPORT
4975  void *codeptr_ra;
4976 #endif
4977 } __taskloop_params_t;
4978 
4979 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4980  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4981  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4982  kmp_uint64,
4983 #if OMPT_SUPPORT
4984  void *,
4985 #endif
4986  void *);
4987 
4988 // Execute part of the taskloop submitted as a task.
4989 int __kmp_taskloop_task(int gtid, void *ptask) {
4990  __taskloop_params_t *p =
4991  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4992  kmp_task_t *task = p->task;
4993  kmp_uint64 *lb = p->lb;
4994  kmp_uint64 *ub = p->ub;
4995  void *task_dup = p->task_dup;
4996  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4997  kmp_int64 st = p->st;
4998  kmp_uint64 ub_glob = p->ub_glob;
4999  kmp_uint64 num_tasks = p->num_tasks;
5000  kmp_uint64 grainsize = p->grainsize;
5001  kmp_uint64 extras = p->extras;
5002  kmp_int64 last_chunk = p->last_chunk;
5003  kmp_uint64 tc = p->tc;
5004  kmp_uint64 num_t_min = p->num_t_min;
5005 #if OMPT_SUPPORT
5006  void *codeptr_ra = p->codeptr_ra;
5007 #endif
5008 #if KMP_DEBUG
5009  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5010  KMP_DEBUG_ASSERT(task != NULL);
5011  KA_TRACE(20,
5012  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
5013  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5014  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5015  st, task_dup));
5016 #endif
5017  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
5018  if (num_tasks > num_t_min)
5019  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5020  grainsize, extras, last_chunk, tc, num_t_min,
5021 #if OMPT_SUPPORT
5022  codeptr_ra,
5023 #endif
5024  task_dup);
5025  else
5026  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5027  grainsize, extras, last_chunk, tc,
5028 #if OMPT_SUPPORT
5029  codeptr_ra,
5030 #endif
5031  task_dup);
5032 
5033  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
5034  return 0;
5035 }
5036 
5037 // Schedule part of the taskloop as a task,
5038 // execute the rest of the taskloop.
5039 //
5040 // loc Source location information
5041 // gtid Global thread ID
5042 // task Pattern task, exposes the loop iteration range
5043 // lb Pointer to loop lower bound in task structure
5044 // ub Pointer to loop upper bound in task structure
5045 // st Loop stride
5046 // ub_glob Global upper bound (used for lastprivate check)
5047 // num_tasks Number of tasks to execute
5048 // grainsize Number of loop iterations per task
5049 // extras Number of chunks with grainsize+1 iterations
5050 // last_chunk Reduction of grainsize for last task
5051 // tc Iterations count
5052 // num_t_min Threshold to launch tasks recursively
5053 // task_dup Tasks duplication routine
5054 // codeptr_ra Return address for OMPT events
5055 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
5056  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5057  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
5058  kmp_uint64 grainsize, kmp_uint64 extras,
5059  kmp_int64 last_chunk, kmp_uint64 tc,
5060  kmp_uint64 num_t_min,
5061 #if OMPT_SUPPORT
5062  void *codeptr_ra,
5063 #endif
5064  void *task_dup) {
5065  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5066  KMP_DEBUG_ASSERT(task != NULL);
5067  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
5068  KA_TRACE(20,
5069  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
5070  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5071  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5072  st, task_dup));
5073  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5074  kmp_uint64 lower = *lb;
5075  kmp_info_t *thread = __kmp_threads[gtid];
5076  // kmp_taskdata_t *current_task = thread->th.th_current_task;
5077  kmp_task_t *next_task;
5078  size_t lower_offset =
5079  (char *)lb - (char *)task; // remember offset of lb in the task structure
5080  size_t upper_offset =
5081  (char *)ub - (char *)task; // remember offset of ub in the task structure
5082 
5083  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5084  (last_chunk < 0 ? last_chunk : extras));
5085  KMP_DEBUG_ASSERT(num_tasks > extras);
5086  KMP_DEBUG_ASSERT(num_tasks > 0);
5087 
5088  // split the loop in two halves
5089  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
5090  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
5091  kmp_uint64 gr_size0 = grainsize;
5092  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
5093  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
5094  if (last_chunk < 0) {
5095  ext0 = ext1 = 0;
5096  last_chunk1 = last_chunk;
5097  tc0 = grainsize * n_tsk0;
5098  tc1 = tc - tc0;
5099  } else if (n_tsk0 <= extras) {
5100  gr_size0++; // integrate extras into grainsize
5101  ext0 = 0; // no extra iters in 1st half
5102  ext1 = extras - n_tsk0; // remaining extras
5103  tc0 = gr_size0 * n_tsk0;
5104  tc1 = tc - tc0;
5105  } else { // n_tsk0 > extras
5106  ext1 = 0; // no extra iters in 2nd half
5107  ext0 = extras;
5108  tc1 = grainsize * n_tsk1;
5109  tc0 = tc - tc1;
5110  }
5111  ub0 = lower + st * (tc0 - 1);
5112  lb1 = ub0 + st;
5113 
5114  // create pattern task for 2nd half of the loop
5115 #if OMPX_TASKGRAPH
5116  next_task = __kmp_task_dup_alloc(thread, task,
5117  /* taskloop_recur */ 1);
5118 #else
5119  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
5120 #endif
5121  // adjust lower bound (upper bound is not changed) for the 2nd half
5122  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
5123  if (ptask_dup != NULL) // construct firstprivates, etc.
5124  ptask_dup(next_task, task, 0);
5125  *ub = ub0; // adjust upper bound for the 1st half
5126 
5127  // create auxiliary task for 2nd half of the loop
5128  // make sure new task has same parent task as the pattern task
5129  kmp_taskdata_t *current_task = thread->th.th_current_task;
5130  thread->th.th_current_task = taskdata->td_parent;
5131  kmp_task_t *new_task =
5132  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
5133  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
5134  // restore current task
5135  thread->th.th_current_task = current_task;
5136  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
5137  p->task = next_task;
5138  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
5139  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
5140  p->task_dup = task_dup;
5141  p->st = st;
5142  p->ub_glob = ub_glob;
5143  p->num_tasks = n_tsk1;
5144  p->grainsize = grainsize;
5145  p->extras = ext1;
5146  p->last_chunk = last_chunk1;
5147  p->tc = tc1;
5148  p->num_t_min = num_t_min;
5149 #if OMPT_SUPPORT
5150  p->codeptr_ra = codeptr_ra;
5151 #endif
5152 
5153 #if OMPX_TASKGRAPH
5154  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
5155  new_task_data->tdg = taskdata->tdg;
5156  new_task_data->is_taskgraph = 0;
5157 #endif
5158 
5159 #if OMPT_SUPPORT
5160  // schedule new task with correct return address for OMPT events
5161  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
5162 #else
5163  __kmp_omp_task(gtid, new_task, true); // schedule new task
5164 #endif
5165 
5166  // execute the 1st half of current subrange
5167  if (n_tsk0 > num_t_min)
5168  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
5169  ext0, last_chunk0, tc0, num_t_min,
5170 #if OMPT_SUPPORT
5171  codeptr_ra,
5172 #endif
5173  task_dup);
5174  else
5175  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
5176  gr_size0, ext0, last_chunk0, tc0,
5177 #if OMPT_SUPPORT
5178  codeptr_ra,
5179 #endif
5180  task_dup);
5181 
5182  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
5183 }
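// Worked example of the split above (illustrative): num_tasks = 5,
// grainsize = 2, extras = 3 (tc = 13) gives n_tsk0 = 2 and n_tsk1 = 3. Since
// n_tsk0 <= extras, the first half absorbs its extras into the grainsize
// (gr_size0 = 3, ext0 = 0, tc0 = 6) and the second half keeps the remaining
// extras (ext1 = 1, tc1 = 7). The second half is packaged into the auxiliary
// task and scheduled, while the first half is processed right away, recursing
// again as long as its task count exceeds num_t_min.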
5184 
5185 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5186  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5187  int nogroup, int sched, kmp_uint64 grainsize,
5188  int modifier, void *task_dup) {
5189  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5190  KMP_DEBUG_ASSERT(task != NULL);
5191  if (nogroup == 0) {
5192 #if OMPT_SUPPORT && OMPT_OPTIONAL
5193  OMPT_STORE_RETURN_ADDRESS(gtid);
5194 #endif
5195  __kmpc_taskgroup(loc, gtid);
5196  }
5197 
5198 #if OMPX_TASKGRAPH
5199  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
5200 #endif
5201  // =========================================================================
5202  // calculate loop parameters
5203  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5204  kmp_uint64 tc;
5205  // compiler provides global bounds here
5206  kmp_uint64 lower = task_bounds.get_lb();
5207  kmp_uint64 upper = task_bounds.get_ub();
5208  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5209  kmp_uint64 num_tasks = 0, extras = 0;
5210  kmp_int64 last_chunk =
5211  0; // reduce grainsize of last task by last_chunk in strict mode
5212  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5213  kmp_info_t *thread = __kmp_threads[gtid];
5214  kmp_taskdata_t *current_task = thread->th.th_current_task;
5215 
5216  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5217  "grain %llu(%d, %d), dup %p\n",
5218  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5219  task_dup));
5220 
5221  // compute trip count
5222  if (st == 1) { // most common case
5223  tc = upper - lower + 1;
5224  } else if (st < 0) {
5225  tc = (lower - upper) / (-st) + 1;
5226  } else { // st > 0
5227  tc = (upper - lower) / st + 1;
5228  }
5229  if (tc == 0) {
5230  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5231  // free the pattern task and exit
5232  __kmp_task_start(gtid, task, current_task);
5233  // do not execute anything for zero-trip loop
5234  __kmp_task_finish<false>(gtid, task, current_task);
5235  return;
5236  }
5237 
5238 #if OMPT_SUPPORT && OMPT_OPTIONAL
5239  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5240  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5241  if (ompt_enabled.ompt_callback_work) {
5242  ompt_callbacks.ompt_callback(ompt_callback_work)(
5243  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5244  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5245  }
5246 #endif
5247 
5248  if (num_tasks_min == 0)
5249  // TODO: can we choose a better default heuristic?
5250  num_tasks_min =
5251  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5252 
5253  // compute num_tasks/grainsize based on the input provided
5254  switch (sched) {
5255  case 0: // no schedule clause specified, we can choose the default
5256  // let's try to schedule (team_size*10) tasks
5257  grainsize = thread->th.th_team_nproc * 10;
5258  KMP_FALLTHROUGH();
5259  case 2: // num_tasks provided
5260  if (grainsize > tc) {
5261  num_tasks = tc; // too big num_tasks requested, adjust values
5262  grainsize = 1;
5263  extras = 0;
5264  } else {
5265  num_tasks = grainsize;
5266  grainsize = tc / num_tasks;
5267  extras = tc % num_tasks;
5268  }
5269  break;
5270  case 1: // grainsize provided
5271  if (grainsize > tc) {
5272  num_tasks = 1;
5273  grainsize = tc; // too big grainsize requested, adjust values
5274  extras = 0;
5275  } else {
5276  if (modifier) {
5277  num_tasks = (tc + grainsize - 1) / grainsize;
5278  last_chunk = tc - (num_tasks * grainsize);
5279  extras = 0;
5280  } else {
5281  num_tasks = tc / grainsize;
5282  // adjust grainsize for balanced distribution of iterations
5283  grainsize = tc / num_tasks;
5284  extras = tc % num_tasks;
5285  }
5286  }
5287  break;
5288  default:
5289  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5290  }
5291 
5292  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5293  (last_chunk < 0 ? last_chunk : extras));
5294  KMP_DEBUG_ASSERT(num_tasks > extras);
5295  KMP_DEBUG_ASSERT(num_tasks > 0);
5296  // =========================================================================
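// Worked examples for the switch above (illustrative):
//   num_tasks(4) on 10 iterations (sched == 2): num_tasks = 4,
//     grainsize = 10 / 4 = 2, extras = 10 % 4 = 2.
//   grainsize(4) on 10 iterations (sched == 1): num_tasks = 10 / 4 = 2,
//     grainsize rebalanced to 10 / 2 = 5, extras = 0.
//   grainsize(4, strict) on 10 iterations (modifier != 0): num_tasks = 3,
//     grainsize stays 4, last_chunk = 10 - 12 = -2, extras = 0.
//   no clause (sched == 0): start from grainsize = nthreads * 10 and fall
//     through to the num_tasks case.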
5297 
5298  // check the value of the if clause first
5299  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5300  if (if_val == 0) { // if(0) specified, mark task as serial
5301  taskdata->td_flags.task_serial = 1;
5302  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5303  // always start serial tasks linearly
5304  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5305  grainsize, extras, last_chunk, tc,
5306 #if OMPT_SUPPORT
5307  OMPT_GET_RETURN_ADDRESS(0),
5308 #endif
5309  task_dup);
5310  // !taskdata->td_flags.native => currently force linear spawning of tasks
5311  // for GOMP_taskloop
5312  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5313  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5314  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5315  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5316  last_chunk));
5317  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5318  grainsize, extras, last_chunk, tc, num_tasks_min,
5319 #if OMPT_SUPPORT
5320  OMPT_GET_RETURN_ADDRESS(0),
5321 #endif
5322  task_dup);
5323  } else {
5324  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5325  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5326  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5327  last_chunk));
5328  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5329  grainsize, extras, last_chunk, tc,
5330 #if OMPT_SUPPORT
5331  OMPT_GET_RETURN_ADDRESS(0),
5332 #endif
5333  task_dup);
5334  }
5335 
5336 #if OMPT_SUPPORT && OMPT_OPTIONAL
5337  if (ompt_enabled.ompt_callback_work) {
5338  ompt_callbacks.ompt_callback(ompt_callback_work)(
5339  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5340  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5341  }
5342 #endif
5343 
5344  if (nogroup == 0) {
5345 #if OMPT_SUPPORT && OMPT_OPTIONAL
5346  OMPT_STORE_RETURN_ADDRESS(gtid);
5347 #endif
5348  __kmpc_end_taskgroup(loc, gtid);
5349  }
5350  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5351 }
5352 
5369 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5370  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5371  int sched, kmp_uint64 grainsize, void *task_dup) {
5372  __kmp_assert_valid_gtid(gtid);
5373  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5374  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5375  0, task_dup);
5376  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5377 }
5378 
5396 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5397  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5398  int nogroup, int sched, kmp_uint64 grainsize,
5399  int modifier, void *task_dup) {
5400  __kmp_assert_valid_gtid(gtid);
5401  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5402  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5403  modifier, task_dup);
5404  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5405 }
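// Illustrative user-level code that reaches these entry points (the exact
// lowering is compiler-dependent): a taskloop with a grainsize clause is
// lowered to a pattern task plus a call with sched == 1, a num_tasks clause
// maps to sched == 2, and the strict modifier selects __kmpc_taskloop_5 with
// modifier != 0:
//
//   void saxpy(int n, float a, float *x, float *y) {
//   #pragma omp taskloop grainsize(64)
//     for (int i = 0; i < n; ++i)
//       y[i] += a * x[i];
//   }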
5406 
5415 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5416  if (gtid == KMP_GTID_DNE)
5417  return NULL;
5418 
5419  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5420  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5421 
5422  if (!taskdata)
5423  return NULL;
5424 
5425  return &taskdata->td_target_data.async_handle;
5426 }
5427 
5436 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5437  if (gtid == KMP_GTID_DNE)
5438  return FALSE;
5439 
5440  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5441  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5442 
5443  if (!taskdata)
5444  return FALSE;
5445 
5446  return taskdata->td_task_team != NULL;
5447 }
5448 
5449 #if OMPX_TASKGRAPH
5450 // __kmp_find_tdg: identify a TDG through its ID
5451 //
5452 // tdg_id: ID of the TDG
5453 // returns: if a TDG corresponding to this ID is found and it is not in
5454 // its initial state, return a pointer to it; otherwise nullptr
5455 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5456  kmp_tdg_info_t *res = nullptr;
5457  if (__kmp_max_tdgs == 0)
5458  return res;
5459 
5460  if (__kmp_global_tdgs == NULL)
5461  __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5462  sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5463 
5464  if ((__kmp_global_tdgs[tdg_id]) &&
5465  (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5466  res = __kmp_global_tdgs[tdg_id];
5467  return res;
5468 }
5469 
5470 // __kmp_print_tdg_dot: prints the TDG to a dot file
5471 // tdg: Pointer to the TDG to print
5472 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg) {
5473  kmp_int32 tdg_id = tdg->tdg_id;
5474  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d\n",
5474               __kmp_get_gtid(), tdg_id));
5475 
5476  char file_name[20];
5477  sprintf(file_name, "tdg_%d.dot", tdg_id);
5478  kmp_safe_raii_file_t tdg_file(file_name, "w");
5479 
5480  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5481  fprintf(tdg_file,
5482  "digraph TDG {\n"
5483  " compound=true\n"
5484  " subgraph cluster {\n"
5485  " label=TDG_%d\n",
5486  tdg_id);
5487  for (kmp_int32 i = 0; i < num_tasks; i++) {
5488  fprintf(tdg_file, " %d[style=bold]\n", i);
5489  }
5490  fprintf(tdg_file, " }\n");
5491  for (kmp_int32 i = 0; i < num_tasks; i++) {
5492  kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5493  kmp_int32 *successors = tdg->record_map[i].successors;
5494  if (nsuccessors > 0) {
5495  for (kmp_int32 j = 0; j < nsuccessors; j++)
5496  fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5497  }
5498  }
5499  fprintf(tdg_file, "}");
5500  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d\n",
5500               __kmp_get_gtid(), tdg_id));
5501 }
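// Example of the generated tdg_<id>.dot contents for a three-task TDG in which
// task 0 precedes tasks 1 and 2 (derived from the fprintf calls above):
//
//   digraph TDG {
//     compound=true
//     subgraph cluster {
//       label=TDG_0
//       0[style=bold]
//       1[style=bold]
//       2[style=bold]
//     }
//     0 -> 1
//     0 -> 2
//   }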
5502 
5503 // __kmp_exec_tdg: launch the execution of a previously
5504 // recorded TDG
5505 // gtid: Global Thread ID
5506 // tdg: Pointer to the TDG to execute
5507 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5508  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5509  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5510  tdg->tdg_id, tdg->num_roots));
5511  kmp_node_info_t *this_record_map = tdg->record_map;
5512  kmp_int32 *this_root_tasks = tdg->root_tasks;
5513  kmp_int32 this_num_roots = tdg->num_roots;
5514  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5515 
5516  kmp_info_t *thread = __kmp_threads[gtid];
5517  kmp_taskdata_t *parent_task = thread->th.th_current_task;
5518 
5519  if (tdg->rec_taskred_data) {
5520  __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5521  }
5522 
5523  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5524  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5525 
5526  td->td_parent = parent_task;
5527  this_record_map[j].parent_task = parent_task;
5528 
5529  kmp_taskgroup_t *parent_taskgroup =
5530  this_record_map[j].parent_task->td_taskgroup;
5531 
5532  KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5533  this_record_map[j].npredecessors);
5534  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5535 
5536  if (parent_taskgroup) {
5537  KMP_ATOMIC_INC(&parent_taskgroup->count);
5538  // The taskgroup is different so we must update it
5539  td->td_taskgroup = parent_taskgroup;
5540  } else if (td->td_taskgroup != nullptr) {
5541  // If the parent doesn't have a taskgroup, remove it from the task
5542  td->td_taskgroup = nullptr;
5543  }
5544  if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5545  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5546  }
5547 
5548  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5549  __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5550  }
5551  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5552  tdg->tdg_id, tdg->num_roots));
5553 }
5554 
5555 // __kmp_start_record: set up a TDG structure and set the
5556 // recording flag to true
5557 // gtid: Global Thread ID of the encountering thread
5558 // flags: Flags associated with the TDG
5559 // tdg_id: ID of the TDG to record
5560 static inline void __kmp_start_record(kmp_int32 gtid,
5561  kmp_taskgraph_flags_t *flags,
5562  kmp_int32 tdg_id) {
5563  kmp_tdg_info_t *tdg =
5564  (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5565  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5566  // Initializing the TDG structure
5567  tdg->tdg_id = tdg_id;
5568  tdg->map_size = INIT_MAPSIZE;
5569  tdg->num_roots = -1;
5570  tdg->root_tasks = nullptr;
5571  tdg->tdg_status = KMP_TDG_RECORDING;
5572  tdg->rec_num_taskred = 0;
5573  tdg->rec_taskred_data = nullptr;
5574  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5575 
5576  // Initializing the list of nodes in this TDG
5577  kmp_node_info_t *this_record_map =
5578  (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5579  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5580  kmp_int32 *successorsList =
5581  (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5582  this_record_map[i].task = nullptr;
5583  this_record_map[i].successors = successorsList;
5584  this_record_map[i].nsuccessors = 0;
5585  this_record_map[i].npredecessors = 0;
5586  this_record_map[i].successors_size = __kmp_successors_size;
5587  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5588  }
5589 
5590  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5591 }
5592 
5593 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5594 // the beginning of the record process of a task region
5595 // loc_ref: Location of TDG, not used yet
5596 // gtid: Global Thread ID of the encountering thread
5597 // input_flags: Flags associated with the TDG
5598 // tdg_id: ID of the TDG to record; for now, an incremental integer
5599 // returns: 1 if we start recording, otherwise 0
5600 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5601  kmp_int32 input_flags, kmp_int32 tdg_id) {
5602 
5603  kmp_int32 res;
5604  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5605  KA_TRACE(10,
5606  ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5607  gtid, loc_ref, input_flags, tdg_id));
5608 
5609  if (__kmp_max_tdgs == 0) {
5610  KA_TRACE(
5611  10,
5612  ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5613  "__kmp_max_tdgs = 0\n",
5614  gtid, loc_ref, input_flags, tdg_id));
5615  return 1;
5616  }
5617 
5618  __kmpc_taskgroup(loc_ref, gtid);
5619  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5620  // TODO: use re_record flag
5621  __kmp_exec_tdg(gtid, tdg);
5622  res = 0;
5623  } else {
5624  __kmp_curr_tdg_idx = tdg_id;
5625  KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5626  __kmp_start_record(gtid, flags, tdg_id);
5627  __kmp_num_tdg++;
5628  res = 1;
5629  }
5630  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5631  gtid, tdg_id, res ? "record" : "execute"));
5632  return res;
5633 }
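// Resulting record/replay flow (illustrative): the first call for a given
// tdg_id finds no ready TDG, starts recording and returns 1, so the caller
// runs the region body and every generated task is added to the record map;
// __kmpc_end_record_task below then freezes the graph as KMP_TDG_READY.
// Later calls with the same tdg_id find the ready TDG, replay it through
// __kmp_exec_tdg and return 0, allowing the compiler-generated caller to skip
// re-running the region body.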
5634 
5635 // __kmp_end_record: set up a TDG after recording it
5636 // gtid: Global thread ID
5637 // tdg: Pointer to the TDG
5638 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5639  // Store roots
5640  kmp_node_info_t *this_record_map = tdg->record_map;
5641  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5642  kmp_int32 *this_root_tasks =
5643  (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5644  kmp_int32 this_map_size = tdg->map_size;
5645  kmp_int32 this_num_roots = 0;
5646  kmp_info_t *thread = __kmp_threads[gtid];
5647 
5648  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5649  if (this_record_map[i].npredecessors == 0) {
5650  this_root_tasks[this_num_roots++] = i;
5651  }
5652  }
5653 
5654  // Update with roots info and mapsize
5655  tdg->map_size = this_map_size;
5656  tdg->num_roots = this_num_roots;
5657  tdg->root_tasks = this_root_tasks;
5658  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5659  tdg->tdg_status = KMP_TDG_READY;
5660 
5661  if (thread->th.th_current_task->td_dephash) {
5662  __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5663  thread->th.th_current_task->td_dephash = NULL;
5664  }
5665 
5666  // Reset predecessor counter
5667  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5668  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5669  this_record_map[i].npredecessors);
5670  }
5671  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5672 
5673  if (__kmp_tdg_dot)
5674  __kmp_print_tdg_dot(tdg);
5675 }
5676 
5677 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5678 // the end of recording phase
5679 //
5680 // loc_ref: Source location information
5681 // gtid: Global thread ID
5682 // input_flags: Flags attached to the graph
5683 // tdg_id: ID of the TDG just finished recording
5684 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5685  kmp_int32 input_flags, kmp_int32 tdg_id) {
5686  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5687 
5688  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5689  " tdg=%d with flags=%d\n",
5690  gtid, loc_ref, tdg_id, input_flags));
5691  if (__kmp_max_tdgs) {
5692  // TODO: use input_flags->nowait
5693  __kmpc_end_taskgroup(loc_ref, gtid);
5694  if (__kmp_tdg_is_recording(tdg->tdg_status))
5695  __kmp_end_record(gtid, tdg);
5696  }
5697  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5698  " tdg=%d, its status is now READY\n",
5699  gtid, loc_ref, tdg_id));
5700 }
5701 #endif