LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 /* forward declaration */
25 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
26  kmp_info_t *this_thr);
27 static void __kmp_alloc_task_deque(kmp_info_t *thread,
28  kmp_thread_data_t *thread_data);
29 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
30  kmp_task_team_t *task_team);
31 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
32 
33 #ifdef BUILD_TIED_TASK_STACK
34 
35 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
36 // from top to bottom
37 //
38 // gtid: global thread identifier for thread containing stack
39 // thread_data: thread data for task team thread containing stack
40 // threshold: value above which the trace statement triggers
41 // location: string identifying call site of this function (for trace)
42 static void __kmp_trace_task_stack(kmp_int32 gtid,
43  kmp_thread_data_t *thread_data,
44  int threshold, char *location) {
45  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
46  kmp_taskdata_t **stack_top = task_stack->ts_top;
47  kmp_int32 entries = task_stack->ts_entries;
48  kmp_taskdata_t *tied_task;
49 
50  KA_TRACE(
51  threshold,
52  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
53  "first_block = %p, stack_top = %p \n",
54  location, gtid, entries, task_stack->ts_first_block, stack_top));
55 
56  KMP_DEBUG_ASSERT(stack_top != NULL);
57  KMP_DEBUG_ASSERT(entries > 0);
58 
59  while (entries != 0) {
60  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
61  // fix up ts_top if we need to pop from previous block
62  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
63  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
64 
65  stack_block = stack_block->sb_prev;
66  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
67  }
68 
69  // finish bookkeeping
70  stack_top--;
71  entries--;
72 
73  tied_task = *stack_top;
74 
75  KMP_DEBUG_ASSERT(tied_task != NULL);
76  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
77 
78  KA_TRACE(threshold,
79  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
80  "stack_top=%p, tied_task=%p\n",
81  location, gtid, entries, stack_top, tied_task));
82  }
83  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
84 
85  KA_TRACE(threshold,
86  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
87  location, gtid));
88 }
89 
90 // __kmp_init_task_stack: initialize the task stack for the first time
91 // after a thread_data structure is created.
92 // It should not be necessary to do this again (assuming the stack works).
93 //
94 // gtid: global thread identifier of calling thread
95 // thread_data: thread data for task team thread containing stack
96 static void __kmp_init_task_stack(kmp_int32 gtid,
97  kmp_thread_data_t *thread_data) {
98  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
99  kmp_stack_block_t *first_block;
100 
101  // set up the first block of the stack
102  first_block = &task_stack->ts_first_block;
103  task_stack->ts_top = (kmp_taskdata_t **)first_block;
104  memset((void *)first_block, '\0',
105  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
106 
107  // initialize the stack to be empty
108  task_stack->ts_entries = TASK_STACK_EMPTY;
109  first_block->sb_next = NULL;
110  first_block->sb_prev = NULL;
111 }
112 
113 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
114 //
115 // gtid: global thread identifier for calling thread
116 // thread_data: thread info for thread containing stack
117 static void __kmp_free_task_stack(kmp_int32 gtid,
118  kmp_thread_data_t *thread_data) {
119  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
120  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
121 
122  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
123  // free from the second block of the stack
124  while (stack_block != NULL) {
125  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
126 
127  stack_block->sb_next = NULL;
128  stack_block->sb_prev = NULL;
129  if (stack_block != &task_stack->ts_first_block) {
130  __kmp_thread_free(__kmp_threads[gtid],
131  stack_block); // free the block, if not the first
132  }
133  stack_block = next_block;
134  }
135  // initialize the stack to be empty
136  task_stack->ts_entries = 0;
137  task_stack->ts_top = NULL;
138 }
139 
140 // __kmp_push_task_stack: Push the tied task onto the task stack.
141 // Grow the stack if necessary by allocating another block.
142 //
143 // gtid: global thread identifier for calling thread
144 // thread: thread info for thread containing stack
145 // tied_task: the task to push on the stack
146 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
147  kmp_taskdata_t *tied_task) {
148  // GEH - need to consider what to do if tt_threads_data not allocated yet
149  kmp_thread_data_t *thread_data =
150  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
151  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
152 
153  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
154  return; // Don't push anything on stack if team or team tasks are serialized
155  }
156 
157  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
158  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
159 
160  KA_TRACE(20,
161  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
162  gtid, thread, tied_task));
163  // Store entry
164  *(task_stack->ts_top) = tied_task;
165 
166  // Do bookkeeping for next push
167  task_stack->ts_top++;
168  task_stack->ts_entries++;
169 
170  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
171  // Find beginning of this task block
172  kmp_stack_block_t *stack_block =
173  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
174 
175  // Check if we already have a block
176  if (stack_block->sb_next !=
177  NULL) { // reset ts_top to beginning of next block
178  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
179  } else { // Alloc new block and link it up
180  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
181  thread, sizeof(kmp_stack_block_t));
182 
183  task_stack->ts_top = &new_block->sb_block[0];
184  stack_block->sb_next = new_block;
185  new_block->sb_prev = stack_block;
186  new_block->sb_next = NULL;
187 
188  KA_TRACE(
189  30,
190  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
191  gtid, tied_task, new_block));
192  }
193  }
194  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
195  tied_task));
196 }
197 
198 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
199 // the task, just check to make sure it matches the ending task passed in.
200 //
201 // gtid: global thread identifier for the calling thread
202 // thread: thread info structure containing stack
203 // tied_task: the task popped off the stack
204 // ending_task: the task that is ending (should match popped task)
205 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
206  kmp_taskdata_t *ending_task) {
207  // GEH - need to consider what to do if tt_threads_data not allocated yet
208  kmp_thread_data_t *thread_data =
209  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
210  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
211  kmp_taskdata_t *tied_task;
212 
213  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
214  // Don't pop anything from stack if team or team tasks are serialized
215  return;
216  }
217 
218  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
219  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
220 
221  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
222  thread));
223 
224  // fix up ts_top if we need to pop from previous block
225  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
226  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
227 
228  stack_block = stack_block->sb_prev;
229  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
230  }
231 
232  // finish bookkeeping
233  task_stack->ts_top--;
234  task_stack->ts_entries--;
235 
236  tied_task = *(task_stack->ts_top);
237 
238  KMP_DEBUG_ASSERT(tied_task != NULL);
239  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
240  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
241 
242  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
243  tied_task));
244  return;
245 }
246 #endif /* BUILD_TIED_TASK_STACK */
247 
248 // returns 1 if new task is allowed to execute, 0 otherwise
249 // checks Task Scheduling constraint (if requested) and
250 // mutexinoutset dependencies if any
251 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
252  const kmp_taskdata_t *tasknew,
253  const kmp_taskdata_t *taskcurr) {
254  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
255  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
256  // only descendant of all deferred tied tasks can be scheduled, checking
257  // the last one is enough, as it in turn is the descendant of all others
258  kmp_taskdata_t *current = taskcurr->td_last_tied;
259  KMP_DEBUG_ASSERT(current != NULL);
260  // check if the task is not suspended on barrier
261  if (current->td_flags.tasktype == TASK_EXPLICIT ||
262  current->td_taskwait_thread > 0) { // <= 0 on barrier
263  kmp_int32 level = current->td_level;
264  kmp_taskdata_t *parent = tasknew->td_parent;
265  while (parent != current && parent->td_level > level) {
266  // check generation up to the level of the current task
267  parent = parent->td_parent;
268  KMP_DEBUG_ASSERT(parent != NULL);
269  }
270  if (parent != current)
271  return false;
272  }
273  }
274  // Check mutexinoutset dependencies, acquire locks
275  kmp_depnode_t *node = tasknew->td_depnode;
276  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
277  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
278  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
279  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
280  continue;
281  // could not get the lock, release previous locks
282  for (int j = i - 1; j >= 0; --j)
283  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
284  return false;
285  }
286  // negative num_locks means all locks acquired successfully
287  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
288  }
289  return true;
290 }
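
// Editor's note: an illustrative walk-through of the checks above (added for
// clarity, not part of the upstream sources). Suppose the current thread's
// last deferred tied task T sits at td_level == 2. A new tied task N passes
// the Task Scheduling Constraint only if T is an ancestor of N: the loop
// climbs N->td_parent, then its parent, and so on while the level stays above
// 2, and succeeds only when the walk lands exactly on T. The mutexinoutset
// part is all-or-nothing: if any lock in dn.mtx_locks[] cannot be taken, every
// lock acquired so far is released and the task is not scheduled at this
// point.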
291 
292 // __kmp_realloc_task_deque:
293 // Re-allocates a task deque for a particular thread, copies the content from
294 // the old deque and adjusts the necessary data structures relating to the
295 // deque. This operation must be done with the deque_lock being held
296 static void __kmp_realloc_task_deque(kmp_info_t *thread,
297  kmp_thread_data_t *thread_data) {
298  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
299  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
300  kmp_int32 new_size = 2 * size;
301 
302  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
303  "%d] for thread_data %p\n",
304  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
305 
306  kmp_taskdata_t **new_deque =
307  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
308 
309  int i, j;
310  for (i = thread_data->td.td_deque_head, j = 0; j < size;
311  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
312  new_deque[j] = thread_data->td.td_deque[i];
313 
314  __kmp_free(thread_data->td.td_deque);
315 
316  thread_data->td.td_deque_head = 0;
317  thread_data->td.td_deque_tail = size;
318  thread_data->td.td_deque = new_deque;
319  thread_data->td.td_deque_size = new_size;
320 }
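
// Editor's note (illustrative, not part of the upstream sources): the copy
// loop above "unrolls" the circular buffer. For example, a full 4-entry deque
// with td_deque_head == 2 holds its tasks in logical order
//   old[2], old[3], old[0], old[1]
// and they are copied to new_deque[0..3] in that order. Afterwards the 8-entry
// deque has head == 0 and tail == 4 (the old size), so pushes and pops resume
// with the same ordering as before the resize.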
321 
322 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
323  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
324  kmp_thread_data_t *thread_data = &l->td;
325  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
326  thread_data->td.td_deque_last_stolen = -1;
327  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
328  "for thread_data %p\n",
329  __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
330  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
331  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
332  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
333  return l;
334 }
335 
336 // The function finds the deque of priority tasks with given priority, or
337 // allocates a new deque and puts it into the sorted (high -> low) list of deques.
338 // Deques of non-default priority tasks are shared between all threads in team,
339 // as opposed to per-thread deques of tasks with default priority.
340 // The function is called under the lock task_team->tt.tt_task_pri_lock.
341 static kmp_thread_data_t *
342 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
343  kmp_thread_data_t *thread_data;
344  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
345  if (lst->priority == pri) {
346  // Found queue of tasks with given priority.
347  thread_data = &lst->td;
348  } else if (lst->priority < pri) {
349  // All current priority queues contain tasks with lower priority.
350  // Allocate new one for given priority tasks.
351  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
352  thread_data = &list->td;
353  list->priority = pri;
354  list->next = lst;
355  task_team->tt.tt_task_pri_list = list;
356  } else { // task_team->tt.tt_task_pri_list->priority > pri
357  kmp_task_pri_t *next_queue = lst->next;
358  while (next_queue && next_queue->priority > pri) {
359  lst = next_queue;
360  next_queue = lst->next;
361  }
362  // lst->priority > pri && (next == NULL || pri >= next->priority)
363  if (next_queue == NULL) {
364  // No queue with pri priority, need to allocate new one.
365  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
366  thread_data = &list->td;
367  list->priority = pri;
368  list->next = NULL;
369  lst->next = list;
370  } else if (next_queue->priority == pri) {
371  // Found queue of tasks with given priority.
372  thread_data = &next_queue->td;
373  } else { // lst->priority > pri > next->priority
374 // insert the newly allocated deque between existing queues
375  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
376  thread_data = &list->td;
377  list->priority = pri;
378  list->next = next_queue;
379  lst->next = list;
380  }
381  }
382  return thread_data;
383 }
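
// Editor's note (illustrative, not part of the upstream sources): with an
// existing list of priority deques ordered 9 -> 5 -> 2, a request for pri 7
// walks past the head (9 > 7), stops at 5, and links a freshly allocated
// deque in between, giving 9 -> 7 -> 5 -> 2. A request for pri 5 returns the
// existing deque, and a request for pri 1 appends a new deque at the tail.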
384 
385 // __kmp_push_priority_task: Add a task to the team's priority task deque
386 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
387  kmp_taskdata_t *taskdata,
388  kmp_task_team_t *task_team,
389  kmp_int32 pri) {
390  kmp_thread_data_t *thread_data = NULL;
391  KA_TRACE(20,
392  ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
393  gtid, taskdata, pri));
394 
395  // Find task queue specific to priority value
396  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
397  if (UNLIKELY(lst == NULL)) {
398  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
399  if (task_team->tt.tt_task_pri_list == NULL) {
400  // List of queues is still empty, allocate one.
401  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
402  thread_data = &list->td;
403  list->priority = pri;
404  list->next = NULL;
405  task_team->tt.tt_task_pri_list = list;
406  } else {
407 // Another thread initialized a queue. Check if it fits and get thread_data.
408  thread_data = __kmp_get_priority_deque_data(task_team, pri);
409  }
410  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
411  } else {
412  if (lst->priority == pri) {
413  // Found queue of tasks with given priority.
414  thread_data = &lst->td;
415  } else {
416  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
417  thread_data = __kmp_get_priority_deque_data(task_team, pri);
418  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
419  }
420  }
421  KMP_DEBUG_ASSERT(thread_data);
422 
423  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
424  // Check if deque is full
425  if (TCR_4(thread_data->td.td_deque_ntasks) >=
426  TASK_DEQUE_SIZE(thread_data->td)) {
427  if (__kmp_enable_task_throttling &&
428  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
429  thread->th.th_current_task)) {
430  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
431  KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
432  "TASK_NOT_PUSHED for task %p\n",
433  gtid, taskdata));
434  return TASK_NOT_PUSHED;
435  } else {
436  // expand deque to push the task which is not allowed to execute
437  __kmp_realloc_task_deque(thread, thread_data);
438  }
439  }
440  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
441  TASK_DEQUE_SIZE(thread_data->td));
442  // Push taskdata.
443  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
444  // Wrap index.
445  thread_data->td.td_deque_tail =
446  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
447  TCW_4(thread_data->td.td_deque_ntasks,
448  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
449  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
450  KMP_FSYNC_RELEASING(taskdata); // releasing child
451  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
452  "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
453  gtid, taskdata, thread_data->td.td_deque_ntasks,
454  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
455  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
456  task_team->tt.tt_num_task_pri++; // atomic inc
457  return TASK_SUCCESSFULLY_PUSHED;
458 }
459 
460 // __kmp_push_task: Add a task to the thread's deque
461 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
462  kmp_info_t *thread = __kmp_threads[gtid];
463  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
464 
465  // If we encounter a hidden helper task, and the current thread is not a
466  // hidden helper thread, we have to give the task to any hidden helper thread
467  // starting from its shadow one.
468  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
469  !KMP_HIDDEN_HELPER_THREAD(gtid))) {
470  kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
471  __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
472  // Signal the hidden helper threads.
473  __kmp_hidden_helper_worker_thread_signal();
474  return TASK_SUCCESSFULLY_PUSHED;
475  }
476 
477  kmp_task_team_t *task_team = thread->th.th_task_team;
478  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
479  kmp_thread_data_t *thread_data;
480 
481  KA_TRACE(20,
482  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
483 
484  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
485  // untied task needs to increment counter so that the task structure is not
486  // freed prematurely
487  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
488  KMP_DEBUG_USE_VAR(counter);
489  KA_TRACE(
490  20,
491  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
492  gtid, counter, taskdata));
493  }
494 
495  // The first check avoids building task_team thread data if serialized
496  if (UNLIKELY(taskdata->td_flags.task_serial)) {
497  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
498  "TASK_NOT_PUSHED for task %p\n",
499  gtid, taskdata));
500  return TASK_NOT_PUSHED;
501  }
502 
503  // Now that serialized tasks have returned, we can assume that we are not in
504  // immediate exec mode
505  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
506  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
507  __kmp_enable_tasking(task_team, thread);
508  }
509  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
510  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
511 
512  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
513  __kmp_max_task_priority > 0) {
514  int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
515  return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
516  }
517 
518  // Find tasking deque specific to encountering thread
519  thread_data = &task_team->tt.tt_threads_data[tid];
520 
521  // No lock needed since only owner can allocate. If the task is hidden_helper,
522 // we don't need it either because we have initialized the deque for hidden
523  // helper thread data.
524  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
525  __kmp_alloc_task_deque(thread, thread_data);
526  }
527 
528  int locked = 0;
529  // Check if deque is full
530  if (TCR_4(thread_data->td.td_deque_ntasks) >=
531  TASK_DEQUE_SIZE(thread_data->td)) {
532  if (__kmp_enable_task_throttling &&
533  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
534  thread->th.th_current_task)) {
535  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
536  "TASK_NOT_PUSHED for task %p\n",
537  gtid, taskdata));
538  return TASK_NOT_PUSHED;
539  } else {
540  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
541  locked = 1;
542  if (TCR_4(thread_data->td.td_deque_ntasks) >=
543  TASK_DEQUE_SIZE(thread_data->td)) {
544  // expand deque to push the task which is not allowed to execute
545  __kmp_realloc_task_deque(thread, thread_data);
546  }
547  }
548  }
549  // Lock the deque for the task push operation
550  if (!locked) {
551  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
552  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
553  if (TCR_4(thread_data->td.td_deque_ntasks) >=
554  TASK_DEQUE_SIZE(thread_data->td)) {
555  if (__kmp_enable_task_throttling &&
556  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
557  thread->th.th_current_task)) {
558  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
559  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
560  "returning TASK_NOT_PUSHED for task %p\n",
561  gtid, taskdata));
562  return TASK_NOT_PUSHED;
563  } else {
564  // expand deque to push the task which is not allowed to execute
565  __kmp_realloc_task_deque(thread, thread_data);
566  }
567  }
568  }
569  // Must have room since no thread but the calling thread can add tasks
570  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
571  TASK_DEQUE_SIZE(thread_data->td));
572 
573  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
574  taskdata; // Push taskdata
575  // Wrap index.
576  thread_data->td.td_deque_tail =
577  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
578  TCW_4(thread_data->td.td_deque_ntasks,
579  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
580  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
581  KMP_FSYNC_RELEASING(taskdata); // releasing child
582  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
583  "task=%p ntasks=%d head=%u tail=%u\n",
584  gtid, taskdata, thread_data->td.td_deque_ntasks,
585  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
586 
587  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
588 
589  return TASK_SUCCESSFULLY_PUSHED;
590 }
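
// Editor's note: a brief usage remark (based on how the return codes are used
// elsewhere in the runtime; added for clarity, not part of the upstream
// sources). When __kmp_push_task returns TASK_NOT_PUSHED -- because the team
// is serialized or because throttling refused to grow a full deque -- the
// caller is expected to execute the task immediately instead of deferring it;
// this is what bounds the memory consumed by per-thread deques.
// TASK_SUCCESSFULLY_PUSHED means the task now sits in a deque and may later be
// run by this thread or stolen by another one.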
591 
592 // __kmp_pop_current_task_from_thread: set up current task from called thread
593 // when team ends
594 //
595 // this_thr: thread structure to set current_task in.
596 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
597  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
598  "this_thread=%p, curtask=%p, "
599  "curtask_parent=%p\n",
600  0, this_thr, this_thr->th.th_current_task,
601  this_thr->th.th_current_task->td_parent));
602 
603  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
604 
605  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
606  "this_thread=%p, curtask=%p, "
607  "curtask_parent=%p\n",
608  0, this_thr, this_thr->th.th_current_task,
609  this_thr->th.th_current_task->td_parent));
610 }
611 
612 // __kmp_push_current_task_to_thread: set up current task in called thread for a
613 // new team
614 //
615 // this_thr: thread structure to set up
616 // team: team for implicit task data
617 // tid: thread within team to set up
618 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
619  int tid) {
620  // the current task of the thread becomes the parent of the newly created
621  // implicit tasks of the new team
622  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
623  "curtask=%p "
624  "parent_task=%p\n",
625  tid, this_thr, this_thr->th.th_current_task,
626  team->t.t_implicit_task_taskdata[tid].td_parent));
627 
628  KMP_DEBUG_ASSERT(this_thr != NULL);
629 
630  if (tid == 0) {
631  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
632  team->t.t_implicit_task_taskdata[0].td_parent =
633  this_thr->th.th_current_task;
634  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
635  }
636  } else {
637  team->t.t_implicit_task_taskdata[tid].td_parent =
638  team->t.t_implicit_task_taskdata[0].td_parent;
639  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
640  }
641 
642  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
643  "curtask=%p "
644  "parent_task=%p\n",
645  tid, this_thr, this_thr->th.th_current_task,
646  team->t.t_implicit_task_taskdata[tid].td_parent));
647 }
648 
649 // __kmp_task_start: bookkeeping for a task starting execution
650 //
651 // GTID: global thread id of calling thread
652 // task: task starting execution
653 // current_task: task suspending
654 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
655  kmp_taskdata_t *current_task) {
656  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
657  kmp_info_t *thread = __kmp_threads[gtid];
658 
659  KA_TRACE(10,
660  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
661  gtid, taskdata, current_task));
662 
663  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
664 
665  // mark currently executing task as suspended
666  // TODO: GEH - make sure root team implicit task is initialized properly.
667  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
668  current_task->td_flags.executing = 0;
669 
670 // Add task to stack if tied
671 #ifdef BUILD_TIED_TASK_STACK
672  if (taskdata->td_flags.tiedness == TASK_TIED) {
673  __kmp_push_task_stack(gtid, thread, taskdata);
674  }
675 #endif /* BUILD_TIED_TASK_STACK */
676 
677  // mark starting task as executing and as current task
678  thread->th.th_current_task = taskdata;
679 
680  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
681  taskdata->td_flags.tiedness == TASK_UNTIED);
682  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
683  taskdata->td_flags.tiedness == TASK_UNTIED);
684  taskdata->td_flags.started = 1;
685  taskdata->td_flags.executing = 1;
686  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
687  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
688 
689  // GEH TODO: shouldn't we pass some sort of location identifier here?
690  // APT: yes, we will pass location here.
691  // need to store current thread state (in a thread or taskdata structure)
692  // before setting work_state, otherwise wrong state is set after end of task
693 
694  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
695 
696  return;
697 }
698 
699 #if OMPT_SUPPORT
700 //------------------------------------------------------------------------------
701 // __ompt_task_init:
702 // Initialize OMPT fields maintained by a task. This will only be called after
703 // ompt_start_tool, so we already know whether ompt is enabled or not.
704 
705 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
706  // The calls to __ompt_task_init already have the ompt_enabled condition.
707  task->ompt_task_info.task_data.value = 0;
708  task->ompt_task_info.frame.exit_frame = ompt_data_none;
709  task->ompt_task_info.frame.enter_frame = ompt_data_none;
710  task->ompt_task_info.frame.exit_frame_flags =
711  ompt_frame_runtime | ompt_frame_framepointer;
712  task->ompt_task_info.frame.enter_frame_flags =
713  ompt_frame_runtime | ompt_frame_framepointer;
714  task->ompt_task_info.dispatch_chunk.start = 0;
715  task->ompt_task_info.dispatch_chunk.iterations = 0;
716 }
717 
718 // __ompt_task_start:
719 // Build and trigger task-begin event
720 static inline void __ompt_task_start(kmp_task_t *task,
721  kmp_taskdata_t *current_task,
722  kmp_int32 gtid) {
723  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
724  ompt_task_status_t status = ompt_task_switch;
725  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
726  status = ompt_task_yield;
727  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
728  }
729  /* let OMPT know that we're about to run this task */
730  if (ompt_enabled.ompt_callback_task_schedule) {
731  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
732  &(current_task->ompt_task_info.task_data), status,
733  &(taskdata->ompt_task_info.task_data));
734  }
735  taskdata->ompt_task_info.scheduling_parent = current_task;
736 }
737 
738 // __ompt_task_finish:
739 // Build and trigger final task-schedule event
740 static inline void __ompt_task_finish(kmp_task_t *task,
741  kmp_taskdata_t *resumed_task,
742  ompt_task_status_t status) {
743  if (ompt_enabled.ompt_callback_task_schedule) {
744  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
745  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
746  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
747  status = ompt_task_cancel;
748  }
749 
750  /* let OMPT know that we're returning to the callee task */
751  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
752  &(taskdata->ompt_task_info.task_data), status,
753  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
754  }
755 }
756 #endif
757 
758 template <bool ompt>
759 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
760  kmp_task_t *task,
761  void *frame_address,
762  void *return_address) {
763  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
764  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
765 
766  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
767  "current_task=%p\n",
768  gtid, loc_ref, taskdata, current_task));
769 
770  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
771  // untied task needs to increment counter so that the task structure is not
772  // freed prematurely
773  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
774  KMP_DEBUG_USE_VAR(counter);
775  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
776  "incremented for task %p\n",
777  gtid, counter, taskdata));
778  }
779 
780  taskdata->td_flags.task_serial =
781  1; // Execute this task immediately, not deferred.
782  __kmp_task_start(gtid, task, current_task);
783 
784 #if OMPT_SUPPORT
785  if (ompt) {
786  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
787  current_task->ompt_task_info.frame.enter_frame.ptr =
788  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
789  current_task->ompt_task_info.frame.enter_frame_flags =
790  taskdata->ompt_task_info.frame.exit_frame_flags =
791  ompt_frame_application | ompt_frame_framepointer;
792  }
793  if (ompt_enabled.ompt_callback_task_create) {
794  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
795  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
796  &(parent_info->task_data), &(parent_info->frame),
797  &(taskdata->ompt_task_info.task_data),
798  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
799  return_address);
800  }
801  __ompt_task_start(task, current_task, gtid);
802  }
803 #endif // OMPT_SUPPORT
804 
805  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
806  loc_ref, taskdata));
807 }
808 
809 #if OMPT_SUPPORT
810 OMPT_NOINLINE
811 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
812  kmp_task_t *task,
813  void *frame_address,
814  void *return_address) {
815  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
816  return_address);
817 }
818 #endif // OMPT_SUPPORT
819 
820 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
821 // execution
822 //
823 // loc_ref: source location information; points to beginning of task block.
824 // gtid: global thread number.
825 // task: task thunk for the started task.
826 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
827  kmp_task_t *task) {
828 #if OMPT_SUPPORT
829  if (UNLIKELY(ompt_enabled.enabled)) {
830  OMPT_STORE_RETURN_ADDRESS(gtid);
831  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
832  OMPT_GET_FRAME_ADDRESS(1),
833  OMPT_LOAD_RETURN_ADDRESS(gtid));
834  return;
835  }
836 #endif
837  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
838 }
839 
840 #ifdef TASK_UNUSED
841 // __kmpc_omp_task_begin: report that a given task has started execution
842 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
843 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
844  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
845 
846  KA_TRACE(
847  10,
848  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
849  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
850 
851  __kmp_task_start(gtid, task, current_task);
852 
853  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
854  loc_ref, KMP_TASK_TO_TASKDATA(task)));
855  return;
856 }
857 #endif // TASK_UNUSED
858 
859 // __kmp_free_task: free the current task space and the space for shareds
860 //
861 // gtid: Global thread ID of calling thread
862 // taskdata: task to free
863 // thread: thread data structure of caller
864 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
865  kmp_info_t *thread) {
866  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
867  taskdata));
868 
869  // Check to make sure all flags and counters have the correct values
870  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
871  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
872  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
873  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
874  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
875  taskdata->td_flags.task_serial == 1);
876  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
877  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
878  // Clear data to not be re-used later by mistake.
879  task->data1.destructors = NULL;
880  task->data2.priority = 0;
881 
882  taskdata->td_flags.freed = 1;
883 // deallocate the taskdata and shared variable blocks associated with this task
884 #if USE_FAST_MEMORY
885  __kmp_fast_free(thread, taskdata);
886 #else /* ! USE_FAST_MEMORY */
887  __kmp_thread_free(thread, taskdata);
888 #endif
889  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
890 }
891 
892 // __kmp_free_task_and_ancestors: free the current task and ancestors without
893 // children
894 //
895 // gtid: Global thread ID of calling thread
896 // taskdata: task to free
897 // thread: thread data structure of caller
898 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
899  kmp_taskdata_t *taskdata,
900  kmp_info_t *thread) {
901  // Proxy tasks must always be allowed to free their parents
902  // because they can be run in background even in serial mode.
903  kmp_int32 team_serial =
904  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
905  !taskdata->td_flags.proxy;
906  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
907 
908  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
909  KMP_DEBUG_ASSERT(children >= 0);
910 
911  // Now, go up the ancestor tree to see if any ancestors can now be freed.
912  while (children == 0) {
913  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
914 
915  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
916  "and freeing itself\n",
917  gtid, taskdata));
918 
919  // --- Deallocate my ancestor task ---
920  __kmp_free_task(gtid, taskdata, thread);
921 
922  taskdata = parent_taskdata;
923 
924  if (team_serial)
925  return;
926  // Stop checking ancestors at implicit task instead of walking up ancestor
927  // tree to avoid premature deallocation of ancestors.
928  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
929  if (taskdata->td_dephash) { // do we need to cleanup dephash?
930  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
931  kmp_tasking_flags_t flags_old = taskdata->td_flags;
932  if (children == 0 && flags_old.complete == 1) {
933  kmp_tasking_flags_t flags_new = flags_old;
934  flags_new.complete = 0;
935  if (KMP_COMPARE_AND_STORE_ACQ32(
936  RCAST(kmp_int32 *, &taskdata->td_flags),
937  *RCAST(kmp_int32 *, &flags_old),
938  *RCAST(kmp_int32 *, &flags_new))) {
939  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
940  "dephash of implicit task %p\n",
941  gtid, taskdata));
942  // cleanup dephash of finished implicit task
943  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
944  }
945  }
946  }
947  return;
948  }
949  // Predecrement simulated by "- 1" calculation
950  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
951  KMP_DEBUG_ASSERT(children >= 0);
952  }
953 
954  KA_TRACE(
955  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
956  "not freeing it yet\n",
957  gtid, taskdata, children));
958 }
959 
960 // Only need to keep track of child task counts if any of the following:
961 // 1. team parallel and tasking not serialized;
962 // 2. it is a proxy or detachable or hidden helper task
963 // 3. the children counter of its parent task is greater than 0.
964 // The reason for the 3rd condition: consider a serialized team that encountered
965 // a detached or hidden helper task T. The execution of T is still deferred, and
966 // it is also possible that a regular task depends on T. In that case, if we
967 // don't track the children, task synchronization will be broken.
968 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
969  kmp_tasking_flags_t flags = taskdata->td_flags;
970  bool ret = !(flags.team_serial || flags.tasking_ser);
971  ret = ret || flags.proxy == TASK_PROXY ||
972  flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
973  ret = ret ||
974  KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
975  return ret;
976 }
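
// Editor's note (illustrative, not part of the upstream sources): condition 3
// covers cases such as a serialized team in which a detachable or hidden
// helper task T has already been deferred (so the parent's
// td_incomplete_child_tasks is nonzero) and a later regular task depends on T.
// That later task must also be counted as an incomplete child; otherwise a
// taskwait in the region could return before the dependence chain rooted at T
// has finished.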
977 
978 // __kmp_task_finish: bookkeeping to do when a task finishes execution
979 //
980 // gtid: global thread ID for calling thread
981 // task: task to be finished
982 // resumed_task: task to be resumed. (may be NULL if task is serialized)
983 //
984 // template<ompt>: effectively ompt_enabled.enabled!=0
985 // the version with ompt=false is inlined, allowing the compiler to optimize
986 // away all OMPT code in this case
987 template <bool ompt>
988 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
989  kmp_taskdata_t *resumed_task) {
990  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
991  kmp_info_t *thread = __kmp_threads[gtid];
992  kmp_task_team_t *task_team =
993  thread->th.th_task_team; // might be NULL for serial teams...
994 #if KMP_DEBUG
995  kmp_int32 children = 0;
996 #endif
997  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
998  "task %p\n",
999  gtid, taskdata, resumed_task));
1000 
1001  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1002 
1003 // Pop task from stack if tied
1004 #ifdef BUILD_TIED_TASK_STACK
1005  if (taskdata->td_flags.tiedness == TASK_TIED) {
1006  __kmp_pop_task_stack(gtid, thread, taskdata);
1007  }
1008 #endif /* BUILD_TIED_TASK_STACK */
1009 
1010  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1011  // untied task needs to check the counter so that the task structure is not
1012  // freed prematurely
1013  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1014  KA_TRACE(
1015  20,
1016  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1017  gtid, counter, taskdata));
1018  if (counter > 0) {
1019  // untied task is not done, to be continued possibly by other thread, do
1020  // not free it now
1021  if (resumed_task == NULL) {
1022  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1023  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1024  // task is the parent
1025  }
1026  thread->th.th_current_task = resumed_task; // restore current_task
1027  resumed_task->td_flags.executing = 1; // resume previous task
1028  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1029  "resuming task %p\n",
1030  gtid, taskdata, resumed_task));
1031  return;
1032  }
1033  }
1034 
1035  // bookkeeping for resuming task:
1036  // GEH - note tasking_ser => task_serial
1037  KMP_DEBUG_ASSERT(
1038  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1039  taskdata->td_flags.task_serial);
1040  if (taskdata->td_flags.task_serial) {
1041  if (resumed_task == NULL) {
1042  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1043  // task is the parent
1044  }
1045  } else {
1046  KMP_DEBUG_ASSERT(resumed_task !=
1047  NULL); // verify that resumed task is passed as argument
1048  }
1049 
1050  /* If the tasks' destructor thunk flag has been set, we need to invoke the
1051  destructor thunk that has been generated by the compiler. The code is
1052  placed here, since at this point other tasks might have been released
1053  hence overlapping the destructor invocations with some other work in the
1054  released tasks. The OpenMP spec is not specific on when the destructors
1055  are invoked, so we should be free to choose. */
1056  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1057  kmp_routine_entry_t destr_thunk = task->data1.destructors;
1058  KMP_ASSERT(destr_thunk);
1059  destr_thunk(gtid, task);
1060  }
1061 
1062  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1063  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1064  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1065 
1066  bool detach = false;
1067  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1068  if (taskdata->td_allow_completion_event.type ==
1069  KMP_EVENT_ALLOW_COMPLETION) {
1070  // event hasn't been fulfilled yet. Try to detach task.
1071  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1072  if (taskdata->td_allow_completion_event.type ==
1073  KMP_EVENT_ALLOW_COMPLETION) {
1074  // task finished execution
1075  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1076  taskdata->td_flags.executing = 0; // suspend the finishing task
1077 
1078 #if OMPT_SUPPORT
1079  // For a detached task that has not completed, we switch back to the
1080  // resumed task here; the later omp_fulfill_event call signals completion.
1081  // Locking is necessary to avoid a race with ompt_task_late_fulfill.
1082  if (ompt)
1083  __ompt_task_finish(task, resumed_task, ompt_task_detach);
1084 #endif
1085 
1086  // no access to taskdata after this point!
1087  // __kmp_fulfill_event might free taskdata at any time from now
1088 
1089  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1090  detach = true;
1091  }
1092  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1093  }
1094  }
1095 
1096  if (!detach) {
1097  taskdata->td_flags.complete = 1; // mark the task as completed
1098 
1099 #if OMPT_SUPPORT
1100  // This is not a detached task, we are done here
1101  if (ompt)
1102  __ompt_task_finish(task, resumed_task, ompt_task_complete);
1103 #endif
1104  // TODO: What would be the balance between the conditions in the function
1105  // and an atomic operation?
1106  if (__kmp_track_children_task(taskdata)) {
1107  __kmp_release_deps(gtid, taskdata);
1108  // Predecrement simulated by "- 1" calculation
1109 #if KMP_DEBUG
1110  children = -1 +
1111 #endif
1112  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1113  KMP_DEBUG_ASSERT(children >= 0);
1114  if (taskdata->td_taskgroup)
1115  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1116  } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1117  task_team->tt.tt_hidden_helper_task_encountered)) {
1118  // if we found proxy or hidden helper tasks there could exist a dependency
1119  // chain with the proxy task as origin
1120  __kmp_release_deps(gtid, taskdata);
1121  }
1122  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1123  // called. Otherwise, if a task is executed immediately from the
1124  // release_deps code, the flag will be reset to 1 again by this same
1125  // function
1126  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1127  taskdata->td_flags.executing = 0; // suspend the finishing task
1128  }
1129 
1130  KA_TRACE(
1131  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1132  gtid, taskdata, children));
1133 
1134  // Free this task and then ancestor tasks if they have no children.
1135  // Restore th_current_task first as suggested by John:
1136  // johnmc: if an asynchronous inquiry peers into the runtime system
1137  // it doesn't see the freed task as the current task.
1138  thread->th.th_current_task = resumed_task;
1139  if (!detach)
1140  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1141 
1142  // TODO: GEH - make sure root team implicit task is initialized properly.
1143  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1144  resumed_task->td_flags.executing = 1; // resume previous task
1145 
1146  KA_TRACE(
1147  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1148  gtid, taskdata, resumed_task));
1149 
1150  return;
1151 }
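
// Editor's note (summary of the detach path above, added for clarity; not part
// of the upstream sources): when a detachable task finishes executing before
// its allow-completion event has been fulfilled, it is converted into a proxy
// task here and is neither marked complete, nor released to its dependents,
// nor freed. That remaining completion work happens later, when the
// application fulfills the event (omp_fulfill_event), which routes the task
// through the proxy-completion path forward-declared at the top of this file.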
1152 
1153 template <bool ompt>
1154 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1155  kmp_int32 gtid,
1156  kmp_task_t *task) {
1157  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1158  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1159  KMP_DEBUG_ASSERT(gtid >= 0);
1160  // this routine will provide task to resume
1161  __kmp_task_finish<ompt>(gtid, task, NULL);
1162 
1163  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1164  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1165 
1166 #if OMPT_SUPPORT
1167  if (ompt) {
1168  ompt_frame_t *ompt_frame;
1169  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1170  ompt_frame->enter_frame = ompt_data_none;
1171  ompt_frame->enter_frame_flags =
1172  ompt_frame_runtime | ompt_frame_framepointer;
1173  }
1174 #endif
1175 
1176  return;
1177 }
1178 
1179 #if OMPT_SUPPORT
1180 OMPT_NOINLINE
1181 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1182  kmp_task_t *task) {
1183  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1184 }
1185 #endif // OMPT_SUPPORT
1186 
1187 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1188 //
1189 // loc_ref: source location information; points to end of task block.
1190 // gtid: global thread number.
1191 // task: task thunk for the completed task.
1192 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1193  kmp_task_t *task) {
1194 #if OMPT_SUPPORT
1195  if (UNLIKELY(ompt_enabled.enabled)) {
1196  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1197  return;
1198  }
1199 #endif
1200  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1201 }
1202 
1203 #ifdef TASK_UNUSED
1204 // __kmpc_omp_task_complete: report that a task has completed execution
1205 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1206 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1207  kmp_task_t *task) {
1208  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1209  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1210 
1211  __kmp_task_finish<false>(gtid, task,
1212  NULL); // Not sure how to find task to resume
1213 
1214  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1215  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1216  return;
1217 }
1218 #endif // TASK_UNUSED
1219 
1220 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1221 // task for a given thread
1222 //
1223 // loc_ref: reference to source location of parallel region
1224 // this_thr: thread data structure corresponding to implicit task
1225 // team: team for this_thr
1226 // tid: thread id of given thread within team
1227 // set_curr_task: TRUE if need to push current task to thread
1228 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to
1229 // have already been done elsewhere.
1230 // TODO: Get better loc_ref. Value passed in may be NULL
1231 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1232  kmp_team_t *team, int tid, int set_curr_task) {
1233  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1234 
1235  KF_TRACE(
1236  10,
1237  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1238  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1239 
1240  task->td_task_id = KMP_GEN_TASK_ID();
1241  task->td_team = team;
1242  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1243  // in debugger)
1244  task->td_ident = loc_ref;
1245  task->td_taskwait_ident = NULL;
1246  task->td_taskwait_counter = 0;
1247  task->td_taskwait_thread = 0;
1248 
1249  task->td_flags.tiedness = TASK_TIED;
1250  task->td_flags.tasktype = TASK_IMPLICIT;
1251  task->td_flags.proxy = TASK_FULL;
1252 
1253  // All implicit tasks are executed immediately, not deferred
1254  task->td_flags.task_serial = 1;
1255  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1256  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1257 
1258  task->td_flags.started = 1;
1259  task->td_flags.executing = 1;
1260  task->td_flags.complete = 0;
1261  task->td_flags.freed = 0;
1262 
1263  task->td_depnode = NULL;
1264  task->td_last_tied = task;
1265  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1266 
1267  if (set_curr_task) { // only do this init first time thread is created
1268  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1269  // Not used: don't need to deallocate implicit task
1270  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1271  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1272  task->td_dephash = NULL;
1273  __kmp_push_current_task_to_thread(this_thr, team, tid);
1274  } else {
1275  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1276  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1277  }
1278 
1279 #if OMPT_SUPPORT
1280  if (UNLIKELY(ompt_enabled.enabled))
1281  __ompt_task_init(task, tid);
1282 #endif
1283 
1284  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1285  team, task));
1286 }
1287 
1288 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1289 // at the end of parallel regions. Some resources are kept for reuse in the next
1290 // parallel region.
1291 //
1292 // thread: thread data structure corresponding to implicit task
1293 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1294  kmp_taskdata_t *task = thread->th.th_current_task;
1295  if (task->td_dephash) {
1296  int children;
1297  task->td_flags.complete = 1;
1298  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1299  kmp_tasking_flags_t flags_old = task->td_flags;
1300  if (children == 0 && flags_old.complete == 1) {
1301  kmp_tasking_flags_t flags_new = flags_old;
1302  flags_new.complete = 0;
1303  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1304  *RCAST(kmp_int32 *, &flags_old),
1305  *RCAST(kmp_int32 *, &flags_new))) {
1306  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1307  "dephash of implicit task %p\n",
1308  thread->th.th_info.ds.ds_gtid, task));
1309  __kmp_dephash_free_entries(thread, task->td_dephash);
1310  }
1311  }
1312  }
1313 }
1314 
1315 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1316 // when these tasks are destroyed
1317 //
1318 // thread: thread data structure corresponding to implicit task
1319 void __kmp_free_implicit_task(kmp_info_t *thread) {
1320  kmp_taskdata_t *task = thread->th.th_current_task;
1321  if (task && task->td_dephash) {
1322  __kmp_dephash_free(thread, task->td_dephash);
1323  task->td_dephash = NULL;
1324  }
1325 }
1326 
1327 // Round up a size to a multiple of the power-of-two val: used to insert padding
1328 // between structures co-allocated using a single malloc() call
1329 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1330  if (size & (val - 1)) {
1331  size &= ~(val - 1);
1332  if (size <= KMP_SIZE_T_MAX - val) {
1333  size += val; // Round up if there is no overflow.
1334  }
1335  }
1336  return size;
1337 } // __kmp_round_up_to_val
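
// Editor's note: a minimal worked example of the rounding above (illustrative
// only and not compiled; the helper below is hypothetical and assumes val is a
// power of two, as at all call sites in this file).
#if 0
static size_t round_up_example(size_t size, size_t val) {
  // Closed-form equivalent of __kmp_round_up_to_val, ignoring its overflow
  // guard: add val-1 and mask off the low bits.
  return (size + val - 1) & ~(val - 1);
}
// round_up_example(70, 8) == 72  (above: 70 & 7 == 6, so 70 -> 64 -> 72)
// round_up_example(64, 8) == 64  (already a multiple of 8, left unchanged)
#endif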
1338 
1339 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1340 //
1341 // loc_ref: source location information
1342 // gtid: global thread number.
1343 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1344 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1345 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1346 // private vars accessed in task.
1347 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1348 // in task.
1349 // task_entry: Pointer to task code entry point generated by compiler.
1350 // returns: a pointer to the allocated kmp_task_t structure (task).
1351 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1352  kmp_tasking_flags_t *flags,
1353  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1354  kmp_routine_entry_t task_entry) {
1355  kmp_task_t *task;
1356  kmp_taskdata_t *taskdata;
1357  kmp_info_t *thread = __kmp_threads[gtid];
1358  kmp_team_t *team = thread->th.th_team;
1359  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1360  size_t shareds_offset;
1361 
1362  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1363  __kmp_middle_initialize();
1364 
1365  if (flags->hidden_helper) {
1366  if (__kmp_enable_hidden_helper) {
1367  if (!TCR_4(__kmp_init_hidden_helper))
1368  __kmp_hidden_helper_initialize();
1369  } else {
1370  // If the hidden helper task is not enabled, reset the flag to FALSE.
1371  flags->hidden_helper = FALSE;
1372  }
1373  }
1374 
1375  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1376  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1377  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1378  sizeof_shareds, task_entry));
1379 
1380  KMP_DEBUG_ASSERT(parent_task);
1381  if (parent_task->td_flags.final) {
1382  if (flags->merged_if0) {
1383  }
1384  flags->final = 1;
1385  }
1386 
1387  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1388  // Untied task encountered causes the TSC algorithm to check entire deque of
1389  // the victim thread. If no untied task encountered, then checking the head
1390  // of the deque should be enough.
1391  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1392  }
1393 
1394  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1395  // the tasking setup
1396  // when that happens is too late.
1397  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1398  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1399  if (flags->proxy == TASK_PROXY) {
1400  flags->tiedness = TASK_UNTIED;
1401  flags->merged_if0 = 1;
1402  }
1403  /* Are we running in a serialized parallel region or in tskm_immediate_exec
1404  mode? Either way we need tasking support enabled. */
1405  if ((thread->th.th_task_team) == NULL) {
1406  /* This should only happen if the team is serialized
1407  setup a task team and propagate it to the thread */
1408  KMP_DEBUG_ASSERT(team->t.t_serialized);
1409  KA_TRACE(30,
1410  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1411  gtid));
1412  // 1 indicates setup the current team regardless of nthreads
1413  __kmp_task_team_setup(thread, team, 1);
1414  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1415  }
1416  kmp_task_team_t *task_team = thread->th.th_task_team;
1417 
1418  /* tasking must be enabled now as the task might not be pushed */
1419  if (!KMP_TASKING_ENABLED(task_team)) {
1420  KA_TRACE(
1421  30,
1422  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1423  __kmp_enable_tasking(task_team, thread);
1424  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1425  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1426  // No lock needed since only owner can allocate
1427  if (thread_data->td.td_deque == NULL) {
1428  __kmp_alloc_task_deque(thread, thread_data);
1429  }
1430  }
1431 
1432  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1433  task_team->tt.tt_found_proxy_tasks == FALSE)
1434  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1435  if (flags->hidden_helper &&
1436  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1437  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1438  }
1439 
1440  // Calculate shared structure offset including padding after kmp_task_t struct
1441  // to align pointers in shared struct
1442  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1443  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1444 
1445  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1446  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1447  shareds_offset));
1448  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1449  sizeof_shareds));
1450 
1451  // Avoid double allocation here by combining shareds with taskdata
1452 #if USE_FAST_MEMORY
1453  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1454  sizeof_shareds);
1455 #else /* ! USE_FAST_MEMORY */
1456  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1457  sizeof_shareds);
1458 #endif /* USE_FAST_MEMORY */
1459 
1460  task = KMP_TASKDATA_TO_TASK(taskdata);
1461 
1462 // Make sure task & taskdata are aligned appropriately
1463 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1464  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1465  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1466 #else
1467  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1468  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1469 #endif
1470  if (sizeof_shareds > 0) {
1471  // Avoid double allocation here by combining shareds with taskdata
1472  task->shareds = &((char *)taskdata)[shareds_offset];
1473  // Make sure shareds struct is aligned to pointer size
1474  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1475  0);
1476  } else {
1477  task->shareds = NULL;
1478  }
1479  task->routine = task_entry;
1480  task->part_id = 0; // AC: Always start with 0 part id
1481 
1482  taskdata->td_task_id = KMP_GEN_TASK_ID();
1483  taskdata->td_team = thread->th.th_team;
1484  taskdata->td_alloc_thread = thread;
1485  taskdata->td_parent = parent_task;
1486  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1487  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1488  taskdata->td_ident = loc_ref;
1489  taskdata->td_taskwait_ident = NULL;
1490  taskdata->td_taskwait_counter = 0;
1491  taskdata->td_taskwait_thread = 0;
1492  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1493  // avoid copying icvs for proxy tasks
1494  if (flags->proxy == TASK_FULL)
1495  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1496 
1497  taskdata->td_flags = *flags;
1498  taskdata->td_task_team = thread->th.th_task_team;
1499  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1500  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1501  // If it is a hidden helper task, we need to set the team and task team
1502  // accordingly.
1503  if (flags->hidden_helper) {
1504  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1505  taskdata->td_team = shadow_thread->th.th_team;
1506  taskdata->td_task_team = shadow_thread->th.th_task_team;
1507  }
1508 
1509  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1510  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1511 
1512  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1513  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1514 
1515  // GEH - Note we serialize the task if the team is serialized to make sure
1516  // implicit parallel region tasks are not left until program termination to
1517  // execute. Also, it helps locality to execute immediately.
1518 
1519  taskdata->td_flags.task_serial =
1520  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1521  taskdata->td_flags.tasking_ser || flags->merged_if0);
1522 
1523  taskdata->td_flags.started = 0;
1524  taskdata->td_flags.executing = 0;
1525  taskdata->td_flags.complete = 0;
1526  taskdata->td_flags.freed = 0;
1527 
1528  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1529  // start at one because counts current task and children
1530  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1531  taskdata->td_taskgroup =
1532  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1533  taskdata->td_dephash = NULL;
1534  taskdata->td_depnode = NULL;
1535  if (flags->tiedness == TASK_UNTIED)
1536  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1537  else
1538  taskdata->td_last_tied = taskdata;
1539  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1540 #if OMPT_SUPPORT
1541  if (UNLIKELY(ompt_enabled.enabled))
1542  __ompt_task_init(taskdata, gtid);
1543 #endif
1544  // TODO: What would be the balance between the conditions in the function and
1545  // an atomic operation?
1546  if (__kmp_track_children_task(taskdata)) {
1547  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1548  if (parent_task->td_taskgroup)
1549  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1550  // Only need to keep track of allocated child tasks for explicit tasks since
1551  // implicit not deallocated
1552  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1553  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1554  }
1555  if (flags->hidden_helper) {
1556  taskdata->td_flags.task_serial = FALSE;
1557  // Increment the number of hidden helper tasks to be executed
1558  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1559  }
1560  }
1561 
1562  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1563  gtid, taskdata, taskdata->td_parent));
1564 
1565  return task;
1566 }
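// Illustrative sketch (not part of the runtime) of the single allocation
// produced above: the kmp_taskdata_t header, the kmp_task_t plus its private
// variables, and the shareds pointer array are laid out contiguously, so
//
//   task          == KMP_TASKDATA_TO_TASK(taskdata)      // right after the header
//   task->shareds == (char *)taskdata + shareds_offset   // if sizeof_shareds > 0
//   taskdata->td_size_alloc == shareds_offset + sizeof_shareds
//
// which is why a single __kmp_fast_allocate/__kmp_thread_malloc call suffices.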
1567 
1568 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1569  kmp_int32 flags, size_t sizeof_kmp_task_t,
1570  size_t sizeof_shareds,
1571  kmp_routine_entry_t task_entry) {
1572  kmp_task_t *retval;
1573  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1574  __kmp_assert_valid_gtid(gtid);
1575  input_flags->native = FALSE;
1576  // __kmp_task_alloc() sets up all other runtime flags
1577  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1578  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1579  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1580  input_flags->proxy ? "proxy" : "",
1581  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1582  sizeof_shareds, task_entry));
1583 
1584  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1585  sizeof_shareds, task_entry);
1586 
1587  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1588 
1589  return retval;
1590 }
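// Illustrative sketch (not part of the runtime) of how a compiler might lower
// "#pragma omp task" onto this entry point. The outlined routine
// task_entry_fn, the ident_t loc, and the my_privates_t / my_shareds_t types
// are hypothetical placeholders for compiler-generated code:
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   kmp_task_t *t = __kmpc_omp_task_alloc(
//       &loc, gtid, /*flags=*/1 /* tied, explicit */,
//       sizeof(kmp_task_t) + sizeof(my_privates_t) /* task + privates */,
//       sizeof(my_shareds_t), (kmp_routine_entry_t)task_entry_fn);
//   ((my_shareds_t *)t->shareds)->px = &x; // publish addresses of shared vars
//   __kmpc_omp_task(&loc, gtid, t);        // defer the task (or run it now)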
1591 
1592 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1593  kmp_int32 flags,
1594  size_t sizeof_kmp_task_t,
1595  size_t sizeof_shareds,
1596  kmp_routine_entry_t task_entry,
1597  kmp_int64 device_id) {
1598  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1599  // The target task is untied, as defined by the specification.
1600  input_flags.tiedness = TASK_UNTIED;
1601 
1602  if (__kmp_enable_hidden_helper)
1603  input_flags.hidden_helper = TRUE;
1604 
1605  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1606  sizeof_shareds, task_entry);
1607 }
1608 
1622 kmp_int32
1623 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1624  kmp_task_t *new_task, kmp_int32 naffins,
1625  kmp_task_affinity_info_t *affin_list) {
1626  return 0; // the affinity list is currently unused by this runtime
1627 }
1628 
1629 // __kmp_invoke_task: invoke the specified task
1630 //
1631 // gtid: global thread ID of caller
1632 // task: the task to invoke
1633 // current_task: the task to resume after task invocation
1634 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1635  kmp_taskdata_t *current_task) {
1636  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1637  kmp_info_t *thread;
1638  int discard = 0 /* false */;
1639  KA_TRACE(
1640  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1641  gtid, taskdata, current_task));
1642  KMP_DEBUG_ASSERT(task);
1643  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1644  taskdata->td_flags.complete == 1)) {
1645  // This is a proxy task that was already completed but it needs to run
1646  // its bottom-half finish
1647  KA_TRACE(
1648  30,
1649  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1650  gtid, taskdata));
1651 
1652  __kmp_bottom_half_finish_proxy(gtid, task);
1653 
1654  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1655  "proxy task %p, resuming task %p\n",
1656  gtid, taskdata, current_task));
1657 
1658  return;
1659  }
1660 
1661 #if OMPT_SUPPORT
1662  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1663  // does not execute code.
1664  ompt_thread_info_t oldInfo;
1665  if (UNLIKELY(ompt_enabled.enabled)) {
1666  // Store the thread's state and restore it after the task
1667  thread = __kmp_threads[gtid];
1668  oldInfo = thread->th.ompt_thread_info;
1669  thread->th.ompt_thread_info.wait_id = 0;
1670  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1671  ? ompt_state_work_serial
1672  : ompt_state_work_parallel;
1673  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1674  }
1675 #endif
1676 
1677  // Decrement the counter of hidden helper tasks to be executed
1678  if (taskdata->td_flags.hidden_helper) {
1679  // Hidden helper tasks can only be executed by hidden helper threads
1680  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1681  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1682  }
1683 
1684  // Proxy tasks are not handled by the runtime
1685  if (taskdata->td_flags.proxy != TASK_PROXY) {
1686  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1687  }
1688 
1689  // TODO: cancel tasks if the parallel region has also been cancelled
1690  // TODO: check if this sequence can be hoisted above __kmp_task_start
1691  // if cancellation has been enabled for this run ...
1692  if (UNLIKELY(__kmp_omp_cancellation)) {
1693  thread = __kmp_threads[gtid];
1694  kmp_team_t *this_team = thread->th.th_team;
1695  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1696  if ((taskgroup && taskgroup->cancel_request) ||
1697  (this_team->t.t_cancel_request == cancel_parallel)) {
1698 #if OMPT_SUPPORT && OMPT_OPTIONAL
1699  ompt_data_t *task_data;
1700  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1701  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1702  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1703  task_data,
1704  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1705  : ompt_cancel_parallel) |
1706  ompt_cancel_discarded_task,
1707  NULL);
1708  }
1709 #endif
1710  KMP_COUNT_BLOCK(TASK_cancelled);
1711  // this task belongs to a task group and we need to cancel it
1712  discard = 1 /* true */;
1713  }
1714  }
1715 
1716  // Invoke the task routine and pass in relevant data.
1717  // Thunks generated by gcc take a different argument list.
1718  if (!discard) {
1719  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1720  taskdata->td_last_tied = current_task->td_last_tied;
1721  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1722  }
1723 #if KMP_STATS_ENABLED
1724  KMP_COUNT_BLOCK(TASK_executed);
1725  switch (KMP_GET_THREAD_STATE()) {
1726  case FORK_JOIN_BARRIER:
1727  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1728  break;
1729  case PLAIN_BARRIER:
1730  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1731  break;
1732  case TASKYIELD:
1733  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1734  break;
1735  case TASKWAIT:
1736  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1737  break;
1738  case TASKGROUP:
1739  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1740  break;
1741  default:
1742  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1743  break;
1744  }
1745 #endif // KMP_STATS_ENABLED
1746 
1747 // OMPT task begin
1748 #if OMPT_SUPPORT
1749  if (UNLIKELY(ompt_enabled.enabled))
1750  __ompt_task_start(task, current_task, gtid);
1751 #endif
1752 #if OMPT_SUPPORT && OMPT_OPTIONAL
1753  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1754  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1755  ompt_data_t instance = ompt_data_none;
1756  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1757  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1758  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1759  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1760  ompt_dispatch_taskloop_chunk, instance);
1761  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1762  }
1763 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1764 
1765 #if OMPD_SUPPORT
1766  if (ompd_state & OMPD_ENABLE_BP)
1767  ompd_bp_task_begin();
1768 #endif
1769 
1770 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1771  kmp_uint64 cur_time;
1772  kmp_int32 kmp_itt_count_task =
1773  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1774  current_task->td_flags.tasktype == TASK_IMPLICIT;
1775  if (kmp_itt_count_task) {
1776  thread = __kmp_threads[gtid];
1777  // Time outer level explicit task on barrier for adjusting imbalance time
1778  if (thread->th.th_bar_arrive_time)
1779  cur_time = __itt_get_timestamp();
1780  else
1781  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1782  }
1783  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1784 #endif
1785 
1786  if (task->routine != NULL) {
1787 #ifdef KMP_GOMP_COMPAT
1788  if (taskdata->td_flags.native) {
1789  ((void (*)(void *))(*(task->routine)))(task->shareds);
1790  } else
1791 #endif /* KMP_GOMP_COMPAT */
1792  {
1793  (*(task->routine))(gtid, task);
1794  }
1795  }
1796  KMP_POP_PARTITIONED_TIMER();
1797 
1798 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1799  if (kmp_itt_count_task) {
1800  // Barrier imbalance - adjust arrive time with the task duration
1801  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1802  }
1803  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1804  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1805 #endif
1806  }
1807 
1808 #if OMPD_SUPPORT
1809  if (ompd_state & OMPD_ENABLE_BP)
1810  ompd_bp_task_end();
1811 #endif
1812 
1813  // Proxy tasks are not handled by the runtime
1814  if (taskdata->td_flags.proxy != TASK_PROXY) {
1815 #if OMPT_SUPPORT
1816  if (UNLIKELY(ompt_enabled.enabled)) {
1817  thread->th.ompt_thread_info = oldInfo;
1818  if (taskdata->td_flags.tiedness == TASK_TIED) {
1819  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1820  }
1821  __kmp_task_finish<true>(gtid, task, current_task);
1822  } else
1823 #endif
1824  __kmp_task_finish<false>(gtid, task, current_task);
1825  }
1826 
1827  KA_TRACE(
1828  30,
1829  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1830  gtid, taskdata, current_task));
1831  return;
1832 }
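// Illustrative sketch (not part of the runtime): the non-GOMP branch above
// expects the compiler-outlined entry point to match kmp_routine_entry_t,
// i.e. it is handed the global thread id and the kmp_task_t itself. A
// hypothetical outlined body might look like:
//
//   kmp_int32 task_entry_fn(kmp_int32 gtid, kmp_task_t *task) {
//     my_shareds_t *sh = (my_shareds_t *)task->shareds; // hypothetical type
//     *sh->px += 1;                                     // task body work
//     return 0;
//   }
//
// GOMP-compatible (td_flags.native) thunks instead receive only the shareds
// pointer and return void, as the cast in that branch shows.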
1833 
1834 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1835 //
1836 // loc_ref: location of original task pragma (ignored)
1837 // gtid: Global Thread ID of encountering thread
1838 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1839 // Returns:
1840 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1841 // be resumed later.
1842 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1843 // resumed later.
1844 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1845  kmp_task_t *new_task) {
1846  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1847 
1848  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1849  loc_ref, new_taskdata));
1850 
1851 #if OMPT_SUPPORT
1852  kmp_taskdata_t *parent;
1853  if (UNLIKELY(ompt_enabled.enabled)) {
1854  parent = new_taskdata->td_parent;
1855  if (ompt_enabled.ompt_callback_task_create) {
1856  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1857  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1858  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1859  OMPT_GET_RETURN_ADDRESS(0));
1860  }
1861  }
1862 #endif
1863 
1864  /* Should we execute the new task or queue it? For now, let's just always try
1865  to queue it. If the queue fills up, then we'll execute it. */
1866 
1867  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1868  { // Execute this task immediately
1869  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1870  new_taskdata->td_flags.task_serial = 1;
1871  __kmp_invoke_task(gtid, new_task, current_task);
1872  }
1873 
1874  KA_TRACE(
1875  10,
1876  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1877  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1878  gtid, loc_ref, new_taskdata));
1879 
1880 #if OMPT_SUPPORT
1881  if (UNLIKELY(ompt_enabled.enabled)) {
1882  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1883  }
1884 #endif
1885  return TASK_CURRENT_NOT_QUEUED;
1886 }
1887 
1888 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1889 //
1890 // gtid: Global Thread ID of encountering thread
1891 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1892 // serialize_immediate: if TRUE then if the task is executed immediately its
1893 // execution will be serialized
1894 // Returns:
1895 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1896 // be resumed later.
1897 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1898 // resumed later.
1899 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1900  bool serialize_immediate) {
1901  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1902 
1903  /* Should we execute the new task or queue it? For now, let's just always try
1904  to queue it. If the queue fills up, then we'll execute it. */
1905  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1906  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1907  { // Execute this task immediately
1908  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1909  if (serialize_immediate)
1910  new_taskdata->td_flags.task_serial = 1;
1911  __kmp_invoke_task(gtid, new_task, current_task);
1912  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1913  __kmp_wpolicy_passive) {
1914  kmp_info_t *this_thr = __kmp_threads[gtid];
1915  kmp_team_t *team = this_thr->th.th_team;
1916  kmp_int32 nthreads = this_thr->th.th_team_nproc;
1917  for (int i = 0; i < nthreads; ++i) {
1918  kmp_info_t *thread = team->t.t_threads[i];
1919  if (thread == this_thr)
1920  continue;
1921  if (thread->th.th_sleep_loc != NULL) {
1922  __kmp_null_resume_wrapper(thread);
1923  break; // awake one thread at a time
1924  }
1925  }
1926  }
1927  return TASK_CURRENT_NOT_QUEUED;
1928 }
1929 
1930 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1931 // non-thread-switchable task from the parent thread only!
1932 //
1933 // loc_ref: location of original task pragma (ignored)
1934 // gtid: Global Thread ID of encountering thread
1935 // new_task: non-thread-switchable task thunk allocated by
1936 // __kmp_omp_task_alloc()
1937 // Returns:
1938 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1939 // be resumed later.
1940 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1941 // resumed later.
1942 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1943  kmp_task_t *new_task) {
1944  kmp_int32 res;
1945  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1946 
1947 #if KMP_DEBUG || OMPT_SUPPORT
1948  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1949 #endif
1950  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1951  new_taskdata));
1952  __kmp_assert_valid_gtid(gtid);
1953 
1954 #if OMPT_SUPPORT
1955  kmp_taskdata_t *parent = NULL;
1956  if (UNLIKELY(ompt_enabled.enabled)) {
1957  if (!new_taskdata->td_flags.started) {
1958  OMPT_STORE_RETURN_ADDRESS(gtid);
1959  parent = new_taskdata->td_parent;
1960  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1961  parent->ompt_task_info.frame.enter_frame.ptr =
1962  OMPT_GET_FRAME_ADDRESS(0);
1963  }
1964  if (ompt_enabled.ompt_callback_task_create) {
1965  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1966  &(parent->ompt_task_info.task_data),
1967  &(parent->ompt_task_info.frame),
1968  &(new_taskdata->ompt_task_info.task_data),
1969  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1970  OMPT_LOAD_RETURN_ADDRESS(gtid));
1971  }
1972  } else {
1973  // We are scheduling the continuation of an UNTIED task.
1974  // Scheduling back to the parent task.
1975  __ompt_task_finish(new_task,
1976  new_taskdata->ompt_task_info.scheduling_parent,
1977  ompt_task_switch);
1978  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1979  }
1980  }
1981 #endif
1982 
1983  res = __kmp_omp_task(gtid, new_task, true);
1984 
1985  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1986  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1987  gtid, loc_ref, new_taskdata));
1988 #if OMPT_SUPPORT
1989  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1990  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1991  }
1992 #endif
1993  return res;
1994 }
1995 
1996 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1997 // a taskloop task with the correct OMPT return address
1998 //
1999 // loc_ref: location of original task pragma (ignored)
2000 // gtid: Global Thread ID of encountering thread
2001 // new_task: non-thread-switchable task thunk allocated by
2002 // __kmp_omp_task_alloc()
2003 // codeptr_ra: return address for OMPT callback
2004 // Returns:
2005 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2006 // be resumed later.
2007 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2008 // resumed later.
2009 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2010  kmp_task_t *new_task, void *codeptr_ra) {
2011  kmp_int32 res;
2012  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2013 
2014 #if KMP_DEBUG || OMPT_SUPPORT
2015  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2016 #endif
2017  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid,
2018  loc_ref, new_taskdata));
2019 
2020 #if OMPT_SUPPORT
2021  kmp_taskdata_t *parent = NULL;
2022  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2023  parent = new_taskdata->td_parent;
2024  if (!parent->ompt_task_info.frame.enter_frame.ptr)
2025  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2026  if (ompt_enabled.ompt_callback_task_create) {
2027  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2028  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2029  &(new_taskdata->ompt_task_info.task_data),
2030  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2031  codeptr_ra);
2032  }
2033  }
2034 #endif
2035 
2036  res = __kmp_omp_task(gtid, new_task, true);
2037 
2038  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
2039  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2040  gtid, loc_ref, new_taskdata));
2041 #if OMPT_SUPPORT
2042  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2043  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2044  }
2045 #endif
2046  return res;
2047 }
2048 
2049 template <bool ompt>
2050 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2051  void *frame_address,
2052  void *return_address) {
2053  kmp_taskdata_t *taskdata = nullptr;
2054  kmp_info_t *thread;
2055  int thread_finished = FALSE;
2056  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2057 
2058  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2059  KMP_DEBUG_ASSERT(gtid >= 0);
2060 
2061  if (__kmp_tasking_mode != tskm_immediate_exec) {
2062  thread = __kmp_threads[gtid];
2063  taskdata = thread->th.th_current_task;
2064 
2065 #if OMPT_SUPPORT && OMPT_OPTIONAL
2066  ompt_data_t *my_task_data;
2067  ompt_data_t *my_parallel_data;
2068 
2069  if (ompt) {
2070  my_task_data = &(taskdata->ompt_task_info.task_data);
2071  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2072 
2073  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2074 
2075  if (ompt_enabled.ompt_callback_sync_region) {
2076  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2077  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2078  my_task_data, return_address);
2079  }
2080 
2081  if (ompt_enabled.ompt_callback_sync_region_wait) {
2082  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2083  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2084  my_task_data, return_address);
2085  }
2086  }
2087 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2088 
2089 // Debugger: The taskwait is active. Store the location and the thread that
2090 // encountered the taskwait.
2091 #if USE_ITT_BUILD
2092 // Note: These values are used by ITT events as well.
2093 #endif /* USE_ITT_BUILD */
2094  taskdata->td_taskwait_counter += 1;
2095  taskdata->td_taskwait_ident = loc_ref;
2096  taskdata->td_taskwait_thread = gtid + 1;
2097 
2098 #if USE_ITT_BUILD
2099  void *itt_sync_obj = NULL;
2100 #if USE_ITT_NOTIFY
2101  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2102 #endif /* USE_ITT_NOTIFY */
2103 #endif /* USE_ITT_BUILD */
2104 
2105  bool must_wait =
2106  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2107 
2108  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2109  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2110  // If a hidden helper task has been encountered, we must wait here.
2111  must_wait =
2112  must_wait ||
2113  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2114  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2115 
2116  if (must_wait) {
2117  kmp_flag_32<false, false> flag(
2118  RCAST(std::atomic<kmp_uint32> *,
2119  &(taskdata->td_incomplete_child_tasks)),
2120  0U);
2121  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2122  flag.execute_tasks(thread, gtid, FALSE,
2123  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2124  __kmp_task_stealing_constraint);
2125  }
2126  }
2127 #if USE_ITT_BUILD
2128  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2129  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2130 #endif /* USE_ITT_BUILD */
2131 
2132  // Debugger: The taskwait is completed. Location remains, but thread is
2133  // negated.
2134  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2135 
2136 #if OMPT_SUPPORT && OMPT_OPTIONAL
2137  if (ompt) {
2138  if (ompt_enabled.ompt_callback_sync_region_wait) {
2139  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2140  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2141  my_task_data, return_address);
2142  }
2143  if (ompt_enabled.ompt_callback_sync_region) {
2144  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2145  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2146  my_task_data, return_address);
2147  }
2148  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2149  }
2150 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2151 
2152  }
2153 
2154  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2155  "returning TASK_CURRENT_NOT_QUEUED\n",
2156  gtid, taskdata));
2157 
2158  return TASK_CURRENT_NOT_QUEUED;
2159 }
2160 
2161 #if OMPT_SUPPORT && OMPT_OPTIONAL
2162 OMPT_NOINLINE
2163 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2164  void *frame_address,
2165  void *return_address) {
2166  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2167  return_address);
2168 }
2169 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2170 
2171 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2172 // complete
2173 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2174 #if OMPT_SUPPORT && OMPT_OPTIONAL
2175  if (UNLIKELY(ompt_enabled.enabled)) {
2176  OMPT_STORE_RETURN_ADDRESS(gtid);
2177  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2178  OMPT_LOAD_RETURN_ADDRESS(gtid));
2179  }
2180 #endif
2181  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2182 }
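// Illustrative sketch (not part of the runtime): a compiler typically lowers
// "#pragma omp taskwait" to a single call on the encountering thread,
//
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
//
// which executes other ready tasks while waiting until all children generated
// by the current task have completed, then returns TASK_CURRENT_NOT_QUEUED.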
2183 
2184 // __kmpc_omp_taskyield: switch to a different task
2185 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2186  kmp_taskdata_t *taskdata = NULL;
2187  kmp_info_t *thread;
2188  int thread_finished = FALSE;
2189 
2190  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2191  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2192 
2193  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2194  gtid, loc_ref, end_part));
2195  __kmp_assert_valid_gtid(gtid);
2196 
2197  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2198  thread = __kmp_threads[gtid];
2199  taskdata = thread->th.th_current_task;
2200 // Should we model this as a task wait or not?
2201 // Debugger: The taskwait is active. Store the location and the thread that
2202 // encountered the taskwait.
2203 #if USE_ITT_BUILD
2204 // Note: These values are used by ITT events as well.
2205 #endif /* USE_ITT_BUILD */
2206  taskdata->td_taskwait_counter += 1;
2207  taskdata->td_taskwait_ident = loc_ref;
2208  taskdata->td_taskwait_thread = gtid + 1;
2209 
2210 #if USE_ITT_BUILD
2211  void *itt_sync_obj = NULL;
2212 #if USE_ITT_NOTIFY
2213  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2214 #endif /* USE_ITT_NOTIFY */
2215 #endif /* USE_ITT_BUILD */
2216  if (!taskdata->td_flags.team_serial) {
2217  kmp_task_team_t *task_team = thread->th.th_task_team;
2218  if (task_team != NULL) {
2219  if (KMP_TASKING_ENABLED(task_team)) {
2220 #if OMPT_SUPPORT
2221  if (UNLIKELY(ompt_enabled.enabled))
2222  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2223 #endif
2224  __kmp_execute_tasks_32(
2225  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2226  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2227  __kmp_task_stealing_constraint);
2228 #if OMPT_SUPPORT
2229  if (UNLIKELY(ompt_enabled.enabled))
2230  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2231 #endif
2232  }
2233  }
2234  }
2235 #if USE_ITT_BUILD
2236  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2237 #endif /* USE_ITT_BUILD */
2238 
2239  // Debugger: The taskwait is completed. Location remains, but thread is
2240  // negated.
2241  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2242  }
2243 
2244  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2245  "returning TASK_CURRENT_NOT_QUEUED\n",
2246  gtid, taskdata));
2247 
2248  return TASK_CURRENT_NOT_QUEUED;
2249 }
2250 
2251 // Task Reduction implementation
2252 //
2253 // Note: the initial implementation did not take into account the possibility
2254 // of specifying omp_orig for the initializer of a UDR (user-defined reduction).
2255 // The corrected implementation takes the omp_orig object into account.
2256 // The compiler is free to use the old implementation if omp_orig is not
2257 // specified.
2257 
2266 typedef struct kmp_taskred_flags {
2268  unsigned lazy_priv : 1; // 1 - use lazy alloc/init (e.g., big objects, #tasks < #threads)
2269  unsigned reserved31 : 31;
2270 } kmp_taskred_flags_t;
2271 
2275 typedef struct kmp_task_red_input {
2276  void *reduce_shar; // shared between tasks item to reduce into
2277  size_t reduce_size; // size of data item in bytes
2278  // three compiler-generated routines (init, fini are optional):
2279  void *reduce_init; // data initialization routine (single parameter)
2280  void *reduce_fini; // data finalization routine
2281  void *reduce_comb; // data combiner routine
2282  kmp_taskred_flags_t flags; // flags for additional info from compiler
2283 } kmp_task_red_input_t;
2284 
2288 typedef struct kmp_taskred_data {
2289  void *reduce_shar; // shared between tasks item to reduce into
2290  size_t reduce_size; // size of data item
2291  kmp_taskred_flags_t flags; // flags for additional info from compiler
2292  void *reduce_priv; // array of thread-specific items
2293  void *reduce_pend; // end of private data for faster comparison op
2294  // three compiler-generated routines (init, fini are optional):
2295  void *reduce_comb; // data combiner routine
2296  void *reduce_init; // data initialization routine
2297  void *reduce_fini; // data finalization routine
2298  void *reduce_orig; // original item (can be used in UDR initializer)
2299 } kmp_taskred_data_t;
2300 
2306 typedef struct kmp_taskred_input {
2307  void *reduce_shar; // shared between tasks item to reduce into
2308  void *reduce_orig; // original reduction item used for initialization
2309  size_t reduce_size; // size of data item
2310  // three compiler-generated routines (init, fini are optional):
2311  void *reduce_init; // data initialization routine (two parameters)
2312  void *reduce_fini; // data finalization routine
2313  void *reduce_comb; // data combiner routine
2314  kmp_taskred_flags_t flags; // flags for additional info from compiler
2315 } kmp_taskred_input_t;
2320 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2321 template <>
2322 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2323  kmp_task_red_input_t &src) {
2324  item.reduce_orig = NULL;
2325 }
2326 template <>
2327 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2328  kmp_taskred_input_t &src) {
2329  if (src.reduce_orig != NULL) {
2330  item.reduce_orig = src.reduce_orig;
2331  } else {
2332  item.reduce_orig = src.reduce_shar;
2333  } // non-NULL reduce_orig means new interface used
2334 }
2335 
2336 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2337 template <>
2338 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2339  size_t offset) {
2340  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2341 }
2342 template <>
2343 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2344  size_t offset) {
2345  ((void (*)(void *, void *))item.reduce_init)(
2346  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2347 }
2348 
2349 template <typename T>
2350 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2351  __kmp_assert_valid_gtid(gtid);
2352  kmp_info_t *thread = __kmp_threads[gtid];
2353  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2354  kmp_uint32 nth = thread->th.th_team_nproc;
2355  kmp_taskred_data_t *arr;
2356 
2357  // check input data just in case
2358  KMP_ASSERT(tg != NULL);
2359  KMP_ASSERT(data != NULL);
2360  KMP_ASSERT(num > 0);
2361  if (nth == 1) {
2362  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2363  gtid, tg));
2364  return (void *)tg;
2365  }
2366  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2367  gtid, tg, num));
2368  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2369  thread, num * sizeof(kmp_taskred_data_t));
2370  for (int i = 0; i < num; ++i) {
2371  size_t size = data[i].reduce_size - 1;
2372  // round the size up to cache line per thread-specific item
2373  size += CACHE_LINE - size % CACHE_LINE;
2374  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2375  arr[i].reduce_shar = data[i].reduce_shar;
2376  arr[i].reduce_size = size;
2377  arr[i].flags = data[i].flags;
2378  arr[i].reduce_comb = data[i].reduce_comb;
2379  arr[i].reduce_init = data[i].reduce_init;
2380  arr[i].reduce_fini = data[i].reduce_fini;
2381  __kmp_assign_orig<T>(arr[i], data[i]);
2382  if (!arr[i].flags.lazy_priv) {
2383  // allocate cache-line aligned block and fill it with zeros
2384  arr[i].reduce_priv = __kmp_allocate(nth * size);
2385  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2386  if (arr[i].reduce_init != NULL) {
2387  // initialize all thread-specific items
2388  for (size_t j = 0; j < nth; ++j) {
2389  __kmp_call_init<T>(arr[i], j * size);
2390  }
2391  }
2392  } else {
2393  // only allocate space for pointers now,
2394  // objects will be lazily allocated/initialized if/when requested
2395  // note that __kmp_allocate zeroes the allocated memory
2396  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2397  }
2398  }
2399  tg->reduce_data = (void *)arr;
2400  tg->reduce_num_data = num;
2401  return (void *)tg;
2402 }
2403 
2418 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2419  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2420 }
2421 
2434 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2435  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2436 }
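// Illustrative sketch (not part of the runtime) of the descriptor a compiler
// might build for "#pragma omp taskgroup task_reduction(+: x)" using the new
// interface. The thunks red_init/red_comb, the int variable x, and the loc
// descriptor are hypothetical placeholders; the thunk signatures match what
// __kmp_call_init and __kmp_task_reduction_fini invoke:
//
//   static void red_init(void *priv, void *orig) { *(int *)priv = 0; }
//   static void red_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_taskgroup(&loc, gtid);            // reduction lives in a taskgroup
//   kmp_taskred_input_t item = {};
//   item.reduce_shar = &x;                   // shared item to reduce into
//   item.reduce_orig = &x;                   // omp_orig for UDR initializers
//   item.reduce_size = sizeof(int);
//   item.reduce_init = (void *)red_init;
//   item.reduce_comb = (void *)red_comb;     // reduce_fini stays NULL
//   void *tg = __kmpc_taskred_init(gtid, /*num=*/1, &item);
//   ... generate tasks that call __kmpc_task_reduction_get_th_data() ...
//   __kmpc_end_taskgroup(&loc, gtid);        // combines private copies into x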
2437 
2438 // Copy task reduction data (except for shared pointers).
2439 template <typename T>
2440 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2441  kmp_taskgroup_t *tg, void *reduce_data) {
2442  kmp_taskred_data_t *arr;
2443  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2444  " from data %p\n",
2445  thr, tg, reduce_data));
2446  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2447  thr, num * sizeof(kmp_taskred_data_t));
2448  // threads will share private copies, thunk routines, sizes, flags, etc.:
2449  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2450  for (int i = 0; i < num; ++i) {
2451  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2452  }
2453  tg->reduce_data = (void *)arr;
2454  tg->reduce_num_data = num;
2455 }
2456 
2466 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2467  __kmp_assert_valid_gtid(gtid);
2468  kmp_info_t *thread = __kmp_threads[gtid];
2469  kmp_int32 nth = thread->th.th_team_nproc;
2470  if (nth == 1)
2471  return data; // nothing to do
2472 
2473  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2474  if (tg == NULL)
2475  tg = thread->th.th_current_task->td_taskgroup;
2476  KMP_ASSERT(tg != NULL);
2477  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2478  kmp_int32 num = tg->reduce_num_data;
2479  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2480 
2481  KMP_ASSERT(data != NULL);
2482  while (tg != NULL) {
2483  for (int i = 0; i < num; ++i) {
2484  if (!arr[i].flags.lazy_priv) {
2485  if (data == arr[i].reduce_shar ||
2486  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2487  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2488  } else {
2489  // check shared location first
2490  void **p_priv = (void **)(arr[i].reduce_priv);
2491  if (data == arr[i].reduce_shar)
2492  goto found;
2493  // check if we get some thread specific location as parameter
2494  for (int j = 0; j < nth; ++j)
2495  if (data == p_priv[j])
2496  goto found;
2497  continue; // not found, continue search
2498  found:
2499  if (p_priv[tid] == NULL) {
2500  // allocate thread specific object lazily
2501  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2502  if (arr[i].reduce_init != NULL) {
2503  if (arr[i].reduce_orig != NULL) { // new interface
2504  ((void (*)(void *, void *))arr[i].reduce_init)(
2505  p_priv[tid], arr[i].reduce_orig);
2506  } else { // old interface (single parameter)
2507  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2508  }
2509  }
2510  }
2511  return p_priv[tid];
2512  }
2513  }
2514  tg = tg->parent;
2515  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2516  num = tg->reduce_num_data;
2517  }
2518  KMP_ASSERT2(0, "Unknown task reduction item");
2519  return NULL; // ERROR, this line never executed
2520 }
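// Illustrative sketch (not part of the runtime): inside each participating
// task body the compiler fetches the thread-specific copy before updating it
// (x and tg are the hypothetical reduction variable and taskgroup handle from
// the sketch above; passing NULL instead of tg makes the runtime start from
// the current task's innermost taskgroup and walk outward):
//
//   int *px = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &x);
//   *px += partial_result;   // accumulate into the thread-specific copy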
2521 
2522 // Finalize task reduction.
2523 // Called from __kmpc_end_taskgroup()
2524 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2525  kmp_int32 nth = th->th.th_team_nproc;
2526  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2527  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2528  kmp_int32 num = tg->reduce_num_data;
2529  for (int i = 0; i < num; ++i) {
2530  void *sh_data = arr[i].reduce_shar;
2531  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2532  void (*f_comb)(void *, void *) =
2533  (void (*)(void *, void *))(arr[i].reduce_comb);
2534  if (!arr[i].flags.lazy_priv) {
2535  void *pr_data = arr[i].reduce_priv;
2536  size_t size = arr[i].reduce_size;
2537  for (int j = 0; j < nth; ++j) {
2538  void *priv_data = (char *)pr_data + j * size;
2539  f_comb(sh_data, priv_data); // combine results
2540  if (f_fini)
2541  f_fini(priv_data); // finalize if needed
2542  }
2543  } else {
2544  void **pr_data = (void **)(arr[i].reduce_priv);
2545  for (int j = 0; j < nth; ++j) {
2546  if (pr_data[j] != NULL) {
2547  f_comb(sh_data, pr_data[j]); // combine results
2548  if (f_fini)
2549  f_fini(pr_data[j]); // finalize if needed
2550  __kmp_free(pr_data[j]);
2551  }
2552  }
2553  }
2554  __kmp_free(arr[i].reduce_priv);
2555  }
2556  __kmp_thread_free(th, arr);
2557  tg->reduce_data = NULL;
2558  tg->reduce_num_data = 0;
2559 }
2560 
2561 // Cleanup task reduction data for parallel or worksharing,
2562 // do not touch task private data other threads still working with.
2563 // Called from __kmpc_end_taskgroup()
2564 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2565  __kmp_thread_free(th, tg->reduce_data);
2566  tg->reduce_data = NULL;
2567  tg->reduce_num_data = 0;
2568 }
2569 
2570 template <typename T>
2571 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2572  int num, T *data) {
2573  __kmp_assert_valid_gtid(gtid);
2574  kmp_info_t *thr = __kmp_threads[gtid];
2575  kmp_int32 nth = thr->th.th_team_nproc;
2576  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2577  if (nth == 1) {
2578  KA_TRACE(10,
2579  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2580  gtid, thr->th.th_current_task->td_taskgroup));
2581  return (void *)thr->th.th_current_task->td_taskgroup;
2582  }
2583  kmp_team_t *team = thr->th.th_team;
2584  void *reduce_data;
2585  kmp_taskgroup_t *tg;
2586  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2587  if (reduce_data == NULL &&
2588  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2589  (void *)1)) {
2590  // single thread enters this block to initialize common reduction data
2591  KMP_DEBUG_ASSERT(reduce_data == NULL);
2592  // first initialize own data, then make a copy other threads can use
2593  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2594  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2595  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2596  // fini counters should be 0 at this point
2597  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2598  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2599  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2600  } else {
2601  while (
2602  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2603  (void *)1) { // wait for task reduction initialization
2604  KMP_CPU_PAUSE();
2605  }
2606  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2607  tg = thr->th.th_current_task->td_taskgroup;
2608  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2609  }
2610  return tg;
2611 }
2612 
2629 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2630  int num, void *data) {
2631  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2632  (kmp_task_red_input_t *)data);
2633 }
2634 
2649 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2650  void *data) {
2651  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2652  (kmp_taskred_input_t *)data);
2653 }
2654 
2663 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2664  __kmpc_end_taskgroup(loc, gtid);
2665 }
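// Illustrative sketch (not part of the runtime): for a reduction with the
// task modifier, e.g. "#pragma omp parallel reduction(task, +: x)", every
// thread of the region brackets its portion of the work with the modifier
// entry points; is_ws selects the parallel (0) vs. worksharing (1) slot in
// the team's bookkeeping. items is a hypothetical kmp_taskred_input_t array
// filled as in the earlier task_reduction sketch:
//
//   void *tg = __kmpc_taskred_modifier_init(&loc, gtid, /*is_ws=*/0,
//                                           /*num=*/1, items);
//   ... region body; tasks use __kmpc_task_reduction_get_th_data() ...
//   __kmpc_task_reduction_modifier_fini(&loc, gtid, /*is_ws=*/0);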
2666 
2667 // __kmpc_taskgroup: Start a new taskgroup
2668 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2669  __kmp_assert_valid_gtid(gtid);
2670  kmp_info_t *thread = __kmp_threads[gtid];
2671  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2672  kmp_taskgroup_t *tg_new =
2673  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2674  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2675  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2676  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2677  tg_new->parent = taskdata->td_taskgroup;
2678  tg_new->reduce_data = NULL;
2679  tg_new->reduce_num_data = 0;
2680  tg_new->gomp_data = NULL;
2681  taskdata->td_taskgroup = tg_new;
2682 
2683 #if OMPT_SUPPORT && OMPT_OPTIONAL
2684  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2685  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2686  if (!codeptr)
2687  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2688  kmp_team_t *team = thread->th.th_team;
2689  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2690  // FIXME: I think this is wrong for lwt!
2691  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2692 
2693  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2694  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2695  &(my_task_data), codeptr);
2696  }
2697 #endif
2698 }
2699 
2700 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2701 // and its descendants are complete
2702 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2703  __kmp_assert_valid_gtid(gtid);
2704  kmp_info_t *thread = __kmp_threads[gtid];
2705  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2706  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2707  int thread_finished = FALSE;
2708 
2709 #if OMPT_SUPPORT && OMPT_OPTIONAL
2710  kmp_team_t *team;
2711  ompt_data_t my_task_data;
2712  ompt_data_t my_parallel_data;
2713  void *codeptr = nullptr;
2714  if (UNLIKELY(ompt_enabled.enabled)) {
2715  team = thread->th.th_team;
2716  my_task_data = taskdata->ompt_task_info.task_data;
2717  // FIXME: I think this is wrong for lwt!
2718  my_parallel_data = team->t.ompt_team_info.parallel_data;
2719  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2720  if (!codeptr)
2721  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2722  }
2723 #endif
2724 
2725  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2726  KMP_DEBUG_ASSERT(taskgroup != NULL);
2727  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2728 
2729  if (__kmp_tasking_mode != tskm_immediate_exec) {
2730  // mark task as waiting not on a barrier
2731  taskdata->td_taskwait_counter += 1;
2732  taskdata->td_taskwait_ident = loc;
2733  taskdata->td_taskwait_thread = gtid + 1;
2734 #if USE_ITT_BUILD
2735  // For ITT the taskgroup wait is similar to taskwait until we need to
2736  // distinguish them
2737  void *itt_sync_obj = NULL;
2738 #if USE_ITT_NOTIFY
2739  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2740 #endif /* USE_ITT_NOTIFY */
2741 #endif /* USE_ITT_BUILD */
2742 
2743 #if OMPT_SUPPORT && OMPT_OPTIONAL
2744  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2745  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2746  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2747  &(my_task_data), codeptr);
2748  }
2749 #endif
2750 
2751  if (!taskdata->td_flags.team_serial ||
2752  (thread->th.th_task_team != NULL &&
2753  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2754  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2755  kmp_flag_32<false, false> flag(
2756  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2757  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2758  flag.execute_tasks(thread, gtid, FALSE,
2759  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2760  __kmp_task_stealing_constraint);
2761  }
2762  }
2763  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2764 
2765 #if OMPT_SUPPORT && OMPT_OPTIONAL
2766  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2767  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2768  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2769  &(my_task_data), codeptr);
2770  }
2771 #endif
2772 
2773 #if USE_ITT_BUILD
2774  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2775  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2776 #endif /* USE_ITT_BUILD */
2777  }
2778  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2779 
2780  if (taskgroup->reduce_data != NULL &&
2781  !taskgroup->gomp_data) { // need to reduce?
2782  int cnt;
2783  void *reduce_data;
2784  kmp_team_t *t = thread->th.th_team;
2785  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2786  // check if the <priv> data of the first reduction variable is shared with the team
2787  void *priv0 = arr[0].reduce_priv;
2788  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2789  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2790  // finishing task reduction on parallel
2791  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2792  if (cnt == thread->th.th_team_nproc - 1) {
2793  // we are the last thread passing __kmpc_reduction_modifier_fini()
2794  // finalize task reduction:
2795  __kmp_task_reduction_fini(thread, taskgroup);
2796  // cleanup fields in the team structure:
2797  // TODO: is relaxed store enough here (whole barrier should follow)?
2798  __kmp_thread_free(thread, reduce_data);
2799  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2800  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2801  } else {
2802  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2803  // so do not finalize reduction, just clean own copy of the data
2804  __kmp_task_reduction_clean(thread, taskgroup);
2805  }
2806  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2807  NULL &&
2808  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2809  // finishing task reduction on worksharing
2810  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2811  if (cnt == thread->th.th_team_nproc - 1) {
2812  // we are the last thread passing __kmpc_reduction_modifier_fini()
2813  __kmp_task_reduction_fini(thread, taskgroup);
2814  // cleanup fields in team structure:
2815  // TODO: is relaxed store enough here (whole barrier should follow)?
2816  __kmp_thread_free(thread, reduce_data);
2817  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2818  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2819  } else {
2820  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2821  // so do not finalize reduction, just clean own copy of the data
2822  __kmp_task_reduction_clean(thread, taskgroup);
2823  }
2824  } else {
2825  // finishing task reduction on taskgroup
2826  __kmp_task_reduction_fini(thread, taskgroup);
2827  }
2828  }
2829  // Restore parent taskgroup for the current task
2830  taskdata->td_taskgroup = taskgroup->parent;
2831  __kmp_thread_free(thread, taskgroup);
2832 
2833  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2834  gtid, taskdata));
2835 
2836 #if OMPT_SUPPORT && OMPT_OPTIONAL
2837  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2838  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2839  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2840  &(my_task_data), codeptr);
2841  }
2842 #endif
2843 }
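// Illustrative sketch (not part of the runtime): a compiler brackets the body
// of "#pragma omp taskgroup" with the pair of entry points above,
//
//   __kmpc_taskgroup(&loc, gtid);
//   ... generate tasks; each records the taskgroup in taskdata->td_taskgroup ...
//   __kmpc_end_taskgroup(&loc, gtid);  // waits for those tasks and descendants
//
// and __kmpc_end_taskgroup additionally finalizes any task reduction attached
// to the taskgroup, as shown above.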
2844 
2845 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
2846  kmp_task_team_t *task_team,
2847  kmp_int32 is_constrained) {
2848  kmp_task_t *task = NULL;
2849  kmp_taskdata_t *taskdata;
2850  kmp_taskdata_t *current;
2851  kmp_thread_data_t *thread_data;
2852  int ntasks = task_team->tt.tt_num_task_pri;
2853  if (ntasks == 0) {
2854  KA_TRACE(
2855  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2856  return NULL;
2857  }
2858  do {
2859  // decrement num_tasks to "reserve" one task for us to execute
2860  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2861  ntasks - 1))
2862  break;
2863  } while (ntasks > 0);
2864  if (ntasks == 0) {
2865  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2866  __kmp_get_gtid()));
2867  return NULL;
2868  }
2869  // We got a "ticket" to get a "reserved" priority task
2870  int deque_ntasks;
2871  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2872  do {
2873  KMP_ASSERT(list != NULL);
2874  thread_data = &list->td;
2875  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2876  deque_ntasks = thread_data->td.td_deque_ntasks;
2877  if (deque_ntasks == 0) {
2878  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2879  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2880  __kmp_get_gtid(), thread_data));
2881  list = list->next;
2882  }
2883  } while (deque_ntasks == 0);
2884  KMP_DEBUG_ASSERT(deque_ntasks);
2885  int target = thread_data->td.td_deque_head;
2886  current = __kmp_threads[gtid]->th.th_current_task;
2887  taskdata = thread_data->td.td_deque[target];
2888  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2889  // Bump head pointer and Wrap.
2890  thread_data->td.td_deque_head =
2891  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2892  } else {
2893  if (!task_team->tt.tt_untied_task_encountered) {
2894  // The TSC does not allow stealing the victim task
2895  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2896  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2897  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2898  gtid, thread_data, task_team, deque_ntasks, target,
2899  thread_data->td.td_deque_tail));
2900  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2901  return NULL;
2902  }
2903  int i;
2904  // walk through the deque trying to steal any task
2905  taskdata = NULL;
2906  for (i = 1; i < deque_ntasks; ++i) {
2907  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2908  taskdata = thread_data->td.td_deque[target];
2909  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2910  break; // found task to execute
2911  } else {
2912  taskdata = NULL;
2913  }
2914  }
2915  if (taskdata == NULL) {
2916  // No appropriate candidate found to execute
2917  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2918  KA_TRACE(
2919  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2920  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2921  gtid, thread_data, task_team, deque_ntasks,
2922  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2923  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2924  return NULL;
2925  }
2926  int prev = target;
2927  for (i = i + 1; i < deque_ntasks; ++i) {
2928  // shift remaining tasks in the deque left by 1
2929  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2930  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2931  prev = target;
2932  }
2933  KMP_DEBUG_ASSERT(
2934  thread_data->td.td_deque_tail ==
2935  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2936  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
2937  }
2938  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2939  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2940  task = KMP_TASKDATA_TO_TASK(taskdata);
2941  return task;
2942 }
2943 
2944 // __kmp_remove_my_task: remove a task from my own deque
2945 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2946  kmp_task_team_t *task_team,
2947  kmp_int32 is_constrained) {
2948  kmp_task_t *task;
2949  kmp_taskdata_t *taskdata;
2950  kmp_thread_data_t *thread_data;
2951  kmp_uint32 tail;
2952 
2953  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2954  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2955  NULL); // Caller should check this condition
2956 
2957  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2958 
2959  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2960  gtid, thread_data->td.td_deque_ntasks,
2961  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2962 
2963  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2964  KA_TRACE(10,
2965  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2966  "ntasks=%d head=%u tail=%u\n",
2967  gtid, thread_data->td.td_deque_ntasks,
2968  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2969  return NULL;
2970  }
2971 
2972  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2973 
2974  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2975  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2976  KA_TRACE(10,
2977  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2978  "ntasks=%d head=%u tail=%u\n",
2979  gtid, thread_data->td.td_deque_ntasks,
2980  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2981  return NULL;
2982  }
2983 
2984  tail = (thread_data->td.td_deque_tail - 1) &
2985  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2986  taskdata = thread_data->td.td_deque[tail];
2987 
2988  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2989  thread->th.th_current_task)) {
2990  // The TSC does not allow stealing the victim task
2991  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2992  KA_TRACE(10,
2993  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2994  "ntasks=%d head=%u tail=%u\n",
2995  gtid, thread_data->td.td_deque_ntasks,
2996  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2997  return NULL;
2998  }
2999 
3000  thread_data->td.td_deque_tail = tail;
3001  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3002 
3003  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3004 
3005  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3006  "ntasks=%d head=%u tail=%u\n",
3007  gtid, taskdata, thread_data->td.td_deque_ntasks,
3008  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3009 
3010  task = KMP_TASKDATA_TO_TASK(taskdata);
3011  return task;
3012 }
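
// Editorial sketch: the owner pops newest-first from the tail of its own
// deque (as in __kmp_remove_my_task above), while a thief takes oldest-first
// from the head (as in __kmp_steal_task below). Both ends wrap with a
// power-of-two mask. Hypothetical names; not runtime code.
struct example_deque {
  void *slots[256]; // size is a power of two so (size - 1) works as a mask
  unsigned head, tail, ntasks;
};

static void *example_pop_tail(example_deque *d) { // owner side (LIFO)
  if (d->ntasks == 0)
    return nullptr;
  d->tail = (d->tail - 1) & 255u; // wrap, mirroring TASK_DEQUE_MASK usage
  d->ntasks--;
  return d->slots[d->tail];
}

static void *example_take_head(example_deque *d) { // thief side (FIFO)
  if (d->ntasks == 0)
    return nullptr;
  void *t = d->slots[d->head];
  d->head = (d->head + 1) & 255u;
  d->ntasks--;
  return t;
}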
3013 
3014 // __kmp_steal_task: remove a task from another thread's deque
3015 // Assumes that the calling thread has already checked the existence of the
3016 // task_team thread_data before calling this routine.
3017 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
3018  kmp_task_team_t *task_team,
3019  std::atomic<kmp_int32> *unfinished_threads,
3020  int *thread_finished,
3021  kmp_int32 is_constrained) {
3022  kmp_task_t *task;
3023  kmp_taskdata_t *taskdata;
3024  kmp_taskdata_t *current;
3025  kmp_thread_data_t *victim_td, *threads_data;
3026  kmp_int32 target;
3027  kmp_int32 victim_tid;
3028 
3029  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3030 
3031  threads_data = task_team->tt.tt_threads_data;
3032  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3033 
3034  victim_tid = victim_thr->th.th_info.ds.ds_tid;
3035  victim_td = &threads_data[victim_tid];
3036 
3037  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3038  "task_team=%p ntasks=%d head=%u tail=%u\n",
3039  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3040  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3041  victim_td->td.td_deque_tail));
3042 
3043  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3044  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3045  "task_team=%p ntasks=%d head=%u tail=%u\n",
3046  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3047  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3048  victim_td->td.td_deque_tail));
3049  return NULL;
3050  }
3051 
3052  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3053 
3054  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3055  // Check again after we acquire the lock
3056  if (ntasks == 0) {
3057  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3058  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3059  "task_team=%p ntasks=%d head=%u tail=%u\n",
3060  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3061  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3062  return NULL;
3063  }
3064 
3065  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3066  current = __kmp_threads[gtid]->th.th_current_task;
3067  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3068  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3069  // Bump head pointer and Wrap.
3070  victim_td->td.td_deque_head =
3071  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3072  } else {
3073  if (!task_team->tt.tt_untied_task_encountered) {
3074  // The TSC does not allow stealing the victim task
3075  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3076  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3077  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3078  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3079  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3080  return NULL;
3081  }
3082  int i;
3083  // walk through victim's deque trying to steal any task
3084  target = victim_td->td.td_deque_head;
3085  taskdata = NULL;
3086  for (i = 1; i < ntasks; ++i) {
3087  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3088  taskdata = victim_td->td.td_deque[target];
3089  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3090  break; // found victim task
3091  } else {
3092  taskdata = NULL;
3093  }
3094  }
3095  if (taskdata == NULL) {
3096  // No appropriate candidate to steal found
3097  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3098  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3099  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3100  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3101  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3102  return NULL;
3103  }
3104  int prev = target;
3105  for (i = i + 1; i < ntasks; ++i) {
3106  // shift remaining tasks in the deque left by 1
3107  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3108  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3109  prev = target;
3110  }
3111  KMP_DEBUG_ASSERT(
3112  victim_td->td.td_deque_tail ==
3113  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3114  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3115  }
3116  if (*thread_finished) {
3117  // We need to un-mark this victim as a finished victim. This must be done
3118  // before releasing the lock, or else other threads (starting with the
3119  // primary thread victim) might be prematurely released from the barrier!!!
3120 #if KMP_DEBUG
3121  kmp_int32 count =
3122 #endif
3123  KMP_ATOMIC_INC(unfinished_threads);
3124  KA_TRACE(
3125  20,
3126  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3127  gtid, count + 1, task_team));
3128  *thread_finished = FALSE;
3129  }
3130  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3131 
3132  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3133 
3134  KMP_COUNT_BLOCK(TASK_stolen);
3135  KA_TRACE(10,
3136  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3137  "task_team=%p ntasks=%d head=%u tail=%u\n",
3138  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3139  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3140 
3141  task = KMP_TASKDATA_TO_TASK(taskdata);
3142  return task;
3143 }
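
// Editorial sketch: the "un-mark finished victim" step above. A thief that
// already decremented the unfinished-thread counter must add itself back
// before releasing the victim's deque lock, or the barrier release could see
// the counter hit zero too early. Hypothetical names; not runtime code.
#include <atomic>

static void example_unmark_finished(std::atomic<int> *unfinished_threads,
                                    int *thread_finished) {
  if (*thread_finished) {
    unfinished_threads->fetch_add(1); // undo the earlier decrement
    *thread_finished = 0;             // FALSE: this thread is active again
  }
}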
3144 
3145 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3146 // condition is satisfied (return true) or there are none left (return false).
3147 //
3148 // final_spin is TRUE if this is the spin at the release barrier.
3149 // thread_finished indicates whether the thread is finished executing all
3150 // the tasks it has on its deque, and is at the release barrier.
3151 // spinner is the location on which to spin.
3152 // spinner == NULL means only execute a single task and return.
3153 // checker is the value to check to terminate the spin.
3154 template <class C>
3155 static inline int __kmp_execute_tasks_template(
3156  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3157  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3158  kmp_int32 is_constrained) {
3159  kmp_task_team_t *task_team = thread->th.th_task_team;
3160  kmp_thread_data_t *threads_data;
3161  kmp_task_t *task;
3162  kmp_info_t *other_thread;
3163  kmp_taskdata_t *current_task = thread->th.th_current_task;
3164  std::atomic<kmp_int32> *unfinished_threads;
3165  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3166  tid = thread->th.th_info.ds.ds_tid;
3167 
3168  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3169  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3170 
3171  if (task_team == NULL || current_task == NULL)
3172  return FALSE;
3173 
3174  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3175  "*thread_finished=%d\n",
3176  gtid, final_spin, *thread_finished));
3177 
3178  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3179  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3180 
3181  KMP_DEBUG_ASSERT(threads_data != NULL);
3182 
3183  nthreads = task_team->tt.tt_nproc;
3184  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3185  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
3186  task_team->tt.tt_hidden_helper_task_encountered);
3187  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3188 
3189  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3190  // getting tasks from target constructs
3191  while (1) { // Inner loop to find a task and execute it
3192  task = NULL;
3193  if (task_team->tt.tt_num_task_pri) { // get priority task first
3194  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3195  }
3196  if (task == NULL && use_own_tasks) { // check own queue next
3197  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3198  }
3199  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3200  int asleep = 1;
3201  use_own_tasks = 0;
3202  // Try to steal from the last place I stole from successfully.
3203  if (victim_tid == -2) { // haven't stolen anything yet
3204  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3205  if (victim_tid !=
3206  -1) // if we have a last stolen from victim, get the thread
3207  other_thread = threads_data[victim_tid].td.td_thr;
3208  }
3209  if (victim_tid != -1) { // found last victim
3210  asleep = 0;
3211  } else if (!new_victim) { // no recent steals and we haven't already
3212  // used a new victim; select a random thread
3213  do { // Find a different thread to steal work from.
3214  // Pick a random thread. Initial plan was to cycle through all the
3215  // threads, and only return if we tried to steal from every thread,
3216  // and failed. Arch says that's not such a great idea.
3217  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3218  if (victim_tid >= tid) {
3219  ++victim_tid; // Adjusts random distribution to exclude self
3220  }
3221  // Found a potential victim
3222  other_thread = threads_data[victim_tid].td.td_thr;
3223  // There is a slight chance that __kmp_enable_tasking() did not wake
3224  // up all threads waiting at the barrier. If victim is sleeping,
3225  // then wake it up. Since we were going to pay the cache miss
3226  // penalty for referencing another thread's kmp_info_t struct
3227  // anyway,
3228  // the check shouldn't cost too much performance at this point. In
3229  // extra barrier mode, tasks do not sleep at the separate tasking
3230  // barrier, so this isn't a problem.
3231  asleep = 0;
3232  if ((__kmp_tasking_mode == tskm_task_teams) &&
3233  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3234  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3235  NULL)) {
3236  asleep = 1;
3237  __kmp_null_resume_wrapper(other_thread);
3238  // A sleeping thread should not have any tasks on its queue.
3239  // There is a slight possibility that it resumes, steals a task
3240  // from another thread, which spawns more tasks, all in the time
3241  // that it takes this thread to check => don't write an assertion
3242  // that the victim's queue is empty. Try stealing from a
3243  // different thread.
3244  }
3245  } while (asleep);
3246  }
3247 
3248  if (!asleep) {
3249  // We have a victim to try to steal from
3250  task = __kmp_steal_task(other_thread, gtid, task_team,
3251  unfinished_threads, thread_finished,
3252  is_constrained);
3253  }
3254  if (task != NULL) { // set last stolen to victim
3255  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3256  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3257  // The pre-refactored code did not try more than 1 successful new
3258  // victim, unless the last one generated more local tasks;
3259  // new_victim keeps track of this
3260  new_victim = 1;
3261  }
3262  } else { // No tasks found; unset last_stolen
3263  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3264  victim_tid = -2; // no successful victim found
3265  }
3266  }
3267 
3268  if (task == NULL)
3269  break; // break out of tasking loop
3270 
3271 // Found a task; execute it
3272 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3273  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3274  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3275  // get the object reliably
3276  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3277  }
3278  __kmp_itt_task_starting(itt_sync_obj);
3279  }
3280 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3281  __kmp_invoke_task(gtid, task, current_task);
3282 #if USE_ITT_BUILD
3283  if (itt_sync_obj != NULL)
3284  __kmp_itt_task_finished(itt_sync_obj);
3285 #endif /* USE_ITT_BUILD */
3286  // If this thread is only partway through the barrier and the condition is
3287  // met, then return now, so that the barrier gather/release pattern can
3288  // proceed. If this thread is in the last spin loop in the barrier,
3289  // waiting to be released, we know that the termination condition will not
3290  // be satisfied, so don't waste any cycles checking it.
3291  if (flag == NULL || (!final_spin && flag->done_check())) {
3292  KA_TRACE(
3293  15,
3294  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3295  gtid));
3296  return TRUE;
3297  }
3298  if (thread->th.th_task_team == NULL) {
3299  break;
3300  }
3301  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3302  // If execution of a stolen task results in more tasks being placed on our
3303  // run queue, reset use_own_tasks
3304  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3305  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3306  "other tasks, restart\n",
3307  gtid));
3308  use_own_tasks = 1;
3309  new_victim = 0;
3310  }
3311  }
3312 
3313  // The task source has been exhausted. If in final spin loop of barrier,
3314  // check if termination condition is satisfied. The work queue may be empty
3315  // but there might be proxy tasks still executing.
3316  if (final_spin &&
3317  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3318  // First, decrement the #unfinished threads, if that has not already been
3319  // done. This decrement might be to the spin location, and result in the
3320  // termination condition being satisfied.
3321  if (!*thread_finished) {
3322 #if KMP_DEBUG
3323  kmp_int32 count = -1 +
3324 #endif
3325  KMP_ATOMIC_DEC(unfinished_threads);
3326  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3327  "unfinished_threads to %d task_team=%p\n",
3328  gtid, count, task_team));
3329  *thread_finished = TRUE;
3330  }
3331 
3332  // It is now unsafe to reference thread->th.th_team !!!
3333  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3334  // thread to pass through the barrier, where it might reset each thread's
3335  // th.th_team field for the next parallel region. If we can steal more
3336  // work, we know that this has not happened yet.
3337  if (flag != NULL && flag->done_check()) {
3338  KA_TRACE(
3339  15,
3340  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3341  gtid));
3342  return TRUE;
3343  }
3344  }
3345 
3346  // If this thread's task team is NULL, primary thread has recognized that
3347  // there are no more tasks; bail out
3348  if (thread->th.th_task_team == NULL) {
3349  KA_TRACE(15,
3350  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3351  return FALSE;
3352  }
3353 
3354  // Check the flag again to see if it is already done, to avoid being trapped
3355  // in an infinite loop when an if0 task depends on a hidden helper task
3356  // outside any parallel region. Detached tasks are not impacted in this case
3357  // because the only thread executing this function has to execute the proxy
3358  // task, so it is in another code path that has the same check.
3359  if (flag == NULL || (!final_spin && flag->done_check())) {
3360  KA_TRACE(15,
3361  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3362  gtid));
3363  return TRUE;
3364  }
3365 
3366  // We could be getting tasks from target constructs; if this is the only
3367  // thread, keep trying to execute tasks from own queue
3368  if (nthreads == 1 &&
3369  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3370  use_own_tasks = 1;
3371  else {
3372  KA_TRACE(15,
3373  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3374  return FALSE;
3375  }
3376  }
3377 }
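
// Editorial sketch: the victim selection above draws a uniform random index
// over the other nthreads - 1 threads by sampling in [0, nthreads - 2] and
// bumping any value >= the caller's own tid. Hypothetical names; the caller
// is assumed to guarantee nthreads > 1, as the code above does.
static int example_pick_victim(int my_tid, int nthreads, unsigned rand_val) {
  int victim = (int)(rand_val % (unsigned)(nthreads - 1)); // 0..nthreads-2
  if (victim >= my_tid)
    ++victim; // skip self; the distribution stays uniform over the others
  return victim;
}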
3378 
3379 template <bool C, bool S>
3380 int __kmp_execute_tasks_32(
3381  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3382  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3383  kmp_int32 is_constrained) {
3384  return __kmp_execute_tasks_template(
3385  thread, gtid, flag, final_spin,
3386  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3387 }
3388 
3389 template <bool C, bool S>
3390 int __kmp_execute_tasks_64(
3391  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3392  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3393  kmp_int32 is_constrained) {
3394  return __kmp_execute_tasks_template(
3395  thread, gtid, flag, final_spin,
3396  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3397 }
3398 
3399 template <bool C, bool S>
3400 int __kmp_atomic_execute_tasks_64(
3401  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3402  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3403  kmp_int32 is_constrained) {
3404  return __kmp_execute_tasks_template(
3405  thread, gtid, flag, final_spin,
3406  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3407 }
3408 
3409 int __kmp_execute_tasks_oncore(
3410  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3411  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3412  kmp_int32 is_constrained) {
3413  return __kmp_execute_tasks_template(
3414  thread, gtid, flag, final_spin,
3415  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3416 }
3417 
3418 template int
3419 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3420  kmp_flag_32<false, false> *, int,
3421  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3422 
3423 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3424  kmp_flag_64<false, true> *,
3425  int,
3426  int *USE_ITT_BUILD_ARG(void *),
3427  kmp_int32);
3428 
3429 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3430  kmp_flag_64<true, false> *,
3431  int,
3432  int *USE_ITT_BUILD_ARG(void *),
3433  kmp_int32);
3434 
3435 template int __kmp_atomic_execute_tasks_64<false, true>(
3436  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3437  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3438 
3439 template int __kmp_atomic_execute_tasks_64<true, false>(
3440  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3441  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3442 
3443 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3444 // next barrier so they can assist in executing enqueued tasks.
3445 // First thread in allocates the task team atomically.
3446 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3447  kmp_info_t *this_thr) {
3448  kmp_thread_data_t *threads_data;
3449  int nthreads, i, is_init_thread;
3450 
3451  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3452  __kmp_gtid_from_thread(this_thr)));
3453 
3454  KMP_DEBUG_ASSERT(task_team != NULL);
3455  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3456 
3457  nthreads = task_team->tt.tt_nproc;
3458  KMP_DEBUG_ASSERT(nthreads > 0);
3459  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3460 
3461  // Allocate or increase the size of threads_data if necessary
3462  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3463 
3464  if (!is_init_thread) {
3465  // Some other thread already set up the array.
3466  KA_TRACE(
3467  20,
3468  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3469  __kmp_gtid_from_thread(this_thr)));
3470  return;
3471  }
3472  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3473  KMP_DEBUG_ASSERT(threads_data != NULL);
3474 
3475  if (__kmp_tasking_mode == tskm_task_teams &&
3476  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3477  // Release any threads sleeping at the barrier, so that they can steal
3478  // tasks and execute them. In extra barrier mode, tasks do not sleep
3479  // at the separate tasking barrier, so this isn't a problem.
3480  for (i = 0; i < nthreads; i++) {
3481  void *sleep_loc;
3482  kmp_info_t *thread = threads_data[i].td.td_thr;
3483 
3484  if (i == this_thr->th.th_info.ds.ds_tid) {
3485  continue;
3486  }
3487  // Since we haven't locked the thread's suspend mutex at this
3488  // point, there is a small window where a thread might be putting
3489  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3490  // To work around this, __kmp_execute_tasks_template() periodically checks
3491  // to see if other threads are sleeping (using the same random mechanism
3492  // that is used for task stealing) and awakens them if they are.
3493  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3494  NULL) {
3495  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3496  __kmp_gtid_from_thread(this_thr),
3497  __kmp_gtid_from_thread(thread)));
3498  __kmp_null_resume_wrapper(thread);
3499  } else {
3500  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3501  __kmp_gtid_from_thread(this_thr),
3502  __kmp_gtid_from_thread(thread)));
3503  }
3504  }
3505  }
3506 
3507  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3508  __kmp_gtid_from_thread(this_thr)));
3509 }
3510 
3511 /* // TODO: Check the comment consistency
3512  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3513  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3514  * After a child thread checks into a barrier and calls __kmp_release() from
3515  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3516  * longer assume that the kmp_team_t structure is intact (at any moment, the
3517  * primary thread may exit the barrier code and free the team data structure,
3518  * and return the threads to the thread pool).
3519  *
3520  * This does not work with the tasking code, as the thread is still
3521  * expected to participate in the execution of any tasks that may have been
3522  * spawned by a member of the team, and the thread still needs access
3523  * to each thread in the team, so that it can steal work from it.
3524  *
3525  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3526  * counting mechanism, and is allocated by the primary thread before calling
3527  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3528  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3529  * of the kmp_task_team_t structs for consecutive barriers can overlap
3530  * (and will, unless the primary thread is the last thread to exit the barrier
3531  * release phase, which is not typical). The existence of such a struct is
3532  * useful outside the context of tasking.
3533  *
3534  * We currently use the existence of the threads array as an indicator that
3535  * tasks were spawned since the last barrier. If the structure is to be
3536  * useful outside the context of tasking, then this will have to change, but
3537  * not setting the field minimizes the performance impact of tasking on
3538  * barriers, when no explicit tasks were spawned (pushed, actually).
3539  */
3540 
3541 static kmp_task_team_t *__kmp_free_task_teams =
3542  NULL; // Free list for task_team data structures
3543 // Lock for task team data structures
3544 kmp_bootstrap_lock_t __kmp_task_team_lock =
3545  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3546 
3547 // __kmp_alloc_task_deque:
3548 // Allocates a task deque for a particular thread, and initializes the necessary
3549 // data structures relating to the deque. This only happens once per thread
3550 // per task team since task teams are recycled. No lock is needed during
3551 // allocation since each thread allocates its own deque.
3552 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3553  kmp_thread_data_t *thread_data) {
3554  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3555  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3556 
3557  // Initialize last stolen task field to "none"
3558  thread_data->td.td_deque_last_stolen = -1;
3559 
3560  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3561  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3562  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3563 
3564  KE_TRACE(
3565  10,
3566  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3567  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3568  // Allocate space for task deque, and zero the deque
3569  // Cannot use __kmp_thread_calloc() because threads not around for
3570  // kmp_reap_task_team( ).
3571  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3572  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3573  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3574 }
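
// Editorial note: the deque size is kept a power of two so that index
// wrapping reduces to a bitwise AND with (size - 1), which is what the
// TASK_DEQUE_MASK-based arithmetic throughout this file appears to rely on.
// A minimal sketch with hypothetical names:
static unsigned example_wrap_index(unsigned index, unsigned deque_size) {
  // deque_size is assumed to be a power of two, e.g. INITIAL_TASK_DEQUE_SIZE.
  return index & (deque_size - 1);
}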
3575 
3576 // __kmp_free_task_deque:
3577 // Deallocates a task deque for a particular thread. Happens at library
3578 // deallocation so don't need to reset all thread data fields.
3579 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3580  if (thread_data->td.td_deque != NULL) {
3581  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3582  TCW_4(thread_data->td.td_deque_ntasks, 0);
3583  __kmp_free(thread_data->td.td_deque);
3584  thread_data->td.td_deque = NULL;
3585  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3586  }
3587 
3588 #ifdef BUILD_TIED_TASK_STACK
3589  // GEH: Figure out what to do here for td_susp_tied_tasks
3590  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3591  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3592  }
3593 #endif // BUILD_TIED_TASK_STACK
3594 }
3595 
3596 // __kmp_realloc_task_threads_data:
3597 // Allocates a threads_data array for a task team, either by allocating an
3598 // initial array or enlarging an existing array. Only the first thread to get
3599 // the lock allocs or enlarges the array and re-initializes the array elements.
3600 // That thread returns "TRUE", the rest return "FALSE".
3601 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3602 // The current size is given by task_team -> tt.tt_max_threads.
3603 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3604  kmp_task_team_t *task_team) {
3605  kmp_thread_data_t **threads_data_p;
3606  kmp_int32 nthreads, maxthreads;
3607  int is_init_thread = FALSE;
3608 
3609  if (TCR_4(task_team->tt.tt_found_tasks)) {
3610  // Already reallocated and initialized.
3611  return FALSE;
3612  }
3613 
3614  threads_data_p = &task_team->tt.tt_threads_data;
3615  nthreads = task_team->tt.tt_nproc;
3616  maxthreads = task_team->tt.tt_max_threads;
3617 
3618  // All threads must lock when they encounter the first task of the implicit
3619  // task region to make sure threads_data fields are (re)initialized before
3620  // used.
3621  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3622 
3623  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3624  // first thread to enable tasking
3625  kmp_team_t *team = thread->th.th_team;
3626  int i;
3627 
3628  is_init_thread = TRUE;
3629  if (maxthreads < nthreads) {
3630 
3631  if (*threads_data_p != NULL) {
3632  kmp_thread_data_t *old_data = *threads_data_p;
3633  kmp_thread_data_t *new_data = NULL;
3634 
3635  KE_TRACE(
3636  10,
3637  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3638  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3639  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3640  // Reallocate threads_data to have more elements than current array
3641  // Cannot use __kmp_thread_realloc() because threads not around for
3642  // kmp_reap_task_team( ). Note all new array entries are initialized
3643  // to zero by __kmp_allocate().
3644  new_data = (kmp_thread_data_t *)__kmp_allocate(
3645  nthreads * sizeof(kmp_thread_data_t));
3646  // copy old data to new data
3647  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3648  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3649 
3650 #ifdef BUILD_TIED_TASK_STACK
3651  // GEH: Figure out if this is the right thing to do
3652  for (i = maxthreads; i < nthreads; i++) {
3653  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3654  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3655  }
3656 #endif // BUILD_TIED_TASK_STACK
3657  // Install the new data and free the old data
3658  (*threads_data_p) = new_data;
3659  __kmp_free(old_data);
3660  } else {
3661  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3662  "threads data for task_team %p, size = %d\n",
3663  __kmp_gtid_from_thread(thread), task_team, nthreads));
3664  // Make the initial allocate for threads_data array, and zero entries
3665  // Cannot use __kmp_thread_calloc() because threads not around for
3666  // kmp_reap_task_team( ).
3667  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3668  nthreads * sizeof(kmp_thread_data_t));
3669 #ifdef BUILD_TIED_TASK_STACK
3670  // GEH: Figure out if this is the right thing to do
3671  for (i = 0; i < nthreads; i++) {
3672  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3673  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3674  }
3675 #endif // BUILD_TIED_TASK_STACK
3676  }
3677  task_team->tt.tt_max_threads = nthreads;
3678  } else {
3679  // If array has (more than) enough elements, go ahead and use it
3680  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3681  }
3682 
3683  // initialize threads_data pointers back to thread_info structures
3684  for (i = 0; i < nthreads; i++) {
3685  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3686  thread_data->td.td_thr = team->t.t_threads[i];
3687 
3688  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3689  // The last stolen field survives across teams / barrier, and the number
3690  // of threads may have changed. It's possible (likely?) that a new
3691  // parallel region will exhibit the same behavior as the previous region.
3692  thread_data->td.td_deque_last_stolen = -1;
3693  }
3694  }
3695 
3696  KMP_MB();
3697  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3698  }
3699 
3700  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3701  return is_init_thread;
3702 }
3703 
3704 // __kmp_free_task_threads_data:
3705 // Deallocates a threads_data array for a task team, including any attached
3706 // tasking deques. Only occurs at library shutdown.
3707 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3708  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3709  if (task_team->tt.tt_threads_data != NULL) {
3710  int i;
3711  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3712  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3713  }
3714  __kmp_free(task_team->tt.tt_threads_data);
3715  task_team->tt.tt_threads_data = NULL;
3716  }
3717  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3718 }
3719 
3720 // __kmp_free_task_pri_list:
3721 // Deallocates tasking deques used for priority tasks.
3722 // Only occurs at library shutdown.
3723 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3724  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3725  if (task_team->tt.tt_task_pri_list != NULL) {
3726  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3727  while (list != NULL) {
3728  kmp_task_pri_t *next = list->next;
3729  __kmp_free_task_deque(&list->td);
3730  __kmp_free(list);
3731  list = next;
3732  }
3733  task_team->tt.tt_task_pri_list = NULL;
3734  }
3735  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3736 }
3737 
3738 // __kmp_allocate_task_team:
3739 // Allocates a task team associated with a specific team, taking it from
3740 // the global task team free list if possible. Also initializes data
3741 // structures.
3742 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3743  kmp_team_t *team) {
3744  kmp_task_team_t *task_team = NULL;
3745  int nthreads;
3746 
3747  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3748  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3749 
3750  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3751  // Take a task team from the task team pool
3752  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3753  if (__kmp_free_task_teams != NULL) {
3754  task_team = __kmp_free_task_teams;
3755  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3756  task_team->tt.tt_next = NULL;
3757  }
3758  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3759  }
3760 
3761  if (task_team == NULL) {
3762  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3763  "task team for team %p\n",
3764  __kmp_gtid_from_thread(thread), team));
3765  // Allocate a new task team if one is not available. Cannot use
3766  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3767  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3768  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3769  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3770 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3771  // suppress race conditions detection on synchronization flags in debug mode
3772  // this helps to analyze library internals eliminating false positives
3773  __itt_suppress_mark_range(
3774  __itt_suppress_range, __itt_suppress_threading_errors,
3775  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3776  __itt_suppress_mark_range(__itt_suppress_range,
3777  __itt_suppress_threading_errors,
3778  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3779  sizeof(task_team->tt.tt_active));
3780 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3781  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3782  // task_team->tt.tt_threads_data = NULL;
3783  // task_team->tt.tt_max_threads = 0;
3784  // task_team->tt.tt_next = NULL;
3785  }
3786 
3787  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3788  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3789  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3790  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3791 
3792  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3793  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3794  TCW_4(task_team->tt.tt_active, TRUE);
3795 
3796  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3797  "unfinished_threads init'd to %d\n",
3798  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3799  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3800  return task_team;
3801 }
3802 
3803 // __kmp_free_task_team:
3804 // Frees the task team associated with a specific thread, and adds it
3805 // to the global task team free list.
3806 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3807  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3808  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3809 
3810  // Put task team back on free list
3811  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3812 
3813  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3814  task_team->tt.tt_next = __kmp_free_task_teams;
3815  TCW_PTR(__kmp_free_task_teams, task_team);
3816 
3817  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3818 }
3819 
3820 // __kmp_reap_task_teams:
3821 // Free all the task teams on the task team free list.
3822 // Should only be done during library shutdown.
3823 // Cannot do anything that needs a thread structure or gtid since they are
3824 // already gone.
3825 void __kmp_reap_task_teams(void) {
3826  kmp_task_team_t *task_team;
3827 
3828  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3829  // Free all task_teams on the free list
3830  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3831  while ((task_team = __kmp_free_task_teams) != NULL) {
3832  __kmp_free_task_teams = task_team->tt.tt_next;
3833  task_team->tt.tt_next = NULL;
3834 
3835  // Free threads_data if necessary
3836  if (task_team->tt.tt_threads_data != NULL) {
3837  __kmp_free_task_threads_data(task_team);
3838  }
3839  if (task_team->tt.tt_task_pri_list != NULL) {
3840  __kmp_free_task_pri_list(task_team);
3841  }
3842  __kmp_free(task_team);
3843  }
3844  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3845  }
3846 }
3847 
3848 // __kmp_wait_to_unref_task_teams:
3849 // Some threads could still be in the fork barrier release code, possibly
3850 // trying to steal tasks. Wait for each thread to unreference its task team.
3851 void __kmp_wait_to_unref_task_teams(void) {
3852  kmp_info_t *thread;
3853  kmp_uint32 spins;
3854  kmp_uint64 time;
3855  int done;
3856 
3857  KMP_INIT_YIELD(spins);
3858  KMP_INIT_BACKOFF(time);
3859 
3860  for (;;) {
3861  done = TRUE;
3862 
3863  // TODO: GEH - this may be wrong because some sync would be necessary
3864  // in case threads are added to the pool during the traversal. Need to
3865  // verify that lock for thread pool is held when calling this routine.
3866  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3867  thread = thread->th.th_next_pool) {
3868 #if KMP_OS_WINDOWS
3869  DWORD exit_val;
3870 #endif
3871  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3872  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3873  __kmp_gtid_from_thread(thread)));
3874  continue;
3875  }
3876 #if KMP_OS_WINDOWS
3877  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3878  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3879  thread->th.th_task_team = NULL;
3880  continue;
3881  }
3882 #endif
3883 
3884  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3885 
3886  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3887  "unreference task_team\n",
3888  __kmp_gtid_from_thread(thread)));
3889 
3890  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3891  void *sleep_loc;
3892  // If the thread is sleeping, awaken it.
3893  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3894  NULL) {
3895  KA_TRACE(
3896  10,
3897  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3898  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3899  __kmp_null_resume_wrapper(thread);
3900  }
3901  }
3902  }
3903  if (done) {
3904  break;
3905  }
3906 
3907  // If oversubscribed or have waited a bit, yield.
3908  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3909  }
3910 }
3911 
3912 // __kmp_task_team_setup: Create a task_team for the current team, but use
3913 // an already created, unused one if it already exists.
3914 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3915  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3916 
3917  // If this task_team hasn't been created yet, allocate it. It will be used in
3918  // the region after the next.
3919  // If it exists, it is the current task team and shouldn't be touched yet as
3920  // it may still be in use.
3921  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3922  (always || team->t.t_nproc > 1)) {
3923  team->t.t_task_team[this_thr->th.th_task_state] =
3924  __kmp_allocate_task_team(this_thr, team);
3925  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3926  " for team %d at parity=%d\n",
3927  __kmp_gtid_from_thread(this_thr),
3928  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3929  this_thr->th.th_task_state));
3930  }
3931 
3932  // After threads exit the release, they will call sync, and then point to this
3933  // other task_team; make sure it is allocated and properly initialized. As
3934  // threads spin in the barrier release phase, they will continue to use the
3935  // previous task_team struct(above), until they receive the signal to stop
3936  // checking for tasks (they can't safely reference the kmp_team_t struct,
3937  // which could be reallocated by the primary thread). No task teams are formed
3938  // for serialized teams.
3939  if (team->t.t_nproc > 1) {
3940  int other_team = 1 - this_thr->th.th_task_state;
3941  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3942  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3943  team->t.t_task_team[other_team] =
3944  __kmp_allocate_task_team(this_thr, team);
3945  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3946  "task_team %p for team %d at parity=%d\n",
3947  __kmp_gtid_from_thread(this_thr),
3948  team->t.t_task_team[other_team], team->t.t_id, other_team));
3949  } else { // Leave the old task team struct in place for the upcoming region;
3950  // adjust as needed
3951  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3952  if (!task_team->tt.tt_active ||
3953  team->t.t_nproc != task_team->tt.tt_nproc) {
3954  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3955  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3956  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3957  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3958  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3959  team->t.t_nproc);
3960  TCW_4(task_team->tt.tt_active, TRUE);
3961  }
3962  // if team size has changed, the first thread to enable tasking will
3963  // realloc threads_data if necessary
3964  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3965  "%p for team %d at parity=%d\n",
3966  __kmp_gtid_from_thread(this_thr),
3967  team->t.t_task_team[other_team], team->t.t_id, other_team));
3968  }
3969  }
3970 
3971  // For a regular thread, task enabling should be called when the task is
3972  // going to be pushed to a deque. However, for the hidden helper thread, we
3973  // need it ahead of time so that some operations can be performed without a
3974  // race condition.
3975  if (this_thr == __kmp_hidden_helper_main_thread) {
3976  for (int i = 0; i < 2; ++i) {
3977  kmp_task_team_t *task_team = team->t.t_task_team[i];
3978  if (KMP_TASKING_ENABLED(task_team)) {
3979  continue;
3980  }
3981  __kmp_enable_tasking(task_team, this_thr);
3982  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
3983  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
3984  if (thread_data->td.td_deque == NULL) {
3985  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
3986  }
3987  }
3988  }
3989  }
3990 }
3991 
3992 // __kmp_task_team_sync: Propagation of task team data from team to threads
3993 // which happens just after the release phase of a team barrier. This may be
3994 // called by any thread, but only for teams with # threads > 1.
3995 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3996  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3997 
3998  // Toggle the th_task_state field, to switch which task_team this thread
3999  // refers to
4000  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4001 
4002  // It is now safe to propagate the task team pointer from the team struct to
4003  // the current thread.
4004  TCW_PTR(this_thr->th.th_task_team,
4005  team->t.t_task_team[this_thr->th.th_task_state]);
4006  KA_TRACE(20,
4007  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4008  "%p from Team #%d (parity=%d)\n",
4009  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4010  team->t.t_id, this_thr->th.th_task_state));
4011 }
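
// Editorial sketch: a team keeps two task-team slots and every thread carries
// a 0/1 parity; flipping the parity at each barrier moves the thread onto the
// other slot while the one it just left drains and is then recycled. A
// minimal model of the toggle with hypothetical names:
struct example_team {
  void *task_team[2]; // double-buffered, indexed by the thread's parity
};

static void *example_switch_task_team(example_team *team,
                                      unsigned char *task_state) {
  *task_state = (unsigned char)(1 - *task_state); // 0 -> 1 -> 0 -> ...
  return team->task_team[*task_state]; // now reference the other task team
}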
4012 
4013 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4014 // barrier gather phase. Only called by primary thread if #threads in team > 1
4015 // or if proxy tasks were created.
4016 //
4017 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4018 // by passing in 0 optionally as the last argument. When wait is zero, primary
4019 // thread does not wait for unfinished_threads to reach 0.
4020 void __kmp_task_team_wait(
4021  kmp_info_t *this_thr,
4022  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4023  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4024 
4025  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4026  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4027 
4028  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4029  if (wait) {
4030  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4031  "(for unfinished_threads to reach 0) on task_team = %p\n",
4032  __kmp_gtid_from_thread(this_thr), task_team));
4033  // Worker threads may have dropped through to release phase, but could
4034  // still be executing tasks. Wait here for tasks to complete. To avoid
4035  // memory contention, only primary thread checks termination condition.
4036  kmp_flag_32<false, false> flag(
4037  RCAST(std::atomic<kmp_uint32> *,
4038  &task_team->tt.tt_unfinished_threads),
4039  0U);
4040  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4041  }
4042  // Deactivate the old task team, so that the worker threads will stop
4043  // referencing it while spinning.
4044  KA_TRACE(
4045  20,
4046  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4047  "setting active to false, setting local and team's pointer to NULL\n",
4048  __kmp_gtid_from_thread(this_thr), task_team));
4049  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
4050  task_team->tt.tt_found_proxy_tasks == TRUE ||
4051  task_team->tt.tt_hidden_helper_task_encountered == TRUE);
4052  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4053  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4054  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4055  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4056  KMP_MB();
4057 
4058  TCW_PTR(this_thr->th.th_task_team, NULL);
4059  }
4060 }
4061 
4062 // __kmp_tasking_barrier:
4063 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4064 // Internal function to execute all tasks prior to a regular barrier or a join
4065 // barrier. It is a full barrier itself, which unfortunately turns regular
4066 // barriers into double barriers and join barriers into 1 1/2 barriers.
4067 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4068  std::atomic<kmp_uint32> *spin = RCAST(
4069  std::atomic<kmp_uint32> *,
4070  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4071  int flag = FALSE;
4072  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4073 
4074 #if USE_ITT_BUILD
4075  KMP_FSYNC_SPIN_INIT(spin, NULL);
4076 #endif /* USE_ITT_BUILD */
4077  kmp_flag_32<false, false> spin_flag(spin, 0U);
4078  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4079  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4080 #if USE_ITT_BUILD
4081  // TODO: What about itt_sync_obj??
4082  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4083 #endif /* USE_ITT_BUILD */
4084 
4085  if (TCR_4(__kmp_global.g.g_done)) {
4086  if (__kmp_global.g.g_abort)
4087  __kmp_abort_thread();
4088  break;
4089  }
4090  KMP_YIELD(TRUE);
4091  }
4092 #if USE_ITT_BUILD
4093  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4094 #endif /* USE_ITT_BUILD */
4095 }
4096 
4097 // __kmp_give_task puts a task into a given thread queue if:
4098 // - the queue for that thread was created
4099 // - there's space in that queue
4100 // Because of this, __kmp_push_task needs to check if there's space after
4101 // getting the lock
4102 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4103  kmp_int32 pass) {
4104  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4105  kmp_task_team_t *task_team = taskdata->td_task_team;
4106 
4107  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4108  taskdata, tid));
4109 
4110  // If task_team is NULL something went really bad...
4111  KMP_DEBUG_ASSERT(task_team != NULL);
4112 
4113  bool result = false;
4114  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4115 
4116  if (thread_data->td.td_deque == NULL) {
4117  // There's no queue in this thread, go find another one
4118  // We're guaranteed that at least one thread has a queue
4119  KA_TRACE(30,
4120  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4121  tid, taskdata));
4122  return result;
4123  }
4124 
4125  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4126  TASK_DEQUE_SIZE(thread_data->td)) {
4127  KA_TRACE(
4128  30,
4129  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4130  taskdata, tid));
4131 
4132  // if this deque is bigger than the pass ratio give a chance to another
4133  // thread
4134  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4135  return result;
4136 
4137  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4138  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4139  TASK_DEQUE_SIZE(thread_data->td)) {
4140  // expand deque to push the task which is not allowed to execute
4141  __kmp_realloc_task_deque(thread, thread_data);
4142  }
4143 
4144  } else {
4145 
4146  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4147 
4148  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4149  TASK_DEQUE_SIZE(thread_data->td)) {
4150  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4151  "thread %d.\n",
4152  taskdata, tid));
4153 
4154  // if this deque is bigger than the pass ratio give a chance to another
4155  // thread
4156  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4157  goto release_and_exit;
4158 
4159  __kmp_realloc_task_deque(thread, thread_data);
4160  }
4161  }
4162 
4163  // lock is held here, and there is space in the deque
4164 
4165  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4166  // Wrap index.
4167  thread_data->td.td_deque_tail =
4168  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4169  TCW_4(thread_data->td.td_deque_ntasks,
4170  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4171 
4172  result = true;
4173  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4174  taskdata, tid));
4175 
4176 release_and_exit:
4177  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4178 
4179  return result;
4180 }
4181 
4182 #define PROXY_TASK_FLAG 0x40000000
4183 /* The finish of the proxy tasks is divided in two pieces:
4184  - the top half is the one that can be done from a thread outside the team
4185  - the bottom half must be run from a thread within the team
4186 
4187  In order to run the bottom half the task gets queued back into one of the
4188  threads of the team. Once the td_incomplete_child_tasks counter of the parent
4189  is decremented, the threads can leave the barriers. So, the bottom half needs
4190  to be queued before the counter is decremented. The top half is therefore
4191  divided in two parts:
4192  - things that can be run before queuing the bottom half
4193  - things that must be run after queuing the bottom half
4194 
4195  This creates a second race as the bottom half can free the task before the
4196  second top half is executed. To avoid this we use the
4197  td_incomplete_child_task of the proxy task to synchronize the top and bottom
4198  half. */
4199 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4200  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4201  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4202  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4203  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4204 
4205  taskdata->td_flags.complete = 1; // mark the task as completed
4206 
4207  if (taskdata->td_taskgroup)
4208  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4209 
4210  // Create an imaginary child for this task so the bottom half cannot
4211  // release the task before we have completed the second top half
4212  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4213 }
4214 
4215 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4216 #if KMP_DEBUG
4217  kmp_int32 children = 0;
4218  // Predecrement simulated by "- 1" calculation
4219  children = -1 +
4220 #endif
4221  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4222  KMP_DEBUG_ASSERT(children >= 0);
4223 
4224  // Remove the imaginary child
4225  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4226 }
4227 
4228 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4229  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4230  kmp_info_t *thread = __kmp_threads[gtid];
4231 
4232  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4233  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4234  1); // top half must run before bottom half
4235 
4236  // We need to wait to make sure the top half is finished
4237  // Spinning here should be ok as this should happen quickly
4238  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4239  PROXY_TASK_FLAG) > 0)
4240  ;
4241 
4242  __kmp_release_deps(gtid, taskdata);
4243  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4244 }
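
// Editorial sketch: the top-half / bottom-half handshake used above. The
// first top half parks a sentinel bit in the proxy task's child counter, the
// bottom half spins until that bit disappears, and the second top half clears
// it. Hypothetical names; not runtime code.
#include <atomic>

static const int EXAMPLE_PROXY_FLAG = 0x40000000;

static void example_first_top_half(std::atomic<int> *children) {
  children->fetch_or(EXAMPLE_PROXY_FLAG); // park the sentinel bit
}
static void example_second_top_half(std::atomic<int> *children) {
  children->fetch_and(~EXAMPLE_PROXY_FLAG); // let the bottom half proceed
}
static void example_bottom_half_wait(std::atomic<int> *children) {
  while (children->load(std::memory_order_acquire) & EXAMPLE_PROXY_FLAG)
    ; // spin: the second top half has not finished yet
}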
4245 
4254 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4255  KMP_DEBUG_ASSERT(ptask != NULL);
4256  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4257  KA_TRACE(
4258  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4259  gtid, taskdata));
4260  __kmp_assert_valid_gtid(gtid);
4261  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4262 
4263  __kmp_first_top_half_finish_proxy(taskdata);
4264  __kmp_second_top_half_finish_proxy(taskdata);
4265  __kmp_bottom_half_finish_proxy(gtid, ptask);
4266 
4267  KA_TRACE(10,
4268  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4269  gtid, taskdata));
4270 }
4271 
4272 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4273  KMP_DEBUG_ASSERT(ptask != NULL);
4274  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4275 
4276  // Enqueue task to complete bottom half completion from a thread within the
4277  // corresponding team
4278  kmp_team_t *team = taskdata->td_team;
4279  kmp_int32 nthreads = team->t.t_nproc;
4280  kmp_info_t *thread;
4281 
4282  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4283  // but we cannot use __kmp_get_random here
4284  kmp_int32 start_k = start % nthreads;
4285  kmp_int32 pass = 1;
4286  kmp_int32 k = start_k;
4287 
4288  do {
4289  // For now we're just linearly trying to find a thread
4290  thread = team->t.t_threads[k];
4291  k = (k + 1) % nthreads;
4292 
4293  // we did a full pass through all the threads
4294  if (k == start_k)
4295  pass = pass << 1;
4296 
4297  } while (!__kmp_give_task(thread, k, ptask, pass));
4298 
4299  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4300  // wake up at least one thread to execute the given task
4301  for (int i = 0; i < nthreads; ++i) {
4302  thread = team->t.t_threads[i];
4303  if (thread->th.th_sleep_loc != NULL) {
4304  __kmp_null_resume_wrapper(thread);
4305  break;
4306  }
4307  }
4308  }
4309 }
4310 
4318 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4319  KMP_DEBUG_ASSERT(ptask != NULL);
4320  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4321 
4322  KA_TRACE(
4323  10,
4324  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4325  taskdata));
4326 
4327  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4328 
4329  __kmp_first_top_half_finish_proxy(taskdata);
4330 
4331  __kmpc_give_task(ptask);
4332 
4333  __kmp_second_top_half_finish_proxy(taskdata);
4334 
4335  KA_TRACE(
4336  10,
4337  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4338  taskdata));
4339 }
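/* A minimal usage sketch of the two completion entry points above (the caller,
   e.g. an asynchronous device-runtime callback, is hypothetical; "ptask" is the
   proxy task it was handed and "gtid" its global thread id, if it has one):

     if (caller_is_a_thread_of_the_proxy_tasks_team)  // hypothetical predicate
       __kmpc_proxy_task_completed(gtid, ptask);      // run all three phases inline
     else
       __kmpc_proxy_task_completed_ooo(ptask);        // hand the bottom half to a
                                                      // thread of the task's team
*/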
4340 
4341 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4342  kmp_task_t *task) {
4343  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4344  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4345  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4346  td->td_allow_completion_event.ed.task = task;
4347  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4348  }
4349  return &td->td_allow_completion_event;
4350 }
4351 
4352 void __kmp_fulfill_event(kmp_event_t *event) {
4353  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4354  kmp_task_t *ptask = event->ed.task;
4355  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4356  bool detached = false;
4357  int gtid = __kmp_get_gtid();
4358 
4359  // The associated task might have completed or could be completing at this
4360  // point.
4361  // We need to take the lock to avoid races
4362  __kmp_acquire_tas_lock(&event->lock, gtid);
4363  if (taskdata->td_flags.proxy == TASK_PROXY) {
4364  detached = true;
4365  } else {
4366 #if OMPT_SUPPORT
4367  // The OMPT event must occur under mutual exclusion,
4368  // otherwise the tool might access ptask after free
4369  if (UNLIKELY(ompt_enabled.enabled))
4370  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4371 #endif
4372  }
4373  event->type = KMP_EVENT_UNINITIALIZED;
4374  __kmp_release_tas_lock(&event->lock, gtid);
4375 
4376  if (detached) {
4377 #if OMPT_SUPPORT
4378  // We free ptask afterwards and know the task is finished,
4379  // so locking is not necessary
4380  if (UNLIKELY(ompt_enabled.enabled))
4381  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4382 #endif
4383  // If the task detached complete the proxy task
4384  if (gtid >= 0) {
4385  kmp_team_t *team = taskdata->td_team;
4386  kmp_info_t *thread = __kmp_get_thread();
4387  if (thread->th.th_team == team) {
4388  __kmpc_proxy_task_completed(gtid, ptask);
4389  return;
4390  }
4391  }
4392 
4393  // fallback
4394  __kmpc_proxy_task_completed_ooo(ptask);
4395  }
4396  }
4397 }
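/* For reference, a user-level sketch that exercises the two routines above,
   assuming the OpenMP 5.0 detach clause and omp_fulfill_event() (the handle the
   compiler obtains from __kmpc_task_allow_completion_event is what user code
   sees as omp_event_handle_t; start_async_work() is a hypothetical helper that
   hands the event to some agent which later fulfills it):

     #include <omp.h>
     void example(void) {
       omp_event_handle_t ev;
     #pragma omp task detach(ev)
       { start_async_work(ev); }  // task body returns without completing the task
     #pragma omp taskwait         // not released until the event is fulfilled
     }
     // later, possibly from a non-OpenMP thread:
     //   omp_fulfill_event(ev);  // reaches __kmp_fulfill_event()
*/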
4398 
4399 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4400 // for taskloop
4401 //
4402 // thread: allocating thread
4403 // task_src: pointer to source task to be duplicated
4404 // returns: a pointer to the allocated kmp_task_t structure (task).
4405 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
4406  kmp_task_t *task;
4407  kmp_taskdata_t *taskdata;
4408  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4409  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4410  size_t shareds_offset;
4411  size_t task_size;
4412 
4413  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4414  task_src));
4415  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4416  TASK_FULL); // it should not be a proxy task
4417  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4418  task_size = taskdata_src->td_size_alloc;
4419 
4420  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4421  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4422  task_size));
4423 #if USE_FAST_MEMORY
4424  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4425 #else
4426  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4427 #endif /* USE_FAST_MEMORY */
4428  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4429 
4430  task = KMP_TASKDATA_TO_TASK(taskdata);
4431 
4432  // Initialize new task (only specific fields not affected by memcpy)
4433  taskdata->td_task_id = KMP_GEN_TASK_ID();
4434  if (task->shareds != NULL) { // need to set up the shareds pointer
4435  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4436  task->shareds = &((char *)taskdata)[shareds_offset];
4437  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4438  0);
4439  }
4440  taskdata->td_alloc_thread = thread;
4441  taskdata->td_parent = parent_task;
4442  // task inherits the taskgroup from the parent task
4443  taskdata->td_taskgroup = parent_task->td_taskgroup;
4444  // tied task needs to initialize the td_last_tied at creation,
4445  // untied one does this when it is scheduled for execution
4446  if (taskdata->td_flags.tiedness == TASK_TIED)
4447  taskdata->td_last_tied = taskdata;
4448 
4449  // Only need to keep track of child task counts if team parallel and tasking
4450  // not serialized
4451  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4452  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4453  if (parent_task->td_taskgroup)
4454  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4455  // Only need to keep track of allocated child tasks for explicit tasks since
4456  // implicit tasks are not deallocated
4457  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4458  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4459  }
4460 
4461  KA_TRACE(20,
4462  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4463  thread, taskdata, taskdata->td_parent));
4464 #if OMPT_SUPPORT
4465  if (UNLIKELY(ompt_enabled.enabled))
4466  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4467 #endif
4468  return task;
4469 }
4470 
4471 // Routine optionally generated by the compiler for setting the lastprivate flag
4472 // and calling needed constructors for private/firstprivate objects
4473 // (used to form taskloop tasks from pattern task)
4474 // Parameters: dest task, src task, lastprivate flag.
4475 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4476 
4477 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4478 
4479 // class to encapsulate manipulating loop bounds in a taskloop task.
4480 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4481 // the loop bound variables.
4482 class kmp_taskloop_bounds_t {
4483  kmp_task_t *task;
4484  const kmp_taskdata_t *taskdata;
4485  size_t lower_offset;
4486  size_t upper_offset;
4487 
4488 public:
4489  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4490  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4491  lower_offset((char *)lb - (char *)task),
4492  upper_offset((char *)ub - (char *)task) {
4493  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4494  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4495  }
4496  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4497  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4498  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4499  size_t get_lower_offset() const { return lower_offset; }
4500  size_t get_upper_offset() const { return upper_offset; }
4501  kmp_uint64 get_lb() const {
4502  kmp_int64 retval;
4503 #if defined(KMP_GOMP_COMPAT)
4504  // Intel task just returns the lower bound normally
4505  if (!taskdata->td_flags.native) {
4506  retval = *(kmp_int64 *)((char *)task + lower_offset);
4507  } else {
4508  // GOMP task has to take into account the sizeof(long)
4509  if (taskdata->td_size_loop_bounds == 4) {
4510  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4511  retval = (kmp_int64)*lb;
4512  } else {
4513  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4514  retval = (kmp_int64)*lb;
4515  }
4516  }
4517 #else
4518  (void)taskdata;
4519  retval = *(kmp_int64 *)((char *)task + lower_offset);
4520 #endif // defined(KMP_GOMP_COMPAT)
4521  return retval;
4522  }
4523  kmp_uint64 get_ub() const {
4524  kmp_int64 retval;
4525 #if defined(KMP_GOMP_COMPAT)
4526  // Intel task just returns the upper bound normally
4527  if (!taskdata->td_flags.native) {
4528  retval = *(kmp_int64 *)((char *)task + upper_offset);
4529  } else {
4530  // GOMP task has to take into account the sizeof(long)
4531  if (taskdata->td_size_loop_bounds == 4) {
4532  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4533  retval = (kmp_int64)*ub;
4534  } else {
4535  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4536  retval = (kmp_int64)*ub;
4537  }
4538  }
4539 #else
4540  retval = *(kmp_int64 *)((char *)task + upper_offset);
4541 #endif // defined(KMP_GOMP_COMPAT)
4542  return retval;
4543  }
4544  void set_lb(kmp_uint64 lb) {
4545 #if defined(KMP_GOMP_COMPAT)
4546  // Intel task just sets the lower bound normally
4547  if (!taskdata->td_flags.native) {
4548  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4549  } else {
4550  // GOMP task has to take into account the sizeof(long)
4551  if (taskdata->td_size_loop_bounds == 4) {
4552  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4553  *lower = (kmp_uint32)lb;
4554  } else {
4555  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4556  *lower = (kmp_uint64)lb;
4557  }
4558  }
4559 #else
4560  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4561 #endif // defined(KMP_GOMP_COMPAT)
4562  }
4563  void set_ub(kmp_uint64 ub) {
4564 #if defined(KMP_GOMP_COMPAT)
4565  // Intel task just sets the upper bound normally
4566  if (!taskdata->td_flags.native) {
4567  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4568  } else {
4569  // GOMP task has to take into account the sizeof(long)
4570  if (taskdata->td_size_loop_bounds == 4) {
4571  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4572  *upper = (kmp_uint32)ub;
4573  } else {
4574  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4575  *upper = (kmp_uint64)ub;
4576  }
4577  }
4578 #else
4579  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4580 #endif // defined(KMP_GOMP_COMPAT)
4581  }
4582 };
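/* Layout assumed by the accessors above (as encoded in their branches): for a
   task created through the Intel entry points (td_flags.native == 0) the bounds
   are 64-bit fields inside the task at lower_offset/upper_offset; for a GOMP
   task (td_flags.native != 0) they are two consecutive sizeof(long)-wide slots
   at the start of task->shareds, e.g. with 64-bit long:

     kmp_int64 *bounds = RCAST(kmp_int64 *, task->shareds);
     kmp_int64 lb = bounds[0], ub = bounds[1]; // width given by td_size_loop_bounds
*/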
4583 
4584 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4585 //
4586 // loc Source location information
4587 // gtid Global thread ID
4588 // task Pattern task, exposes the loop iteration range
4589 // lb Pointer to loop lower bound in task structure
4590 // ub Pointer to loop upper bound in task structure
4591 // st Loop stride
4592 // ub_glob Global upper bound (used for lastprivate check)
4593 // num_tasks Number of tasks to execute
4594 // grainsize Number of loop iterations per task
4595 // extras Number of chunks with grainsize+1 iterations
4596 // last_chunk Reduction of grainsize for last task
4597 // tc Iterations count
4598 // task_dup Tasks duplication routine
4599 // codeptr_ra Return address for OMPT events
4600 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4601  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4602  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4603  kmp_uint64 grainsize, kmp_uint64 extras,
4604  kmp_int64 last_chunk, kmp_uint64 tc,
4605 #if OMPT_SUPPORT
4606  void *codeptr_ra,
4607 #endif
4608  void *task_dup) {
4609  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4610  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4611  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4612  // compiler provides global bounds here
4613  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4614  kmp_uint64 lower = task_bounds.get_lb();
4615  kmp_uint64 upper = task_bounds.get_ub();
4616  kmp_uint64 i;
4617  kmp_info_t *thread = __kmp_threads[gtid];
4618  kmp_taskdata_t *current_task = thread->th.th_current_task;
4619  kmp_task_t *next_task;
4620  kmp_int32 lastpriv = 0;
4621 
4622  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4623  (last_chunk < 0 ? last_chunk : extras));
4624  KMP_DEBUG_ASSERT(num_tasks > extras);
4625  KMP_DEBUG_ASSERT(num_tasks > 0);
4626  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4627  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4628  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4629  ub_glob, st, task_dup));
4630 
4631  // Launch num_tasks tasks, assign grainsize iterations each task
4632  for (i = 0; i < num_tasks; ++i) {
4633  kmp_uint64 chunk_minus_1;
4634  if (extras == 0) {
4635  chunk_minus_1 = grainsize - 1;
4636  } else {
4637  chunk_minus_1 = grainsize;
4638  --extras; // first extras iterations get bigger chunk (grainsize+1)
4639  }
4640  upper = lower + st * chunk_minus_1;
4641  if (upper > *ub) {
4642  upper = *ub;
4643  }
4644  if (i == num_tasks - 1) {
4645  // schedule the last task, set lastprivate flag if needed
4646  if (st == 1) { // most common case
4647  KMP_DEBUG_ASSERT(upper == *ub);
4648  if (upper == ub_glob)
4649  lastpriv = 1;
4650  } else if (st > 0) { // positive loop stride
4651  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4652  if ((kmp_uint64)st > ub_glob - upper)
4653  lastpriv = 1;
4654  } else { // negative loop stride
4655  KMP_DEBUG_ASSERT(upper + st < *ub);
4656  if (upper - ub_glob < (kmp_uint64)(-st))
4657  lastpriv = 1;
4658  }
4659  }
4660  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4661  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4662  kmp_taskloop_bounds_t next_task_bounds =
4663  kmp_taskloop_bounds_t(next_task, task_bounds);
4664 
4665  // adjust task-specific bounds
4666  next_task_bounds.set_lb(lower);
4667  if (next_taskdata->td_flags.native) {
4668  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4669  } else {
4670  next_task_bounds.set_ub(upper);
4671  }
4672  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4673  // etc.
4674  ptask_dup(next_task, task, lastpriv);
4675  KA_TRACE(40,
4676  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4677  "upper %lld stride %lld, (offsets %p %p)\n",
4678  gtid, i, next_task, lower, upper, st,
4679  next_task_bounds.get_lower_offset(),
4680  next_task_bounds.get_upper_offset()));
4681 #if OMPT_SUPPORT
4682  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4683  codeptr_ra); // schedule new task
4684 #if OMPT_OPTIONAL
4685  if (ompt_enabled.ompt_callback_dispatch) {
4686  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4687  lower, upper, st);
4688  }
4689 #endif // OMPT_OPTIONAL
4690 #else
4691  __kmp_omp_task(gtid, next_task, true); // schedule new task
4692 #endif
4693  lower = upper + st; // adjust lower bound for the next iteration
4694  }
4695  // free the pattern task and exit
4696  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4697  // do not execute the pattern task, just do internal bookkeeping
4698  __kmp_task_finish<false>(gtid, task, current_task);
4699 }
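/* Worked example of the slicing above (numbers only, no new behavior):
   tc = 10, num_tasks = 3, grainsize = 3, extras = 1, st = 1, lower = 0, *ub = 9:
     task 0: extras > 0, chunk = grainsize + 1 = 4  -> iterations 0..3
     task 1: chunk = grainsize = 3                  -> iterations 4..6
     task 2: chunk = grainsize = 3                  -> iterations 7..9, lastpriv = 1
   With a strict grainsize (last_chunk < 0, extras == 0) the final task is simply
   clamped to *ub and executes the remaining grainsize + last_chunk iterations. */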
4700 
4701 // Structure to keep taskloop parameters for auxiliary task
4702 // kept in the shareds of the task structure.
4703 typedef struct __taskloop_params {
4704  kmp_task_t *task;
4705  kmp_uint64 *lb;
4706  kmp_uint64 *ub;
4707  void *task_dup;
4708  kmp_int64 st;
4709  kmp_uint64 ub_glob;
4710  kmp_uint64 num_tasks;
4711  kmp_uint64 grainsize;
4712  kmp_uint64 extras;
4713  kmp_int64 last_chunk;
4714  kmp_uint64 tc;
4715  kmp_uint64 num_t_min;
4716 #if OMPT_SUPPORT
4717  void *codeptr_ra;
4718 #endif
4719 } __taskloop_params_t;
4720 
4721 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4722  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4723  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4724  kmp_uint64,
4725 #if OMPT_SUPPORT
4726  void *,
4727 #endif
4728  void *);
4729 
4730 // Execute part of the taskloop submitted as a task.
4731 int __kmp_taskloop_task(int gtid, void *ptask) {
4732  __taskloop_params_t *p =
4733  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4734  kmp_task_t *task = p->task;
4735  kmp_uint64 *lb = p->lb;
4736  kmp_uint64 *ub = p->ub;
4737  void *task_dup = p->task_dup;
4738  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4739  kmp_int64 st = p->st;
4740  kmp_uint64 ub_glob = p->ub_glob;
4741  kmp_uint64 num_tasks = p->num_tasks;
4742  kmp_uint64 grainsize = p->grainsize;
4743  kmp_uint64 extras = p->extras;
4744  kmp_int64 last_chunk = p->last_chunk;
4745  kmp_uint64 tc = p->tc;
4746  kmp_uint64 num_t_min = p->num_t_min;
4747 #if OMPT_SUPPORT
4748  void *codeptr_ra = p->codeptr_ra;
4749 #endif
4750 #if KMP_DEBUG
4751  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4752  KMP_DEBUG_ASSERT(task != NULL);
4753  KA_TRACE(20,
4754  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4755  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4756  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4757  st, task_dup));
4758 #endif
4759  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4760  if (num_tasks > num_t_min)
4761  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4762  grainsize, extras, last_chunk, tc, num_t_min,
4763 #if OMPT_SUPPORT
4764  codeptr_ra,
4765 #endif
4766  task_dup);
4767  else
4768  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4769  grainsize, extras, last_chunk, tc,
4770 #if OMPT_SUPPORT
4771  codeptr_ra,
4772 #endif
4773  task_dup);
4774 
4775  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4776  return 0;
4777 }
4778 
4779 // Schedule part of the taskloop as a task,
4780 // execute the rest of the taskloop.
4781 //
4782 // loc Source location information
4783 // gtid Global thread ID
4784 // task Pattern task, exposes the loop iteration range
4785 // lb Pointer to loop lower bound in task structure
4786 // ub Pointer to loop upper bound in task structure
4787 // st Loop stride
4788 // ub_glob Global upper bound (used for lastprivate check)
4789 // num_tasks Number of tasks to execute
4790 // grainsize Number of loop iterations per task
4791 // extras Number of chunks with grainsize+1 iterations
4792 // last_chunk Reduction of grainsize for last task
4793 // tc Iterations count
4794 // num_t_min Threshold to launch tasks recursively
4795 // task_dup Tasks duplication routine
4796 // codeptr_ra Return address for OMPT events
4797 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4798  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4799  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4800  kmp_uint64 grainsize, kmp_uint64 extras,
4801  kmp_int64 last_chunk, kmp_uint64 tc,
4802  kmp_uint64 num_t_min,
4803 #if OMPT_SUPPORT
4804  void *codeptr_ra,
4805 #endif
4806  void *task_dup) {
4807  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4808  KMP_DEBUG_ASSERT(task != NULL);
4809  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4810  KA_TRACE(20,
4811  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4812  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4813  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4814  st, task_dup));
4815  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4816  kmp_uint64 lower = *lb;
4817  kmp_info_t *thread = __kmp_threads[gtid];
4818  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4819  kmp_task_t *next_task;
4820  size_t lower_offset =
4821  (char *)lb - (char *)task; // remember offset of lb in the task structure
4822  size_t upper_offset =
4823  (char *)ub - (char *)task; // remember offset of ub in the task structure
4824 
4825  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4826  (last_chunk < 0 ? last_chunk : extras));
4827  KMP_DEBUG_ASSERT(num_tasks > extras);
4828  KMP_DEBUG_ASSERT(num_tasks > 0);
4829 
4830  // split the loop in two halves
4831  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4832  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4833  kmp_uint64 gr_size0 = grainsize;
4834  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4835  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4836  if (last_chunk < 0) {
4837  ext0 = ext1 = 0;
4838  last_chunk1 = last_chunk;
4839  tc0 = grainsize * n_tsk0;
4840  tc1 = tc - tc0;
4841  } else if (n_tsk0 <= extras) {
4842  gr_size0++; // integrate extras into grainsize
4843  ext0 = 0; // no extra iters in 1st half
4844  ext1 = extras - n_tsk0; // remaining extras
4845  tc0 = gr_size0 * n_tsk0;
4846  tc1 = tc - tc0;
4847  } else { // n_tsk0 > extras
4848  ext1 = 0; // no extra iters in 2nd half
4849  ext0 = extras;
4850  tc1 = grainsize * n_tsk1;
4851  tc0 = tc - tc1;
4852  }
4853  ub0 = lower + st * (tc0 - 1);
4854  lb1 = ub0 + st;
4855 
4856  // create pattern task for 2nd half of the loop
4857  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4858  // adjust lower bound (upper bound is not changed) for the 2nd half
4859  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4860  if (ptask_dup != NULL) // construct firstprivates, etc.
4861  ptask_dup(next_task, task, 0);
4862  *ub = ub0; // adjust upper bound for the 1st half
4863 
4864  // create auxiliary task for 2nd half of the loop
4865  // make sure new task has same parent task as the pattern task
4866  kmp_taskdata_t *current_task = thread->th.th_current_task;
4867  thread->th.th_current_task = taskdata->td_parent;
4868  kmp_task_t *new_task =
4869  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4870  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4871  // restore current task
4872  thread->th.th_current_task = current_task;
4873  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4874  p->task = next_task;
4875  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4876  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4877  p->task_dup = task_dup;
4878  p->st = st;
4879  p->ub_glob = ub_glob;
4880  p->num_tasks = n_tsk1;
4881  p->grainsize = grainsize;
4882  p->extras = ext1;
4883  p->last_chunk = last_chunk1;
4884  p->tc = tc1;
4885  p->num_t_min = num_t_min;
4886 #if OMPT_SUPPORT
4887  p->codeptr_ra = codeptr_ra;
4888 #endif
4889 
4890 #if OMPT_SUPPORT
4891  // schedule new task with correct return address for OMPT events
4892  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4893 #else
4894  __kmp_omp_task(gtid, new_task, true); // schedule new task
4895 #endif
4896 
4897  // execute the 1st half of current subrange
4898  if (n_tsk0 > num_t_min)
4899  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4900  ext0, last_chunk0, tc0, num_t_min,
4901 #if OMPT_SUPPORT
4902  codeptr_ra,
4903 #endif
4904  task_dup);
4905  else
4906  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4907  gr_size0, ext0, last_chunk0, tc0,
4908 #if OMPT_SUPPORT
4909  codeptr_ra,
4910 #endif
4911  task_dup);
4912 
4913  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4914 }
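/* Worked example of the split above (the n_tsk0 > extras branch):
   num_tasks = 7, grainsize = 3, extras = 2, tc = 23:
     n_tsk0 = 3, n_tsk1 = 4
     ext0 = extras = 2, ext1 = 0
     tc1 = grainsize * n_tsk1 = 12, tc0 = tc - tc1 = 11  (= 3*3 + 2)
   The first half (3 tasks, 11 iterations) is executed by this call; the second
   half (4 tasks, 12 iterations) is packed into a __taskloop_params_t and
   scheduled through __kmp_taskloop_task. */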
4915 
4916 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4917  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4918  int nogroup, int sched, kmp_uint64 grainsize,
4919  int modifier, void *task_dup) {
4920  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4921  KMP_DEBUG_ASSERT(task != NULL);
4922  if (nogroup == 0) {
4923 #if OMPT_SUPPORT && OMPT_OPTIONAL
4924  OMPT_STORE_RETURN_ADDRESS(gtid);
4925 #endif
4926  __kmpc_taskgroup(loc, gtid);
4927  }
4928 
4929  // =========================================================================
4930  // calculate loop parameters
4931  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4932  kmp_uint64 tc;
4933  // compiler provides global bounds here
4934  kmp_uint64 lower = task_bounds.get_lb();
4935  kmp_uint64 upper = task_bounds.get_ub();
4936  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4937  kmp_uint64 num_tasks = 0, extras = 0;
4938  kmp_int64 last_chunk =
4939  0; // reduce grainsize of last task by last_chunk in strict mode
4940  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4941  kmp_info_t *thread = __kmp_threads[gtid];
4942  kmp_taskdata_t *current_task = thread->th.th_current_task;
4943 
4944  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4945  "grain %llu(%d, %d), dup %p\n",
4946  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
4947  task_dup));
4948 
4949  // compute trip count
4950  if (st == 1) { // most common case
4951  tc = upper - lower + 1;
4952  } else if (st < 0) {
4953  tc = (lower - upper) / (-st) + 1;
4954  } else { // st > 0
4955  tc = (upper - lower) / st + 1;
4956  }
4957  if (tc == 0) {
4958  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
4959  // free the pattern task and exit
4960  __kmp_task_start(gtid, task, current_task);
4961  // do not execute anything for zero-trip loop
4962  __kmp_task_finish<false>(gtid, task, current_task);
4963  return;
4964  }
4965 
4966 #if OMPT_SUPPORT && OMPT_OPTIONAL
4967  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4968  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4969  if (ompt_enabled.ompt_callback_work) {
4970  ompt_callbacks.ompt_callback(ompt_callback_work)(
4971  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4972  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4973  }
4974 #endif
4975 
4976  if (num_tasks_min == 0)
4977  // TODO: can we choose a better default heuristic?
4978  num_tasks_min =
4979  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4980 
4981  // compute num_tasks/grainsize based on the input provided
4982  switch (sched) {
4983  case 0: // no schedule clause specified, we can choose the default
4984  // let's try to schedule (team_size*10) tasks
4985  grainsize = thread->th.th_team_nproc * 10;
4986  KMP_FALLTHROUGH();
4987  case 2: // num_tasks provided
4988  if (grainsize > tc) {
4989  num_tasks = tc; // too big num_tasks requested, adjust values
4990  grainsize = 1;
4991  extras = 0;
4992  } else {
4993  num_tasks = grainsize;
4994  grainsize = tc / num_tasks;
4995  extras = tc % num_tasks;
4996  }
4997  break;
4998  case 1: // grainsize provided
4999  if (grainsize > tc) {
5000  num_tasks = 1;
5001  grainsize = tc; // too big grainsize requested, adjust values
5002  extras = 0;
5003  } else {
5004  if (modifier) {
5005  num_tasks = (tc + grainsize - 1) / grainsize;
5006  last_chunk = tc - (num_tasks * grainsize);
5007  extras = 0;
5008  } else {
5009  num_tasks = tc / grainsize;
5010  // adjust grainsize for balanced distribution of iterations
5011  grainsize = tc / num_tasks;
5012  extras = tc % num_tasks;
5013  }
5014  }
5015  break;
5016  default:
5017  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5018  }
5019 
5020  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5021  (last_chunk < 0 ? last_chunk : extras));
5022  KMP_DEBUG_ASSERT(num_tasks > extras);
5023  KMP_DEBUG_ASSERT(num_tasks > 0);
5024  // =========================================================================
5025 
5026  // check the value of the if clause first
5027  // Also force GOMP_taskloop (taskdata->td_flags.native) to take the linear path
5028  if (if_val == 0) { // if(0) specified, mark task as serial
5029  taskdata->td_flags.task_serial = 1;
5030  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5031  // always start serial tasks linearly
5032  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5033  grainsize, extras, last_chunk, tc,
5034 #if OMPT_SUPPORT
5035  OMPT_GET_RETURN_ADDRESS(0),
5036 #endif
5037  task_dup);
5038  // !taskdata->td_flags.native => currently force linear spawning of tasks
5039  // for GOMP_taskloop
5040  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5041  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5042  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5043  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5044  last_chunk));
5045  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5046  grainsize, extras, last_chunk, tc, num_tasks_min,
5047 #if OMPT_SUPPORT
5048  OMPT_GET_RETURN_ADDRESS(0),
5049 #endif
5050  task_dup);
5051  } else {
5052  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5053  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5054  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5055  last_chunk));
5056  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5057  grainsize, extras, last_chunk, tc,
5058 #if OMPT_SUPPORT
5059  OMPT_GET_RETURN_ADDRESS(0),
5060 #endif
5061  task_dup);
5062  }
5063 
5064 #if OMPT_SUPPORT && OMPT_OPTIONAL
5065  if (ompt_enabled.ompt_callback_work) {
5066  ompt_callbacks.ompt_callback(ompt_callback_work)(
5067  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5068  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5069  }
5070 #endif
5071 
5072  if (nogroup == 0) {
5073 #if OMPT_SUPPORT && OMPT_OPTIONAL
5074  OMPT_STORE_RETURN_ADDRESS(gtid);
5075 #endif
5076  __kmpc_end_taskgroup(loc, gtid);
5077  }
5078  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5079 }
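/* How the schedule encoding above works out numerically, for tc = 100:
     sched = 0 (no clause):  grainsize is set to th_team_nproc * 10,
                             then handled like sched = 2
     sched = 2 (num_tasks clause, value 8 arrives in the grainsize argument):
                             num_tasks = 8, grainsize = 12, extras = 4
     sched = 1, grainsize = 30:
                             num_tasks = 3, grainsize = 33, extras = 1
     sched = 1, grainsize = 30, modifier != 0 (strict):
                             num_tasks = 4, grainsize = 30, last_chunk = -20
   In every case tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras). */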
5080 
5097 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5098  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5099  int sched, kmp_uint64 grainsize, void *task_dup) {
5100  __kmp_assert_valid_gtid(gtid);
5101  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5102  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5103  0, task_dup);
5104  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5105 }
5106 
5124 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5125  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5126  int nogroup, int sched, kmp_uint64 grainsize,
5127  int modifier, void *task_dup) {
5128  __kmp_assert_valid_gtid(gtid);
5129  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5130  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5131  modifier, task_dup);
5132  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5133 }
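/* User-level forms that reach these two entry points (a sketch; the exact
   lowering is up to the compiler):

     #pragma omp taskloop num_tasks(8)           // -> __kmpc_taskloop,   sched = 2
     for (int i = 0; i < n; ++i) body(i);

     #pragma omp taskloop grainsize(strict: 30)  // -> __kmpc_taskloop_5, sched = 1,
     for (int i = 0; i < n; ++i) body(i);        //    modifier = 1 (OpenMP 5.1)
*/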