LLVM OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 #include "kmp_str.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
21 #endif
22 #if KMP_USE_HWLOC
23 // Copied from hwloc
24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25 #define HWLOC_GROUP_KIND_INTEL_TILE 103
26 #define HWLOC_GROUP_KIND_INTEL_DIE 104
27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28 #endif
29 #include <ctype.h>
30 
31 // The machine topology
32 kmp_topology_t *__kmp_topology = nullptr;
33 // KMP_HW_SUBSET environment variable
34 kmp_hw_subset_t *__kmp_hw_subset = nullptr;
35 
36 // Store the real or imagined machine hierarchy here
37 static hierarchy_info machine_hierarchy;
38 
39 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
40 
41 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
42  kmp_uint32 depth;
43  // The test below is true if affinity is available, but set to "none". Need to
44  // init on first use of hierarchical barrier.
45  if (TCR_1(machine_hierarchy.uninitialized))
46  machine_hierarchy.init(nproc);
47 
48  // Adjust the hierarchy in case num threads exceeds original
49  if (nproc > machine_hierarchy.base_num_threads)
50  machine_hierarchy.resize(nproc);
51 
52  depth = machine_hierarchy.depth;
53  KMP_DEBUG_ASSERT(depth > 0);
54 
55  thr_bar->depth = depth;
56  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
57  &(thr_bar->base_leaf_kids));
58  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
59 }
60 
61 static int nCoresPerPkg, nPackages;
62 static int __kmp_nThreadsPerCore;
63 #ifndef KMP_DFLT_NTH_CORES
64 static int __kmp_ncores;
65 #endif
66 
67 const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
68  switch (type) {
69  case KMP_HW_SOCKET:
70  return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
71  case KMP_HW_DIE:
72  return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
73  case KMP_HW_MODULE:
74  return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
75  case KMP_HW_TILE:
76  return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
77  case KMP_HW_NUMA:
78  return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
79  case KMP_HW_L3:
80  return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
81  case KMP_HW_L2:
82  return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
83  case KMP_HW_L1:
84  return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
85  case KMP_HW_LLC:
86  return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
87  case KMP_HW_CORE:
88  return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
89  case KMP_HW_THREAD:
90  return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
91  case KMP_HW_PROC_GROUP:
92  return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
93  }
94  return KMP_I18N_STR(Unknown);
95 }
96 
97 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
98  switch (type) {
99  case KMP_HW_SOCKET:
100  return ((plural) ? "sockets" : "socket");
101  case KMP_HW_DIE:
102  return ((plural) ? "dice" : "die");
103  case KMP_HW_MODULE:
104  return ((plural) ? "modules" : "module");
105  case KMP_HW_TILE:
106  return ((plural) ? "tiles" : "tile");
107  case KMP_HW_NUMA:
108  return ((plural) ? "numa_domains" : "numa_domain");
109  case KMP_HW_L3:
110  return ((plural) ? "l3_caches" : "l3_cache");
111  case KMP_HW_L2:
112  return ((plural) ? "l2_caches" : "l2_cache");
113  case KMP_HW_L1:
114  return ((plural) ? "l1_caches" : "l1_cache");
115  case KMP_HW_LLC:
116  return ((plural) ? "ll_caches" : "ll_cache");
117  case KMP_HW_CORE:
118  return ((plural) ? "cores" : "core");
119  case KMP_HW_THREAD:
120  return ((plural) ? "threads" : "thread");
121  case KMP_HW_PROC_GROUP:
122  return ((plural) ? "proc_groups" : "proc_group");
123  }
124  return ((plural) ? "unknowns" : "unknown");
125 }
126 
127 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
128  switch (type) {
129  case KMP_HW_CORE_TYPE_UNKNOWN:
130  return "unknown";
131 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
132  case KMP_HW_CORE_TYPE_ATOM:
133  return "Intel Atom(R) processor";
134  case KMP_HW_CORE_TYPE_CORE:
135  return "Intel(R) Core(TM) processor";
136 #endif
137  }
138  return "unknown";
139 }
140 
141 #if KMP_AFFINITY_SUPPORTED
142 // If affinity is supported, check the affinity
143 // verbose and warning flags before printing a warning
144 #define KMP_AFF_WARNING(s, ...) \
145  if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \
146  KMP_WARNING(__VA_ARGS__); \
147  }
148 #else
149 #define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
150 #endif
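
// A minimal, hypothetical usage sketch of the macro above (kept under #if 0,
// so it is not compiled): the first argument is a kmp_affinity_t settings
// object whose verbose/warnings flags gate the message, and the remaining
// arguments are the i18n message id plus its parameters, mirroring the calls
// later in this file.
#if 0
static void example_warn(kmp_affinity_t &affinity) { // illustrative helper only
  // With affinity support, printed only if verbose is set, or warnings are
  // set and affinity type != affinity_none.
  KMP_AFF_WARNING(affinity, AffHWSubsetAllFiltered);
}
#endif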
151 
153 // kmp_hw_thread_t methods
154 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
155  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
156  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
157  int depth = __kmp_topology->get_depth();
158  for (int level = 0; level < depth; ++level) {
159  if (ahwthread->ids[level] < bhwthread->ids[level])
160  return -1;
161  else if (ahwthread->ids[level] > bhwthread->ids[level])
162  return 1;
163  }
164  if (ahwthread->os_id < bhwthread->os_id)
165  return -1;
166  else if (ahwthread->os_id > bhwthread->os_id)
167  return 1;
168  return 0;
169 }
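
// A minimal, hypothetical sketch of how a qsort-style comparator like this is
// applied (assuming compare_ids is declared as a static member in
// kmp_affinity.h and <stdlib.h> comes in via kmp.h); kept under #if 0 so it
// is not compiled.
#if 0
static void example_sort_by_ids(kmp_hw_thread_t *threads, int n) { // illustrative helper only
  // Sort hardware threads by their per-level ids, breaking ties by OS proc id.
  qsort(threads, (size_t)n, sizeof(kmp_hw_thread_t),
        kmp_hw_thread_t::compare_ids);
}
#endif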
170 
171 #if KMP_AFFINITY_SUPPORTED
172 int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
173  int i;
174  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
175  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
176  int depth = __kmp_topology->get_depth();
177  int compact = __kmp_topology->compact;
178  KMP_DEBUG_ASSERT(compact >= 0);
179  KMP_DEBUG_ASSERT(compact <= depth);
180  for (i = 0; i < compact; i++) {
181  int j = depth - i - 1;
182  if (aa->sub_ids[j] < bb->sub_ids[j])
183  return -1;
184  if (aa->sub_ids[j] > bb->sub_ids[j])
185  return 1;
186  }
187  for (; i < depth; i++) {
188  int j = i - compact;
189  if (aa->sub_ids[j] < bb->sub_ids[j])
190  return -1;
191  if (aa->sub_ids[j] > bb->sub_ids[j])
192  return 1;
193  }
194  return 0;
195 }
196 #endif
197 
198 void kmp_hw_thread_t::print() const {
199  int depth = __kmp_topology->get_depth();
200  printf("%4d ", os_id);
201  for (int i = 0; i < depth; ++i) {
202  printf("%4d ", ids[i]);
203  }
204  if (attrs) {
205  if (attrs.is_core_type_valid())
206  printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type()));
207  if (attrs.is_core_eff_valid())
208  printf(" (eff=%d)", attrs.get_core_eff());
209  }
210  printf("\n");
211 }
212 
214 // kmp_topology_t methods
215 
216 // Add a layer to the topology based on the ids. Assume the topology
217 // is perfectly nested (i.e., no object has more than one parent)
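// Worked example: for hw_threads with (socket, core, thread) ids
//   {0,0,0}, {0,0,1}, {0,1,0}, {0,1,1}
// and new per-hw-thread ids {0, 0, 1, 1} (say, one L2 per core), the search
// below finds the new layer equal to the core layer, inserts it just above
// core, and yields ids {0,0,0,0}, {0,0,0,1}, {0,1,1,0}, {0,1,1,1}.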
218 void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
219  // Figure out where the layer should go by comparing the ids of the current
220  // layers with the new ids
221  int target_layer;
222  int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
223  int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
224 
225  // Start from the highest layer and work down to find target layer
226  // If new layer is equal to another layer then put the new layer above
227  for (target_layer = 0; target_layer < depth; ++target_layer) {
228  bool layers_equal = true;
229  bool strictly_above_target_layer = false;
230  for (int i = 0; i < num_hw_threads; ++i) {
231  int id = hw_threads[i].ids[target_layer];
232  int new_id = ids[i];
233  if (id != previous_id && new_id == previous_new_id) {
234  // Found the layer we are strictly above
235  strictly_above_target_layer = true;
236  layers_equal = false;
237  break;
238  } else if (id == previous_id && new_id != previous_new_id) {
239  // Found a layer we are below. Move to next layer and check.
240  layers_equal = false;
241  break;
242  }
243  previous_id = id;
244  previous_new_id = new_id;
245  }
246  if (strictly_above_target_layer || layers_equal)
247  break;
248  }
249 
250  // Found the layer we are above. Now move everything to accommodate the new
251  // layer. And put the new ids and type into the topology.
252  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
253  types[j] = types[i];
254  types[target_layer] = type;
255  for (int k = 0; k < num_hw_threads; ++k) {
256  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
257  hw_threads[k].ids[j] = hw_threads[k].ids[i];
258  hw_threads[k].ids[target_layer] = ids[k];
259  }
260  equivalent[type] = type;
261  depth++;
262 }
263 
264 #if KMP_GROUP_AFFINITY
265 // Insert the Windows Processor Group structure into the topology
266 void kmp_topology_t::_insert_windows_proc_groups() {
267  // Do not insert the processor group structure for a single group
268  if (__kmp_num_proc_groups == 1)
269  return;
270  kmp_affin_mask_t *mask;
271  int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
272  KMP_CPU_ALLOC(mask);
273  for (int i = 0; i < num_hw_threads; ++i) {
274  KMP_CPU_ZERO(mask);
275  KMP_CPU_SET(hw_threads[i].os_id, mask);
276  ids[i] = __kmp_get_proc_group(mask);
277  }
278  KMP_CPU_FREE(mask);
279  _insert_layer(KMP_HW_PROC_GROUP, ids);
280  __kmp_free(ids);
281 }
282 #endif
283 
284 // Remove layers that don't add information to the topology: when two adjacent
285 // layers are radix 1 (in one-to-one correspondence), the lower-preference layer
// is removed and recorded as equivalent to the layer that is kept.
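// Example: on a machine with exactly one die per socket, the socket/die pair
// is radix 1; die has the lower preference, so the die layer is dropped and
// recorded as equivalent to socket.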
286 void kmp_topology_t::_remove_radix1_layers() {
287  int preference[KMP_HW_LAST];
288  int top_index1, top_index2;
289  // Set up preference associative array
290  preference[KMP_HW_SOCKET] = 110;
291  preference[KMP_HW_PROC_GROUP] = 100;
292  preference[KMP_HW_CORE] = 95;
293  preference[KMP_HW_THREAD] = 90;
294  preference[KMP_HW_NUMA] = 85;
295  preference[KMP_HW_DIE] = 80;
296  preference[KMP_HW_TILE] = 75;
297  preference[KMP_HW_MODULE] = 73;
298  preference[KMP_HW_L3] = 70;
299  preference[KMP_HW_L2] = 65;
300  preference[KMP_HW_L1] = 60;
301  preference[KMP_HW_LLC] = 5;
302  top_index1 = 0;
303  top_index2 = 1;
304  while (top_index1 < depth - 1 && top_index2 < depth) {
305  kmp_hw_t type1 = types[top_index1];
306  kmp_hw_t type2 = types[top_index2];
307  KMP_ASSERT_VALID_HW_TYPE(type1);
308  KMP_ASSERT_VALID_HW_TYPE(type2);
309  // Do not allow the three main topology levels (sockets, cores, threads) to
310  // be compacted down
311  if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
312  type1 == KMP_HW_SOCKET) &&
313  (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
314  type2 == KMP_HW_SOCKET)) {
315  top_index1 = top_index2++;
316  continue;
317  }
318  bool radix1 = true;
319  bool all_same = true;
320  int id1 = hw_threads[0].ids[top_index1];
321  int id2 = hw_threads[0].ids[top_index2];
322  int pref1 = preference[type1];
323  int pref2 = preference[type2];
324  for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
325  if (hw_threads[hwidx].ids[top_index1] == id1 &&
326  hw_threads[hwidx].ids[top_index2] != id2) {
327  radix1 = false;
328  break;
329  }
330  if (hw_threads[hwidx].ids[top_index2] != id2)
331  all_same = false;
332  id1 = hw_threads[hwidx].ids[top_index1];
333  id2 = hw_threads[hwidx].ids[top_index2];
334  }
335  if (radix1) {
336  // Select the layer to remove based on preference
337  kmp_hw_t remove_type, keep_type;
338  int remove_layer, remove_layer_ids;
339  if (pref1 > pref2) {
340  remove_type = type2;
341  remove_layer = remove_layer_ids = top_index2;
342  keep_type = type1;
343  } else {
344  remove_type = type1;
345  remove_layer = remove_layer_ids = top_index1;
346  keep_type = type2;
347  }
348  // If all the indexes for the second (deeper) layer are the same,
349  // e.g., all are zero, then make sure to keep the first layer's ids
350  if (all_same)
351  remove_layer_ids = top_index2;
352  // Remove radix one type by setting the equivalence, removing the id from
353  // the hw threads and removing the layer from types and depth
354  set_equivalent_type(remove_type, keep_type);
355  for (int idx = 0; idx < num_hw_threads; ++idx) {
356  kmp_hw_thread_t &hw_thread = hw_threads[idx];
357  for (int d = remove_layer_ids; d < depth - 1; ++d)
358  hw_thread.ids[d] = hw_thread.ids[d + 1];
359  }
360  for (int idx = remove_layer; idx < depth - 1; ++idx)
361  types[idx] = types[idx + 1];
362  depth--;
363  } else {
364  top_index1 = top_index2++;
365  }
366  }
367  KMP_ASSERT(depth > 0);
368 }
369 
370 void kmp_topology_t::_set_last_level_cache() {
371  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
372  set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
373  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
374  set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
375 #if KMP_MIC_SUPPORTED
376  else if (__kmp_mic_type == mic3) {
377  if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
378  set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
379  else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
380  set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
381  // L2/Tile wasn't detected so just say L1
382  else
383  set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
384  }
385 #endif
386  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
387  set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
388  // Fallback is to set last level cache to socket or core
389  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
390  if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
391  set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
392  else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
393  set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
394  }
395  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
396 }
397 
398 // Gather the count of each topology layer and the ratio
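// For example, a uniform 2-socket, 4-core-per-socket, 2-thread-per-core
// machine yields ratio = {2, 4, 2} (sockets, cores/socket, threads/core) and
// count = {2, 8, 16} (total objects at each level).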
399 void kmp_topology_t::_gather_enumeration_information() {
400  int previous_id[KMP_HW_LAST];
401  int max[KMP_HW_LAST];
402 
403  for (int i = 0; i < depth; ++i) {
404  previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
405  max[i] = 0;
406  count[i] = 0;
407  ratio[i] = 0;
408  }
409  int core_level = get_level(KMP_HW_CORE);
410  for (int i = 0; i < num_hw_threads; ++i) {
411  kmp_hw_thread_t &hw_thread = hw_threads[i];
412  for (int layer = 0; layer < depth; ++layer) {
413  int id = hw_thread.ids[layer];
414  if (id != previous_id[layer]) {
415  // Add an additional increment to each count
416  for (int l = layer; l < depth; ++l)
417  count[l]++;
418  // Keep track of topology layer ratio statistics
419  max[layer]++;
420  for (int l = layer + 1; l < depth; ++l) {
421  if (max[l] > ratio[l])
422  ratio[l] = max[l];
423  max[l] = 1;
424  }
425  // Figure out the number of different core types
426  // and efficiencies for hybrid CPUs
427  if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
428  if (hw_thread.attrs.is_core_eff_valid() &&
429  hw_thread.attrs.core_eff >= num_core_efficiencies) {
430  // Because efficiencies can range from 0 to max efficiency - 1,
431  // the number of efficiencies is max efficiency + 1
432  num_core_efficiencies = hw_thread.attrs.core_eff + 1;
433  }
434  if (hw_thread.attrs.is_core_type_valid()) {
435  bool found = false;
436  for (int j = 0; j < num_core_types; ++j) {
437  if (hw_thread.attrs.get_core_type() == core_types[j]) {
438  found = true;
439  break;
440  }
441  }
442  if (!found) {
443  KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES);
444  core_types[num_core_types++] = hw_thread.attrs.get_core_type();
445  }
446  }
447  }
448  break;
449  }
450  }
451  for (int layer = 0; layer < depth; ++layer) {
452  previous_id[layer] = hw_thread.ids[layer];
453  }
454  }
455  for (int layer = 0; layer < depth; ++layer) {
456  if (max[layer] > ratio[layer])
457  ratio[layer] = max[layer];
458  }
459 }
460 
461 int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
462  int above_level,
463  bool find_all) const {
464  int current, current_max;
465  int previous_id[KMP_HW_LAST];
466  for (int i = 0; i < depth; ++i)
467  previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
468  int core_level = get_level(KMP_HW_CORE);
469  if (find_all)
470  above_level = -1;
471  KMP_ASSERT(above_level < core_level);
472  current_max = 0;
473  current = 0;
474  for (int i = 0; i < num_hw_threads; ++i) {
475  kmp_hw_thread_t &hw_thread = hw_threads[i];
476  if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) {
477  if (current > current_max)
478  current_max = current;
479  current = hw_thread.attrs.contains(attr);
480  } else {
481  for (int level = above_level + 1; level <= core_level; ++level) {
482  if (hw_thread.ids[level] != previous_id[level]) {
483  if (hw_thread.attrs.contains(attr))
484  current++;
485  break;
486  }
487  }
488  }
489  for (int level = 0; level < depth; ++level)
490  previous_id[level] = hw_thread.ids[level];
491  }
492  if (current > current_max)
493  current_max = current;
494  return current_max;
495 }
496 
497 // Find out if the topology is uniform
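// e.g., ratio = {2, 4, 2} gives num = 16, matching count[depth - 1] on the
// uniform machine described above; one 6-core plus one 4-core socket with two
// threads per core would give num = 2 * 6 * 2 = 24 but only 20 hardware
// threads, so it is non-uniform.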
498 void kmp_topology_t::_discover_uniformity() {
499  int num = 1;
500  for (int level = 0; level < depth; ++level)
501  num *= ratio[level];
502  flags.uniform = (num == count[depth - 1]);
503 }
504 
505 // Set all the sub_ids for each hardware thread
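// Unlike ids, which may be arbitrary hardware values, sub_ids form a dense
// numbering: e.g., physical package ids {0, 3} become sub_ids {0, 1}, and
// each deeper level restarts at 0 under every new parent object.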
506 void kmp_topology_t::_set_sub_ids() {
507  int previous_id[KMP_HW_LAST];
508  int sub_id[KMP_HW_LAST];
509 
510  for (int i = 0; i < depth; ++i) {
511  previous_id[i] = -1;
512  sub_id[i] = -1;
513  }
514  for (int i = 0; i < num_hw_threads; ++i) {
515  kmp_hw_thread_t &hw_thread = hw_threads[i];
516  // Setup the sub_id
517  for (int j = 0; j < depth; ++j) {
518  if (hw_thread.ids[j] != previous_id[j]) {
519  sub_id[j]++;
520  for (int k = j + 1; k < depth; ++k) {
521  sub_id[k] = 0;
522  }
523  break;
524  }
525  }
526  // Set previous_id
527  for (int j = 0; j < depth; ++j) {
528  previous_id[j] = hw_thread.ids[j];
529  }
530  // Set the sub_ids field
531  for (int j = 0; j < depth; ++j) {
532  hw_thread.sub_ids[j] = sub_id[j];
533  }
534  }
535 }
536 
537 void kmp_topology_t::_set_globals() {
538  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
539  int core_level, thread_level, package_level;
540  package_level = get_level(KMP_HW_SOCKET);
541 #if KMP_GROUP_AFFINITY
542  if (package_level == -1)
543  package_level = get_level(KMP_HW_PROC_GROUP);
544 #endif
545  core_level = get_level(KMP_HW_CORE);
546  thread_level = get_level(KMP_HW_THREAD);
547 
548  KMP_ASSERT(core_level != -1);
549  KMP_ASSERT(thread_level != -1);
550 
551  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
552  if (package_level != -1) {
553  nCoresPerPkg = calculate_ratio(core_level, package_level);
554  nPackages = get_count(package_level);
555  } else {
556  // assume one socket
557  nCoresPerPkg = get_count(core_level);
558  nPackages = 1;
559  }
560 #ifndef KMP_DFLT_NTH_CORES
561  __kmp_ncores = get_count(core_level);
562 #endif
563 }
564 
565 kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
566  const kmp_hw_t *types) {
567  kmp_topology_t *retval;
568  // Allocate all data in one large allocation
569  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
570  sizeof(int) * (size_t)KMP_HW_LAST * 3;
571  char *bytes = (char *)__kmp_allocate(size);
572  retval = (kmp_topology_t *)bytes;
573  if (nproc > 0) {
574  retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
575  } else {
576  retval->hw_threads = nullptr;
577  }
578  retval->num_hw_threads = nproc;
579  retval->depth = ndepth;
580  int *arr =
581  (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
582  retval->types = (kmp_hw_t *)arr;
583  retval->ratio = arr + (size_t)KMP_HW_LAST;
584  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
585  retval->num_core_efficiencies = 0;
586  retval->num_core_types = 0;
587  retval->compact = 0;
588  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
589  retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
590  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
591  for (int i = 0; i < ndepth; ++i) {
592  retval->types[i] = types[i];
593  retval->equivalent[types[i]] = types[i];
594  }
595  return retval;
596 }
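
// A minimal, hypothetical construction sketch (assumes the at() and clear()
// helpers declared in kmp_affinity.h); the __kmp_affinity_create_*_map()
// routines later in this file follow this allocate / fill ids / canonicalize
// pattern. Kept under #if 0 so it is not compiled.
#if 0
static void example_build_flat_topology(int nproc) { // illustrative helper only
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
  __kmp_topology = kmp_topology_t::allocate(nproc, 3, types);
  for (int i = 0; i < nproc; ++i) {
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = 0;     // single socket (assumed for the sketch)
    hw_thread.ids[1] = i / 2; // core
    hw_thread.ids[2] = i % 2; // thread within core
  }
  __kmp_topology->canonicalize();
}
#endif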
597 
598 void kmp_topology_t::deallocate(kmp_topology_t *topology) {
599  if (topology)
600  __kmp_free(topology);
601 }
602 
603 bool kmp_topology_t::check_ids() const {
604  // Assume ids have been sorted
605  if (num_hw_threads == 0)
606  return true;
607  for (int i = 1; i < num_hw_threads; ++i) {
608  kmp_hw_thread_t &current_thread = hw_threads[i];
609  kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
610  bool unique = false;
611  for (int j = 0; j < depth; ++j) {
612  if (previous_thread.ids[j] != current_thread.ids[j]) {
613  unique = true;
614  break;
615  }
616  }
617  if (unique)
618  continue;
619  return false;
620  }
621  return true;
622 }
623 
624 void kmp_topology_t::dump() const {
625  printf("***********************\n");
626  printf("*** __kmp_topology: ***\n");
627  printf("***********************\n");
628  printf("* depth: %d\n", depth);
629 
630  printf("* types: ");
631  for (int i = 0; i < depth; ++i)
632  printf("%15s ", __kmp_hw_get_keyword(types[i]));
633  printf("\n");
634 
635  printf("* ratio: ");
636  for (int i = 0; i < depth; ++i) {
637  printf("%15d ", ratio[i]);
638  }
639  printf("\n");
640 
641  printf("* count: ");
642  for (int i = 0; i < depth; ++i) {
643  printf("%15d ", count[i]);
644  }
645  printf("\n");
646 
647  printf("* num_core_eff: %d\n", num_core_efficiencies);
648  printf("* num_core_types: %d\n", num_core_types);
649  printf("* core_types: ");
650  for (int i = 0; i < num_core_types; ++i)
651  printf("%3d ", core_types[i]);
652  printf("\n");
653 
654  printf("* equivalent map:\n");
655  KMP_FOREACH_HW_TYPE(i) {
656  const char *key = __kmp_hw_get_keyword(i);
657  const char *value = __kmp_hw_get_keyword(equivalent[i]);
658  printf("%-15s -> %-15s\n", key, value);
659  }
660 
661  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
662 
663  printf("* num_hw_threads: %d\n", num_hw_threads);
664  printf("* hw_threads:\n");
665  for (int i = 0; i < num_hw_threads; ++i) {
666  hw_threads[i].print();
667  }
668  printf("***********************\n");
669 }
670 
671 void kmp_topology_t::print(const char *env_var) const {
672  kmp_str_buf_t buf;
673  int print_types_depth;
674  __kmp_str_buf_init(&buf);
675  kmp_hw_t print_types[KMP_HW_LAST + 2];
676 
677  // Num Available Threads
678  if (num_hw_threads) {
679  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
680  } else {
681  KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc);
682  }
683 
684  // Uniform or not
685  if (is_uniform()) {
686  KMP_INFORM(Uniform, env_var);
687  } else {
688  KMP_INFORM(NonUniform, env_var);
689  }
690 
691  // Equivalent types
692  KMP_FOREACH_HW_TYPE(type) {
693  kmp_hw_t eq_type = equivalent[type];
694  if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
695  KMP_INFORM(AffEqualTopologyTypes, env_var,
696  __kmp_hw_get_catalog_string(type),
697  __kmp_hw_get_catalog_string(eq_type));
698  }
699  }
700 
701  // Quick topology
702  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
703  // Create a print types array that always guarantees printing
704  // the core and thread level
705  print_types_depth = 0;
706  for (int level = 0; level < depth; ++level)
707  print_types[print_types_depth++] = types[level];
708  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
709  // Force in the core level for quick topology
710  if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
711  // Force core before thread e.g., 1 socket X 2 threads/socket
712  // becomes 1 socket X 1 core/socket X 2 threads/socket
713  print_types[print_types_depth - 1] = KMP_HW_CORE;
714  print_types[print_types_depth++] = KMP_HW_THREAD;
715  } else {
716  print_types[print_types_depth++] = KMP_HW_CORE;
717  }
718  }
719  // Always put threads at very end of quick topology
720  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
721  print_types[print_types_depth++] = KMP_HW_THREAD;
722 
723  __kmp_str_buf_clear(&buf);
724  kmp_hw_t numerator_type;
725  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
726  int core_level = get_level(KMP_HW_CORE);
727  int ncores = get_count(core_level);
728 
729  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
730  int c;
731  bool plural;
732  numerator_type = print_types[plevel];
733  KMP_ASSERT_VALID_HW_TYPE(numerator_type);
734  if (equivalent[numerator_type] != numerator_type)
735  c = 1;
736  else
737  c = get_ratio(level++);
738  plural = (c > 1);
739  if (plevel == 0) {
740  __kmp_str_buf_print(&buf, "%d %s", c,
741  __kmp_hw_get_catalog_string(numerator_type, plural));
742  } else {
743  __kmp_str_buf_print(&buf, " x %d %s/%s", c,
744  __kmp_hw_get_catalog_string(numerator_type, plural),
745  __kmp_hw_get_catalog_string(denominator_type));
746  }
747  denominator_type = numerator_type;
748  }
749  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
750 
751  // Hybrid topology information
752  if (__kmp_is_hybrid_cpu()) {
753  for (int i = 0; i < num_core_types; ++i) {
754  kmp_hw_core_type_t core_type = core_types[i];
755  kmp_hw_attr_t attr;
756  attr.clear();
757  attr.set_core_type(core_type);
758  int ncores = get_ncores_with_attr(attr);
759  if (ncores > 0) {
760  KMP_INFORM(TopologyHybrid, env_var, ncores,
761  __kmp_hw_get_core_type_string(core_type));
762  KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS)
763  for (int eff = 0; eff < num_core_efficiencies; ++eff) {
764  attr.set_core_eff(eff);
765  int ncores_with_eff = get_ncores_with_attr(attr);
766  if (ncores_with_eff > 0) {
767  KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff);
768  }
769  }
770  }
771  }
772  }
773 
774  if (num_hw_threads <= 0) {
775  __kmp_str_buf_free(&buf);
776  return;
777  }
778 
779  // Full OS proc to hardware thread map
780  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
781  for (int i = 0; i < num_hw_threads; i++) {
782  __kmp_str_buf_clear(&buf);
783  for (int level = 0; level < depth; ++level) {
784  kmp_hw_t type = types[level];
785  __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
786  __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
787  }
788  if (__kmp_is_hybrid_cpu())
789  __kmp_str_buf_print(
790  &buf, "(%s)",
791  __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type()));
792  KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
793  }
794 
795  __kmp_str_buf_free(&buf);
796 }
797 
798 #if KMP_AFFINITY_SUPPORTED
799 void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
800  const char *env_var = affinity.env_var;
801  // Set the number of affinity granularity levels
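  // (gran_levels counts the topology levels below the granularity level; e.g.,
  // with a socket/core/thread topology and granularity=core it ends up as 1,
  // since only the thread level lies below the core level.)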
802  if (affinity.gran_levels < 0) {
803  kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
804  // Check if user's granularity request is valid
805  if (gran_type == KMP_HW_UNKNOWN) {
806  // First try core, then thread, then package
807  kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
808  for (auto g : gran_types) {
809  if (get_equivalent_type(g) != KMP_HW_UNKNOWN) {
810  gran_type = g;
811  break;
812  }
813  }
814  KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
815  // Warn user what granularity setting will be used instead
816  KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
817  __kmp_hw_get_catalog_string(affinity.gran),
818  __kmp_hw_get_catalog_string(gran_type));
819  affinity.gran = gran_type;
820  }
821 #if KMP_GROUP_AFFINITY
822  // If more than one processor group exists, and the level of
823  // granularity specified by the user is too coarse, then the
824  // granularity must be adjusted "down" to processor group affinity
825  // because threads can only exist within one processor group.
826  // For example, if a user sets granularity=socket and there are two
827  // processor groups that cover a socket, then the runtime must
828  // restrict the granularity down to the processor group level.
829  if (__kmp_num_proc_groups > 1) {
830  int gran_depth = get_level(gran_type);
831  int proc_group_depth = get_level(KMP_HW_PROC_GROUP);
832  if (gran_depth >= 0 && proc_group_depth >= 0 &&
833  gran_depth < proc_group_depth) {
834  KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var,
835  __kmp_hw_get_catalog_string(affinity.gran));
836  affinity.gran = gran_type = KMP_HW_PROC_GROUP;
837  }
838  }
839 #endif
840  affinity.gran_levels = 0;
841  for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
842  affinity.gran_levels++;
843  }
844 }
845 #endif
846 
847 void kmp_topology_t::canonicalize() {
848 #if KMP_GROUP_AFFINITY
849  _insert_windows_proc_groups();
850 #endif
851  _remove_radix1_layers();
852  _gather_enumeration_information();
853  _discover_uniformity();
854  _set_sub_ids();
855  _set_globals();
856  _set_last_level_cache();
857 
858 #if KMP_MIC_SUPPORTED
859  // Manually add L2 = Tile equivalence
860  if (__kmp_mic_type == mic3) {
861  if (get_level(KMP_HW_L2) != -1)
862  set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
863  else if (get_level(KMP_HW_TILE) != -1)
864  set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
865  }
866 #endif
867 
868  // Perform post canonicalization checking
869  KMP_ASSERT(depth > 0);
870  for (int level = 0; level < depth; ++level) {
871  // All counts, ratios, and types must be valid
872  KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
873  KMP_ASSERT_VALID_HW_TYPE(types[level]);
874  // Detected types must point to themselves
875  KMP_ASSERT(equivalent[types[level]] == types[level]);
876  }
877 }
878 
879 // Canonicalize an explicit packages X cores/pkg X threads/core topology
880 void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
881  int nthreads_per_core, int ncores) {
882  int ndepth = 3;
883  depth = ndepth;
884  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
885  for (int level = 0; level < depth; ++level) {
886  count[level] = 0;
887  ratio[level] = 0;
888  }
889  count[0] = npackages;
890  count[1] = ncores;
891  count[2] = __kmp_xproc;
892  ratio[0] = npackages;
893  ratio[1] = ncores_per_pkg;
894  ratio[2] = nthreads_per_core;
895  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
896  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
897  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
898  types[0] = KMP_HW_SOCKET;
899  types[1] = KMP_HW_CORE;
900  types[2] = KMP_HW_THREAD;
901  //__kmp_avail_proc = __kmp_xproc;
902  _discover_uniformity();
903 }
904 
905 // Represents running sub IDs for a single core attribute where
906 // attribute values have SIZE possibilities.
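// filter_hw_subset() below instantiates this with core_type_indexer and
// core_eff_indexer so that, while walking the sorted hardware threads, it can
// track a running "n-th core with this core type / efficiency" sub id.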
907 template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
908  int last_level; // last level in topology to consider for sub_ids
909  int sub_id[SIZE]; // The sub ID for a given attribute value
910  int prev_sub_id[KMP_HW_LAST];
911  IndexFunc indexer;
912 
913 public:
914  kmp_sub_ids_t(int last_level) : last_level(last_level) {
915  KMP_ASSERT(last_level < KMP_HW_LAST);
916  for (size_t i = 0; i < SIZE; ++i)
917  sub_id[i] = -1;
918  for (size_t i = 0; i < KMP_HW_LAST; ++i)
919  prev_sub_id[i] = -1;
920  }
921  void update(const kmp_hw_thread_t &hw_thread) {
922  int idx = indexer(hw_thread);
923  KMP_ASSERT(idx < (int)SIZE);
924  for (int level = 0; level <= last_level; ++level) {
925  if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
926  if (level < last_level)
927  sub_id[idx] = -1;
928  sub_id[idx]++;
929  break;
930  }
931  }
932  for (int level = 0; level <= last_level; ++level)
933  prev_sub_id[level] = hw_thread.sub_ids[level];
934  }
935  int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
936  return sub_id[indexer(hw_thread)];
937  }
938 };
939 
940 static kmp_str_buf_t *
941 __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
942  bool plural) {
943  __kmp_str_buf_init(buf);
944  if (attr.is_core_type_valid())
945  __kmp_str_buf_print(buf, "%s %s",
946  __kmp_hw_get_core_type_string(attr.get_core_type()),
947  __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
948  else
949  __kmp_str_buf_print(buf, "%s eff=%d",
950  __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
951  attr.get_core_eff());
952  return buf;
953 }
954 
955 // Apply the KMP_HW_SUBSET environment variable to the topology
956 // Returns true if KMP_HW_SUBSET filtered any processors
957 // otherwise, returns false
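// For example (illustrative syntax), KMP_HW_SUBSET=1s,2c,1t keeps one socket,
// the first two cores in it, and one thread per core; every other hardware
// thread is filtered out of the topology and, when affinity is supported,
// cleared from the full affinity mask.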
958 bool kmp_topology_t::filter_hw_subset() {
959  // If KMP_HW_SUBSET wasn't requested, then do nothing.
960  if (!__kmp_hw_subset)
961  return false;
962 
963  // First, sort the KMP_HW_SUBSET items by the machine topology
964  __kmp_hw_subset->sort();
965 
966  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
967  bool using_core_types = false;
968  bool using_core_effs = false;
969  int hw_subset_depth = __kmp_hw_subset->get_depth();
970  kmp_hw_t specified[KMP_HW_LAST];
971  int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
972  KMP_ASSERT(hw_subset_depth > 0);
973  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
974  int core_level = get_level(KMP_HW_CORE);
975  for (int i = 0; i < hw_subset_depth; ++i) {
976  int max_count;
977  const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
978  int num = item.num[0];
979  int offset = item.offset[0];
980  kmp_hw_t type = item.type;
981  kmp_hw_t equivalent_type = equivalent[type];
982  int level = get_level(type);
983  topology_levels[i] = level;
984 
985  // Check to see if current layer is in detected machine topology
986  if (equivalent_type != KMP_HW_UNKNOWN) {
987  __kmp_hw_subset->at(i).type = equivalent_type;
988  } else {
989  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
990  __kmp_hw_get_catalog_string(type));
991  return false;
992  }
993 
994  // Check to see if current layer has already been
995  // specified either directly or through an equivalent type
996  if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
997  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
998  __kmp_hw_get_catalog_string(type),
999  __kmp_hw_get_catalog_string(specified[equivalent_type]));
1000  return false;
1001  }
1002  specified[equivalent_type] = type;
1003 
1004  // Check to see if each layer's num & offset parameters are valid
1005  max_count = get_ratio(level);
1006  if (max_count < 0 ||
1007  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1008  bool plural = (num > 1);
1009  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
1010  __kmp_hw_get_catalog_string(type, plural));
1011  return false;
1012  }
1013 
1014  // Check to see if core attributes are consistent
1015  if (core_level == level) {
1016  // Determine which core attributes are specified
1017  for (int j = 0; j < item.num_attrs; ++j) {
1018  if (item.attr[j].is_core_type_valid())
1019  using_core_types = true;
1020  if (item.attr[j].is_core_eff_valid())
1021  using_core_effs = true;
1022  }
1023 
1024  // Check if using a single core attribute on non-hybrid arch.
1025  // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
1026  //
1027  // Check if using multiple core attributes on non-hybrid arch.
1028  // Ignore all of KMP_HW_SUBSET if this is the case.
1029  if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
1030  if (item.num_attrs == 1) {
1031  if (using_core_effs) {
1032  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1033  "efficiency");
1034  } else {
1035  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1036  "core_type");
1037  }
1038  using_core_effs = false;
1039  using_core_types = false;
1040  } else {
1041  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid);
1042  return false;
1043  }
1044  }
1045 
1046  // Check if using both core types and core efficiencies together
1047  if (using_core_types && using_core_effs) {
1048  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type",
1049  "efficiency");
1050  return false;
1051  }
1052 
1053  // Check that core efficiency values are valid
1054  if (using_core_effs) {
1055  for (int j = 0; j < item.num_attrs; ++j) {
1056  if (item.attr[j].is_core_eff_valid()) {
1057  int core_eff = item.attr[j].get_core_eff();
1058  if (core_eff < 0 || core_eff >= num_core_efficiencies) {
1059  kmp_str_buf_t buf;
1060  __kmp_str_buf_init(&buf);
1061  __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff());
1062  __kmp_msg(kmp_ms_warning,
1063  KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str),
1064  KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
1065  __kmp_msg_null);
1066  __kmp_str_buf_free(&buf);
1067  return false;
1068  }
1069  }
1070  }
1071  }
1072 
1073  // Check that the number of requested cores with attributes is valid
1074  if (using_core_types || using_core_effs) {
1075  for (int j = 0; j < item.num_attrs; ++j) {
1076  int num = item.num[j];
1077  int offset = item.offset[j];
1078  int level_above = core_level - 1;
1079  if (level_above >= 0) {
1080  max_count = get_ncores_with_attr_per(item.attr[j], level_above);
1081  if (max_count <= 0 ||
1082  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1083  kmp_str_buf_t buf;
1084  __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
1085  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str);
1086  __kmp_str_buf_free(&buf);
1087  return false;
1088  }
1089  }
1090  }
1091  }
1092 
1093  if ((using_core_types || using_core_effs) && item.num_attrs > 1) {
1094  for (int j = 0; j < item.num_attrs; ++j) {
1095  // Ambiguous use of specific core attribute + generic core
1096  // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
1097  if (!item.attr[j]) {
1098  kmp_hw_attr_t other_attr;
1099  for (int k = 0; k < item.num_attrs; ++k) {
1100  if (item.attr[k] != item.attr[j]) {
1101  other_attr = item.attr[k];
1102  break;
1103  }
1104  }
1105  kmp_str_buf_t buf;
1106  __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
1107  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat,
1108  __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
1109  __kmp_str_buf_free(&buf);
1110  return false;
1111  }
1112  // Allow specifying a specific core type or core eff exactly once
1113  for (int k = 0; k < j; ++k) {
1114  if (!item.attr[j] || !item.attr[k])
1115  continue;
1116  if (item.attr[k] == item.attr[j]) {
1117  kmp_str_buf_t buf;
1118  __kmp_hw_get_catalog_core_string(item.attr[j], &buf,
1119  item.num[j] > 0);
1120  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str);
1121  __kmp_str_buf_free(&buf);
1122  return false;
1123  }
1124  }
1125  }
1126  }
1127  }
1128  }
1129 
1130  struct core_type_indexer {
1131  int operator()(const kmp_hw_thread_t &t) const {
1132  switch (t.attrs.get_core_type()) {
1133 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1134  case KMP_HW_CORE_TYPE_ATOM:
1135  return 1;
1136  case KMP_HW_CORE_TYPE_CORE:
1137  return 2;
1138 #endif
1139  case KMP_HW_CORE_TYPE_UNKNOWN:
1140  return 0;
1141  }
1142  KMP_ASSERT(0);
1143  return 0;
1144  }
1145  };
1146  struct core_eff_indexer {
1147  int operator()(const kmp_hw_thread_t &t) const {
1148  return t.attrs.get_core_eff();
1149  }
1150  };
1151 
1152  kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
1153  core_level);
1154  kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
1155  core_level);
1156 
1157  // Determine which hardware threads should be filtered.
1158  int num_filtered = 0;
1159  bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads);
1160  for (int i = 0; i < num_hw_threads; ++i) {
1161  kmp_hw_thread_t &hw_thread = hw_threads[i];
1162  // Update type_sub_id
1163  if (using_core_types)
1164  core_type_sub_ids.update(hw_thread);
1165  if (using_core_effs)
1166  core_eff_sub_ids.update(hw_thread);
1167 
1168  // Check to see if this hardware thread should be filtered
1169  bool should_be_filtered = false;
1170  for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth;
1171  ++hw_subset_index) {
1172  const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
1173  int level = topology_levels[hw_subset_index];
1174  if (level == -1)
1175  continue;
1176  if ((using_core_effs || using_core_types) && level == core_level) {
1177  // Look for the core attribute in KMP_HW_SUBSET which corresponds
1178  // to this hardware thread's core attribute. Use this num,offset plus
1179  // the running sub_id for the particular core attribute of this hardware
1180  // thread to determine if the hardware thread should be filtered or not.
1181  int attr_idx;
1182  kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type();
1183  int core_eff = hw_thread.attrs.get_core_eff();
1184  for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) {
1185  if (using_core_types &&
1186  hw_subset_item.attr[attr_idx].get_core_type() == core_type)
1187  break;
1188  if (using_core_effs &&
1189  hw_subset_item.attr[attr_idx].get_core_eff() == core_eff)
1190  break;
1191  }
1192  // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
1193  if (attr_idx == hw_subset_item.num_attrs) {
1194  should_be_filtered = true;
1195  break;
1196  }
1197  int sub_id;
1198  int num = hw_subset_item.num[attr_idx];
1199  int offset = hw_subset_item.offset[attr_idx];
1200  if (using_core_types)
1201  sub_id = core_type_sub_ids.get_sub_id(hw_thread);
1202  else
1203  sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
1204  if (sub_id < offset ||
1205  (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1206  should_be_filtered = true;
1207  break;
1208  }
1209  } else {
1210  int num = hw_subset_item.num[0];
1211  int offset = hw_subset_item.offset[0];
1212  if (hw_thread.sub_ids[level] < offset ||
1213  (num != kmp_hw_subset_t::USE_ALL &&
1214  hw_thread.sub_ids[level] >= offset + num)) {
1215  should_be_filtered = true;
1216  break;
1217  }
1218  }
1219  }
1220  // Collect filtering information
1221  filtered[i] = should_be_filtered;
1222  if (should_be_filtered)
1223  num_filtered++;
1224  }
1225 
1226  // One last check that we shouldn't allow filtering entire machine
1227  if (num_filtered == num_hw_threads) {
1228  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
1229  __kmp_free(filtered);
1230  return false;
1231  }
1232 
1233  // Apply the filter
1234  int new_index = 0;
1235  for (int i = 0; i < num_hw_threads; ++i) {
1236  if (!filtered[i]) {
1237  if (i != new_index)
1238  hw_threads[new_index] = hw_threads[i];
1239  new_index++;
1240  } else {
1241 #if KMP_AFFINITY_SUPPORTED
1242  KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask);
1243 #endif
1244  __kmp_avail_proc--;
1245  }
1246  }
1247 
1248  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
1249  num_hw_threads = new_index;
1250 
1251  // Post hardware subset canonicalization
1252  _gather_enumeration_information();
1253  _discover_uniformity();
1254  _set_globals();
1255  _set_last_level_cache();
1256  __kmp_free(filtered);
1257  return true;
1258 }
1259 
1260 bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
1261  if (hw_level >= depth)
1262  return true;
1263  bool retval = true;
1264  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
1265  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
1266  for (int i = 0; i < (depth - hw_level); ++i) {
1267  if (t1.ids[i] != t2.ids[i])
1268  return false;
1269  }
1270  return retval;
1271 }
1272 
1274 
1275 #if KMP_AFFINITY_SUPPORTED
1276 class kmp_affinity_raii_t {
1277  kmp_affin_mask_t *mask;
1278  bool restored;
1279 
1280 public:
1281  kmp_affinity_raii_t() : restored(false) {
1282  KMP_CPU_ALLOC(mask);
1283  KMP_ASSERT(mask != NULL);
1284  __kmp_get_system_affinity(mask, TRUE);
1285  }
1286  void restore() {
1287  __kmp_set_system_affinity(mask, TRUE);
1288  KMP_CPU_FREE(mask);
1289  restored = true;
1290  }
1291  ~kmp_affinity_raii_t() {
1292  if (!restored) {
1293  __kmp_set_system_affinity(mask, TRUE);
1294  KMP_CPU_FREE(mask);
1295  }
1296  }
1297 };
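
// A minimal, hypothetical usage sketch (kept under #if 0, so it is not
// compiled): the guard snapshots the calling thread's affinity on
// construction, letting topology probing code re-bind the thread temporarily
// (here to the file-scope full mask defined further below) and rely on
// restore() or the destructor to put the original mask back.
#if 0
static void example_probe_under_full_mask() { // illustrative helper only
  kmp_affinity_raii_t previous_affinity; // saves the current mask
  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
  // ... inspect the machine while bound to the full mask ...
  previous_affinity.restore(); // or let the destructor restore it
}
#endif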
1298 
1299 bool KMPAffinity::picked_api = false;
1300 
1301 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
1302 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
1303 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
1304 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
1305 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
1306 void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
1307 
1308 void KMPAffinity::pick_api() {
1309  KMPAffinity *affinity_dispatch;
1310  if (picked_api)
1311  return;
1312 #if KMP_USE_HWLOC
1313  // Only use Hwloc if affinity isn't explicitly disabled and
1314  // user requests Hwloc topology method
1315  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
1316  __kmp_affinity.type != affinity_disabled) {
1317  affinity_dispatch = new KMPHwlocAffinity();
1318  } else
1319 #endif
1320  {
1321  affinity_dispatch = new KMPNativeAffinity();
1322  }
1323  __kmp_affinity_dispatch = affinity_dispatch;
1324  picked_api = true;
1325 }
1326 
1327 void KMPAffinity::destroy_api() {
1328  if (__kmp_affinity_dispatch != NULL) {
1329  delete __kmp_affinity_dispatch;
1330  __kmp_affinity_dispatch = NULL;
1331  picked_api = false;
1332  }
1333 }
1334 
1335 #define KMP_ADVANCE_SCAN(scan) \
1336  while (*scan != '\0') { \
1337  scan++; \
1338  }
1339 
1340 // Print the affinity mask to the character array in a pretty format.
1341 // The format is a comma separated list of non-negative integers or integer
1342 // ranges: e.g., 1,2,3-5,7,9-15
1343 // The format can also be the string "{<empty>}" if no bits are set in mask
1344 char *__kmp_affinity_print_mask(char *buf, int buf_len,
1345  kmp_affin_mask_t *mask) {
1346  int start = 0, finish = 0, previous = 0;
1347  bool first_range;
1348  KMP_ASSERT(buf);
1349  KMP_ASSERT(buf_len >= 40);
1350  KMP_ASSERT(mask);
1351  char *scan = buf;
1352  char *end = buf + buf_len - 1;
1353 
1354  // Check for empty set.
1355  if (mask->begin() == mask->end()) {
1356  KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
1357  KMP_ADVANCE_SCAN(scan);
1358  KMP_ASSERT(scan <= end);
1359  return buf;
1360  }
1361 
1362  first_range = true;
1363  start = mask->begin();
1364  while (1) {
1365  // Find next range
1366  // [start, previous] is inclusive range of contiguous bits in mask
1367  for (finish = mask->next(start), previous = start;
1368  finish == previous + 1 && finish != mask->end();
1369  finish = mask->next(finish)) {
1370  previous = finish;
1371  }
1372 
1373  // The first range does not need a comma printed before it, but the rest
1374  // of the ranges do need a comma beforehand
1375  if (!first_range) {
1376  KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
1377  KMP_ADVANCE_SCAN(scan);
1378  } else {
1379  first_range = false;
1380  }
1381  // Range with three or more contiguous bits in the affinity mask
1382  if (previous - start > 1) {
1383  KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
1384  } else {
1385  // Range with one or two contiguous bits in the affinity mask
1386  KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
1387  KMP_ADVANCE_SCAN(scan);
1388  if (previous - start > 0) {
1389  KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
1390  }
1391  }
1392  KMP_ADVANCE_SCAN(scan);
1393  // Start over with new start point
1394  start = finish;
1395  if (start == mask->end())
1396  break;
1397  // Check for overflow
1398  if (end - scan < 2)
1399  break;
1400  }
1401 
1402  // Check for overflow
1403  KMP_ASSERT(scan <= end);
1404  return buf;
1405 }
1406 #undef KMP_ADVANCE_SCAN
1407 
1408 // Print the affinity mask to the string buffer object in a pretty format
1409 // The format is a comma separated list of non-negative integers or integer
1410 // ranges: e.g., 1,2,3-5,7,9-15
1411 // The format can also be the string "{<empty>}" if no bits are set in mask
1412 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
1413  kmp_affin_mask_t *mask) {
1414  int start = 0, finish = 0, previous = 0;
1415  bool first_range;
1416  KMP_ASSERT(buf);
1417  KMP_ASSERT(mask);
1418 
1419  __kmp_str_buf_clear(buf);
1420 
1421  // Check for empty set.
1422  if (mask->begin() == mask->end()) {
1423  __kmp_str_buf_print(buf, "%s", "{<empty>}");
1424  return buf;
1425  }
1426 
1427  first_range = true;
1428  start = mask->begin();
1429  while (1) {
1430  // Find next range
1431  // [start, previous] is inclusive range of contiguous bits in mask
1432  for (finish = mask->next(start), previous = start;
1433  finish == previous + 1 && finish != mask->end();
1434  finish = mask->next(finish)) {
1435  previous = finish;
1436  }
1437 
1438  // The first range does not need a comma printed before it, but the rest
1439  // of the ranges do need a comma beforehand
1440  if (!first_range) {
1441  __kmp_str_buf_print(buf, "%s", ",");
1442  } else {
1443  first_range = false;
1444  }
1445  // Range with three or more contiguous bits in the affinity mask
1446  if (previous - start > 1) {
1447  __kmp_str_buf_print(buf, "%u-%u", start, previous);
1448  } else {
1449  // Range with one or two contiguous bits in the affinity mask
1450  __kmp_str_buf_print(buf, "%u", start);
1451  if (previous - start > 0) {
1452  __kmp_str_buf_print(buf, ",%u", previous);
1453  }
1454  }
1455  // Start over with new start point
1456  start = finish;
1457  if (start == mask->end())
1458  break;
1459  }
1460  return buf;
1461 }
1462 
1463 // Return (possibly empty) affinity mask representing the offline CPUs
1464 // Caller must free the mask
1465 kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
1466  kmp_affin_mask_t *offline;
1467  KMP_CPU_ALLOC(offline);
1468  KMP_CPU_ZERO(offline);
1469 #if KMP_OS_LINUX
1470  int n, begin_cpu, end_cpu;
1471  kmp_safe_raii_file_t offline_file;
1472  auto skip_ws = [](FILE *f) {
1473  int c;
1474  do {
1475  c = fgetc(f);
1476  } while (isspace(c));
1477  if (c != EOF)
1478  ungetc(c, f);
1479  };
1480  // File contains CSV of integer ranges representing the offline CPUs
1481  // e.g., 1,2,4-7,9,11-15
1482  int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
1483  if (status != 0)
1484  return offline;
1485  while (!feof(offline_file)) {
1486  skip_ws(offline_file);
1487  n = fscanf(offline_file, "%d", &begin_cpu);
1488  if (n != 1)
1489  break;
1490  skip_ws(offline_file);
1491  int c = fgetc(offline_file);
1492  if (c == EOF || c == ',') {
1493  // Just single CPU
1494  end_cpu = begin_cpu;
1495  } else if (c == '-') {
1496  // Range of CPUs
1497  skip_ws(offline_file);
1498  n = fscanf(offline_file, "%d", &end_cpu);
1499  if (n != 1)
1500  break;
1501  skip_ws(offline_file);
1502  c = fgetc(offline_file); // skip ','
1503  } else {
1504  // Syntax problem
1505  break;
1506  }
1507  // Ensure a valid range of CPUs
1508  if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
1509  end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
1510  continue;
1511  }
1512  // Insert [begin_cpu, end_cpu] into offline mask
1513  for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
1514  KMP_CPU_SET(cpu, offline);
1515  }
1516  }
1517 #endif
1518  return offline;
1519 }
1520 
1521 // Fill the mask with the entire machine's available procs and return their count
1522 int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
1523  int avail_proc = 0;
1524  KMP_CPU_ZERO(mask);
1525 
1526 #if KMP_GROUP_AFFINITY
1527 
1528  if (__kmp_num_proc_groups > 1) {
1529  int group;
1530  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
1531  for (group = 0; group < __kmp_num_proc_groups; group++) {
1532  int i;
1533  int num = __kmp_GetActiveProcessorCount(group);
1534  for (i = 0; i < num; i++) {
1535  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
1536  avail_proc++;
1537  }
1538  }
1539  } else
1540 
1541 #endif /* KMP_GROUP_AFFINITY */
1542 
1543  {
1544  int proc;
1545  kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
1546  for (proc = 0; proc < __kmp_xproc; proc++) {
1547  // Skip offline CPUs
1548  if (KMP_CPU_ISSET(proc, offline_cpus))
1549  continue;
1550  KMP_CPU_SET(proc, mask);
1551  avail_proc++;
1552  }
1553  KMP_CPU_FREE(offline_cpus);
1554  }
1555 
1556  return avail_proc;
1557 }
1558 
1559 // All of the __kmp_affinity_create_*_map() routines should allocate the
1560 // internal topology object and set the layer ids for it. Each routine
1561 // returns a boolean on whether it was successful at doing so.
1562 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1563 // The original mask is a subset of the full mask in a multiple processor group topology
1564 kmp_affin_mask_t *__kmp_affin_origMask = NULL;
1565 
1566 #if KMP_USE_HWLOC
1567 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
1568 #if HWLOC_API_VERSION >= 0x00020000
1569  return hwloc_obj_type_is_cache(obj->type);
1570 #else
1571  return obj->type == HWLOC_OBJ_CACHE;
1572 #endif
1573 }
1574 
1575 // Returns KMP_HW_* type derived from HWLOC_* type
1576 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1577 
1578  if (__kmp_hwloc_is_cache_type(obj)) {
1579  if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1580  return KMP_HW_UNKNOWN;
1581  switch (obj->attr->cache.depth) {
1582  case 1:
1583  return KMP_HW_L1;
1584  case 2:
1585 #if KMP_MIC_SUPPORTED
1586  if (__kmp_mic_type == mic3) {
1587  return KMP_HW_TILE;
1588  }
1589 #endif
1590  return KMP_HW_L2;
1591  case 3:
1592  return KMP_HW_L3;
1593  }
1594  return KMP_HW_UNKNOWN;
1595  }
1596 
1597  switch (obj->type) {
1598  case HWLOC_OBJ_PACKAGE:
1599  return KMP_HW_SOCKET;
1600  case HWLOC_OBJ_NUMANODE:
1601  return KMP_HW_NUMA;
1602  case HWLOC_OBJ_CORE:
1603  return KMP_HW_CORE;
1604  case HWLOC_OBJ_PU:
1605  return KMP_HW_THREAD;
1606  case HWLOC_OBJ_GROUP:
1607 #if HWLOC_API_VERSION >= 0x00020000
1608  if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1609  return KMP_HW_DIE;
1610  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1611  return KMP_HW_TILE;
1612  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1613  return KMP_HW_MODULE;
1614  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1615  return KMP_HW_PROC_GROUP;
1616 #endif
1617  return KMP_HW_UNKNOWN;
1618 #if HWLOC_API_VERSION >= 0x00020100
1619  case HWLOC_OBJ_DIE:
1620  return KMP_HW_DIE;
1621 #endif
1622  }
1623  return KMP_HW_UNKNOWN;
1624 }
1625 
1626 // Returns the number of objects of type 'type' below 'obj' within the topology
1627 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1628 // HWLOC_OBJ_PU, then this will return the number of PUs under the PACKAGE
1629 // object.
1630 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1631  hwloc_obj_type_t type) {
1632  int retval = 0;
1633  hwloc_obj_t first;
1634  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1635  obj->logical_index, type, 0);
1636  first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1637  obj->type, first) == obj;
1638  first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1639  first)) {
1640  ++retval;
1641  }
1642  return retval;
1643 }
1644 
1645 // This gets the sub_id for a lower object under a higher object in the
1646 // topology tree
1647 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1648  hwloc_obj_t lower) {
1649  hwloc_obj_t obj;
1650  hwloc_obj_type_t ltype = lower->type;
1651  int lindex = lower->logical_index - 1;
1652  int sub_id = 0;
1653  // Get the previous lower object
1654  obj = hwloc_get_obj_by_type(t, ltype, lindex);
1655  while (obj && lindex >= 0 &&
1656  hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1657  if (obj->userdata) {
1658  sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1659  break;
1660  }
1661  sub_id++;
1662  lindex--;
1663  obj = hwloc_get_obj_by_type(t, ltype, lindex);
1664  }
1665  // store sub_id + 1 so that 0 can be distinguished from NULL
1666  lower->userdata = RCAST(void *, sub_id + 1);
1667  return sub_id;
1668 }
1669 
1670 static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
1671  kmp_hw_t type;
1672  int hw_thread_index, sub_id;
1673  int depth;
1674  hwloc_obj_t pu, obj, root, prev;
1675  kmp_hw_t types[KMP_HW_LAST];
1676  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
1677 
1678  hwloc_topology_t tp = __kmp_hwloc_topology;
1679  *msg_id = kmp_i18n_null;
1680  if (__kmp_affinity.flags.verbose) {
1681  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
1682  }
1683 
1684  if (!KMP_AFFINITY_CAPABLE()) {
1685  // Hack to try and infer the machine topology using only the data
1686  // available from hwloc on the current thread, and __kmp_xproc.
1687  KMP_ASSERT(__kmp_affinity.type == affinity_none);
1688  // hwloc only guarantees existence of the PU object, so check PACKAGE and CORE
1689  hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
1690  if (o != NULL)
1691  nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
1692  else
1693  nCoresPerPkg = 1; // no PACKAGE found
1694  o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
1695  if (o != NULL)
1696  __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
1697  else
1698  __kmp_nThreadsPerCore = 1; // no CORE found
1699  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1700  if (nCoresPerPkg == 0)
1701  nCoresPerPkg = 1; // to prevent possible division by 0
1702  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1703  return true;
1704  }
1705 
1706 #if HWLOC_API_VERSION >= 0x00020400
1707  // Handle multiple types of cores if they exist on the system
1708  int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
1709 
1710  typedef struct kmp_hwloc_cpukinds_info_t {
1711  int efficiency;
1712  kmp_hw_core_type_t core_type;
1713  hwloc_bitmap_t mask;
1714  } kmp_hwloc_cpukinds_info_t;
1715  kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;
1716 
1717  if (nr_cpu_kinds > 0) {
1718  unsigned nr_infos;
1719  struct hwloc_info_s *infos;
1720  cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
1721  sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
1722  for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
1723  cpukinds[idx].efficiency = -1;
1724  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
1725  cpukinds[idx].mask = hwloc_bitmap_alloc();
1726  if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
1727  &cpukinds[idx].efficiency, &nr_infos, &infos,
1728  0) == 0) {
1729  for (unsigned i = 0; i < nr_infos; ++i) {
1730  if (__kmp_str_match("CoreType", 8, infos[i].name)) {
1731 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1732  if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
1733  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
1734  break;
1735  } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
1736  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
1737  break;
1738  }
1739 #endif
1740  }
1741  }
1742  }
1743  }
1744  }
1745 #endif
1746 
1747  root = hwloc_get_root_obj(tp);
1748 
1749  // Figure out the depth and types in the topology
1750  depth = 0;
1751  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1752  KMP_ASSERT(pu);
1753  obj = pu;
1754  types[depth] = KMP_HW_THREAD;
1755  hwloc_types[depth] = obj->type;
1756  depth++;
1757  while (obj != root && obj != NULL) {
1758  obj = obj->parent;
1759 #if HWLOC_API_VERSION >= 0x00020000
1760  if (obj->memory_arity) {
1761  hwloc_obj_t memory;
1762  for (memory = obj->memory_first_child; memory;
1763  memory = hwloc_get_next_child(tp, obj, memory)) {
1764  if (memory->type == HWLOC_OBJ_NUMANODE)
1765  break;
1766  }
1767  if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1768  types[depth] = KMP_HW_NUMA;
1769  hwloc_types[depth] = memory->type;
1770  depth++;
1771  }
1772  }
1773 #endif
1774  type = __kmp_hwloc_type_2_topology_type(obj);
1775  if (type != KMP_HW_UNKNOWN) {
1776  types[depth] = type;
1777  hwloc_types[depth] = obj->type;
1778  depth++;
1779  }
1780  }
1781  KMP_ASSERT(depth > 0);
1782 
1783  // Get the order for the types correct
1784  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1785  hwloc_obj_type_t hwloc_temp = hwloc_types[i];
1786  kmp_hw_t temp = types[i];
1787  types[i] = types[j];
1788  types[j] = temp;
1789  hwloc_types[i] = hwloc_types[j];
1790  hwloc_types[j] = hwloc_temp;
1791  }
1792 
1793  // Allocate the data structure to be returned.
1794  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1795 
1796  hw_thread_index = 0;
1797  pu = NULL;
1798  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
1799  int index = depth - 1;
1800  bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1801  kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1802  if (included) {
1803  hw_thread.clear();
1804  hw_thread.ids[index] = pu->logical_index;
1805  hw_thread.os_id = pu->os_index;
1806  // If multiple core types, then set that attribute for the hardware thread
1807 #if HWLOC_API_VERSION >= 0x00020400
1808  if (cpukinds) {
1809  int cpukind_index = -1;
1810  for (int i = 0; i < nr_cpu_kinds; ++i) {
1811  if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
1812  cpukind_index = i;
1813  break;
1814  }
1815  }
1816  if (cpukind_index >= 0) {
1817  hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
1818  hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
1819  }
1820  }
1821 #endif
1822  index--;
1823  }
1824  obj = pu;
1825  prev = obj;
1826  while (obj != root && obj != NULL) {
1827  obj = obj->parent;
1828 #if HWLOC_API_VERSION >= 0x00020000
1829  // NUMA Nodes are handled differently since they are not within the
1830  // parent/child structure anymore. They are separate children
1831  // of obj (memory_first_child points to first memory child)
1832  if (obj->memory_arity) {
1833  hwloc_obj_t memory;
1834  for (memory = obj->memory_first_child; memory;
1835  memory = hwloc_get_next_child(tp, obj, memory)) {
1836  if (memory->type == HWLOC_OBJ_NUMANODE)
1837  break;
1838  }
1839  if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1840  sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
1841  if (included) {
1842  hw_thread.ids[index] = memory->logical_index;
1843  hw_thread.ids[index + 1] = sub_id;
1844  index--;
1845  }
1846  prev = memory;
1847  }
1848  prev = obj;
1849  }
1850 #endif
1851  type = __kmp_hwloc_type_2_topology_type(obj);
1852  if (type != KMP_HW_UNKNOWN) {
1853  sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
1854  if (included) {
1855  hw_thread.ids[index] = obj->logical_index;
1856  hw_thread.ids[index + 1] = sub_id;
1857  index--;
1858  }
1859  prev = obj;
1860  }
1861  }
1862  if (included)
1863  hw_thread_index++;
1864  }
1865 
1866 #if HWLOC_API_VERSION >= 0x00020400
1867  // Free the core types information
1868  if (cpukinds) {
1869  for (int idx = 0; idx < nr_cpu_kinds; ++idx)
1870  hwloc_bitmap_free(cpukinds[idx].mask);
1871  __kmp_free(cpukinds);
1872  }
1873 #endif
1874  __kmp_topology->sort_ids();
1875  return true;
1876 }
1877 #endif // KMP_USE_HWLOC
1878 
1879 // If we don't know how to retrieve the machine's processor topology, or
1880 // encounter an error in doing so, this routine is called to form a "flat"
1881 // mapping of os thread id's <-> processor id's.
1882 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1883  *msg_id = kmp_i18n_null;
1884  int depth = 3;
1885  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1886 
1887  if (__kmp_affinity.flags.verbose) {
1888  KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1889  }
1890 
1891  // Even if __kmp_affinity.type == affinity_none, this routine might still
1892  // be called to set __kmp_ncores, as well as
1893  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1894  if (!KMP_AFFINITY_CAPABLE()) {
1895  KMP_ASSERT(__kmp_affinity.type == affinity_none);
1896  __kmp_ncores = nPackages = __kmp_xproc;
1897  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1898  return true;
1899  }
1900 
1901  // When affinity is off, this routine will still be called to set
1902  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1903  // Make sure all these vars are set correctly, and return now if affinity is
1904  // not enabled.
1905  __kmp_ncores = nPackages = __kmp_avail_proc;
1906  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1907 
1908  // Construct the data structure to be returned.
1909  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1910  int avail_ct = 0;
1911  int i;
1912  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1913  // Skip this proc if it is not included in the machine model.
1914  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1915  continue;
1916  }
1917  kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
1918  hw_thread.clear();
1919  hw_thread.os_id = i;
1920  hw_thread.ids[0] = i;
1921  hw_thread.ids[1] = 0;
1922  hw_thread.ids[2] = 0;
1923  avail_ct++;
1924  }
1925  if (__kmp_affinity.flags.verbose) {
1926  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
1927  }
1928  return true;
1929 }
1930 
1931 #if KMP_GROUP_AFFINITY
1932 // If multiple Windows* OS processor groups exist, we can create a 2-level
1933 // topology map with the groups at level 0 and the individual procs at level 1.
1934 // This facilitates letting the threads float among all procs in a group,
1935 // if granularity=group (the default when there are multiple groups).
1936 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
1937  *msg_id = kmp_i18n_null;
1938  int depth = 3;
1939  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
1940  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
1941 
1942  if (__kmp_affinity.flags.verbose) {
1943  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
1944  }
1945 
1946  // If we aren't affinity capable, then use flat topology
1947  if (!KMP_AFFINITY_CAPABLE()) {
1948  KMP_ASSERT(__kmp_affinity.type == affinity_none);
1949  nPackages = __kmp_num_proc_groups;
1950  __kmp_nThreadsPerCore = 1;
1951  __kmp_ncores = __kmp_xproc;
1952  nCoresPerPkg = __kmp_ncores / nPackages;
1953  return true;
1954  }
1955 
1956  // Construct the data structure to be returned.
1957  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1958  int avail_ct = 0;
1959  int i;
1960  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1961  // Skip this proc if it is not included in the machine model.
1962  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1963  continue;
1964  }
1965  kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
1966  hw_thread.clear();
1967  hw_thread.os_id = i;
1968  hw_thread.ids[0] = i / BITS_PER_GROUP;
1969  hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
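// For example (illustrative), with a 64-bit DWORD_PTR each group holds 64
// procs, so OS proc 70 maps to processor group 1 with in-group id 6.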
1970  }
1971  return true;
1972 }
1973 #endif /* KMP_GROUP_AFFINITY */
1974 
1975 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1976 
1977 template <kmp_uint32 LSB, kmp_uint32 MSB>
1978 static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
1979  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
1980  const kmp_uint32 SHIFT_RIGHT = LSB;
1981  kmp_uint32 retval = v;
1982  retval <<= SHIFT_LEFT;
1983  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
1984  return retval;
1985 }
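// Worked example (illustrative): with v = 0x12345678,
//   __kmp_extract_bits<24, 31>(v) == 0x12 (top byte) and
//   __kmp_extract_bits<0, 3>(v) == 0x8 (low nibble);
// the left shift discards the bits above MSB, the right shift discards the
// bits below LSB.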
1986 
1987 static int __kmp_cpuid_mask_width(int count) {
1988  int r = 0;
1989 
1990  while ((1 << r) < count)
1991  ++r;
1992  return r;
1993 }
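// Worked example (illustrative): the result is the smallest r with
// (1 << r) >= count, i.e. the number of mask bits needed to encode 'count'
// distinct ids: __kmp_cpuid_mask_width(2) == 1, (3) == 2, (4) == 2, (6) == 3.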
1994 
1995 class apicThreadInfo {
1996 public:
1997  unsigned osId; // param to __kmp_affinity_bind_thread
1998  unsigned apicId; // from cpuid after binding
1999  unsigned maxCoresPerPkg; // ""
2000  unsigned maxThreadsPerPkg; // ""
2001  unsigned pkgId; // inferred from above values
2002  unsigned coreId; // ""
2003  unsigned threadId; // ""
2004 };
2005 
2006 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
2007  const void *b) {
2008  const apicThreadInfo *aa = (const apicThreadInfo *)a;
2009  const apicThreadInfo *bb = (const apicThreadInfo *)b;
2010  if (aa->pkgId < bb->pkgId)
2011  return -1;
2012  if (aa->pkgId > bb->pkgId)
2013  return 1;
2014  if (aa->coreId < bb->coreId)
2015  return -1;
2016  if (aa->coreId > bb->coreId)
2017  return 1;
2018  if (aa->threadId < bb->threadId)
2019  return -1;
2020  if (aa->threadId > bb->threadId)
2021  return 1;
2022  return 0;
2023 }
2024 
2025 class kmp_cache_info_t {
2026 public:
2027  struct info_t {
2028  unsigned level, mask;
2029  };
2030  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
2031  size_t get_depth() const { return depth; }
2032  info_t &operator[](size_t index) { return table[index]; }
2033  const info_t &operator[](size_t index) const { return table[index]; }
2034 
2035  static kmp_hw_t get_topology_type(unsigned level) {
2036  KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
2037  switch (level) {
2038  case 1:
2039  return KMP_HW_L1;
2040  case 2:
2041  return KMP_HW_L2;
2042  case 3:
2043  return KMP_HW_L3;
2044  }
2045  return KMP_HW_UNKNOWN;
2046  }
2047 
2048 private:
2049  static const int MAX_CACHE_LEVEL = 3;
2050 
2051  size_t depth;
2052  info_t table[MAX_CACHE_LEVEL];
2053 
2054  void get_leaf4_levels() {
2055  unsigned level = 0;
2056  while (depth < MAX_CACHE_LEVEL) {
2057  unsigned cache_type, max_threads_sharing;
2058  unsigned cache_level, cache_mask_width;
2059  kmp_cpuid buf2;
2060  __kmp_x86_cpuid(4, level, &buf2);
2061  cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
2062  if (!cache_type)
2063  break;
2064  // Skip instruction caches
2065  if (cache_type == 2) {
2066  level++;
2067  continue;
2068  }
2069  max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
2070  cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
2071  cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
2072  table[depth].level = cache_level;
2073  table[depth].mask = ((-1) << cache_mask_width);
2074  depth++;
2075  level++;
2076  }
2077  }
2078 };
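// Illustrative usage sketch (not part of the original source): walking the
// data-cache levels detected via cpuid leaf 4, as
// __kmp_affinity_create_x2apicid_map() does further below.
//
//   kmp_cache_info_t cache_info;
//   for (size_t i = 0; i < cache_info.get_depth(); ++i) {
//     unsigned level = cache_info[i].level; // 1, 2 or 3
//     unsigned mask = cache_info[i].mask;   // APIC id bits shared by the cache
//     kmp_hw_t type = kmp_cache_info_t::get_topology_type(level);
//   }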
2079 
2080 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
2081 // an algorithm which cycles through the available os threads, setting
2082 // the current thread's affinity mask to that thread, and then retrieves
2083 // the Apic Id for each thread context using the cpuid instruction.
2084 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
2085  kmp_cpuid buf;
2086  *msg_id = kmp_i18n_null;
2087 
2088  if (__kmp_affinity.flags.verbose) {
2089  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
2090  }
2091 
2092  // Check if cpuid leaf 4 is supported.
2093  __kmp_x86_cpuid(0, 0, &buf);
2094  if (buf.eax < 4) {
2095  *msg_id = kmp_i18n_str_NoLeaf4Support;
2096  return false;
2097  }
2098 
2099  // The algorithm used starts by setting the affinity to each available thread
2100  // and retrieving info from the cpuid instruction, so if we are not capable of
2101  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
2102  // need to do something else - use the defaults that we calculated from
2103  // issuing cpuid without binding to each proc.
2104  if (!KMP_AFFINITY_CAPABLE()) {
2105  // Hack to try and infer the machine topology using only the data
2106  // available from cpuid on the current thread, and __kmp_xproc.
2107  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2108 
2109  // Get an upper bound on the number of threads per package using cpuid(1).
2110  // On some OS/chip combinations where HT is supported by the chip but is
2111  // disabled, this value will be 2 on a single core chip. Usually, it will be
2112  // 2 if HT is enabled and 1 if HT is disabled.
2113  __kmp_x86_cpuid(1, 0, &buf);
2114  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2115  if (maxThreadsPerPkg == 0) {
2116  maxThreadsPerPkg = 1;
2117  }
2118 
2119  // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
2120  // value.
2121  //
2122  // The author of cpu_count.cpp treated this as only an upper bound on the
2123  // number of cores, but I haven't seen any cases where it was greater than
2124  // the actual number of cores, so we will treat it as exact in this block of
2125  // code.
2126  //
2127  // First, we need to check if cpuid(4) is supported on this chip. To see if
2128  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
2129  // greater.
2130  __kmp_x86_cpuid(0, 0, &buf);
2131  if (buf.eax >= 4) {
2132  __kmp_x86_cpuid(4, 0, &buf);
2133  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2134  } else {
2135  nCoresPerPkg = 1;
2136  }
2137 
2138  // There is no way to reliably tell if HT is enabled without issuing the
2139  // cpuid instruction from every thread and correlating the cpuid info, so
2140  // if the machine is not affinity capable, we assume that HT is off. We have
2141  // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
2142  // does not support HT.
2143  //
2144  // - Older OSes are usually found on machines with older chips, which do not
2145  // support HT.
2146  // - The performance penalty for mistakenly identifying a machine as HT when
2147  // it isn't (which results in blocktime being incorrectly set to 0) is
2148  // greater than the penalty for mistakenly identifying a machine as
2149  // being 1 thread/core when it is really HT enabled (which results in
2150  // blocktime being incorrectly set to a positive value).
2151  __kmp_ncores = __kmp_xproc;
2152  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2153  __kmp_nThreadsPerCore = 1;
2154  return true;
2155  }
2156 
2157  // From here on, we can assume that it is safe to call
2158  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2159  // __kmp_affinity.type = affinity_none.
2160 
2161  // Save the affinity mask for the current thread.
2162  kmp_affinity_raii_t previous_affinity;
2163 
2164  // Run through each of the available contexts, binding the current thread
2165  // to it, and obtaining the pertinent information using the cpuid instr.
2166  //
2167  // The relevant information is:
2168  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
2169  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
2170  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
2171  // of this field determines the width of the core# + thread# fields in the
2172  // Apic Id. It is also an upper bound on the number of threads per
2173  // package, but it has been verified that situations happen where it is not
2174  // exact. In particular, on certain OS/chip combinations where Intel(R)
2175  // Hyper-Threading Technology is supported by the chip but has been
2176  // disabled, the value of this field will be 2 (for a single core chip).
2177  // On other OS/chip combinations supporting Intel(R) Hyper-Threading
2178  // Technology, the value of this field will be 1 when Intel(R)
2179  // Hyper-Threading Technology is disabled and 2 when it is enabled.
2180  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
2181  // of this field (+1) determines the width of the core# field in the Apic
2182  // Id. The comments in "cpucount.cpp" say that this value is an upper
2183  // bound, but the IA-32 architecture manual says that it is exactly the
2184  // number of cores per package, and I haven't seen any case where it
2185  // wasn't.
2186  //
2187  // From this information, deduce the package Id, core Id, and thread Id,
2188  // and set the corresponding fields in the apicThreadInfo struct.
2189  unsigned i;
2190  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
2191  __kmp_avail_proc * sizeof(apicThreadInfo));
2192  unsigned nApics = 0;
2193  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2194  // Skip this proc if it is not included in the machine model.
2195  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2196  continue;
2197  }
2198  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
2199 
2200  __kmp_affinity_dispatch->bind_thread(i);
2201  threadInfo[nApics].osId = i;
2202 
2203  // The apic id and max threads per pkg come from cpuid(1).
2204  __kmp_x86_cpuid(1, 0, &buf);
2205  if (((buf.edx >> 9) & 1) == 0) {
2206  __kmp_free(threadInfo);
2207  *msg_id = kmp_i18n_str_ApicNotPresent;
2208  return false;
2209  }
2210  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
2211  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2212  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
2213  threadInfo[nApics].maxThreadsPerPkg = 1;
2214  }
2215 
2216  // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
2217  // value.
2218  //
2219  // First, we need to check if cpuid(4) is supported on this chip. To see if
2220  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
2221  // or greater.
2222  __kmp_x86_cpuid(0, 0, &buf);
2223  if (buf.eax >= 4) {
2224  __kmp_x86_cpuid(4, 0, &buf);
2225  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2226  } else {
2227  threadInfo[nApics].maxCoresPerPkg = 1;
2228  }
2229 
2230  // Infer the pkgId / coreId / threadId using only the info obtained locally.
2231  int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
2232  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
2233 
2234  int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
2235  int widthT = widthCT - widthC;
2236  if (widthT < 0) {
2237  // I've never seen this one happen, but I suppose it could, if the cpuid
2238  // instruction on a chip was really screwed up. Make sure to restore the
2239  // affinity mask before the tail call.
2240  __kmp_free(threadInfo);
2241  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2242  return false;
2243  }
2244 
2245  int maskC = (1 << widthC) - 1;
2246  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
2247 
2248  int maskT = (1 << widthT) - 1;
2249  threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
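// Worked example (illustrative): with maxThreadsPerPkg == 8 (widthCT == 3)
// and maxCoresPerPkg == 4 (widthC == 2, hence widthT == 1), an apicId of
// 0b1011 decodes as pkgId = 0b1, coreId = 0b01 and threadId = 0b1.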
2250 
2251  nApics++;
2252  }
2253 
2254  // We've collected all the info we need.
2255  // Restore the old affinity mask for this thread.
2256  previous_affinity.restore();
2257 
2258  // Sort the threadInfo table by physical Id.
2259  qsort(threadInfo, nApics, sizeof(*threadInfo),
2260  __kmp_affinity_cmp_apicThreadInfo_phys_id);
2261 
2262  // The table is now sorted by pkgId / coreId / threadId, but we really don't
2263  // know the radix of any of the fields. pkgId's may be sparsely assigned among
2264  // the chips on a system. Although coreId's are usually assigned
2265  // [0 .. coresPerPkg-1] and threadId's are usually assigned
2266  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2267  //
2268  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2269  // total # packages) are at this point - we want to determine that now. We
2270  // only have an upper bound on the first two figures.
2271  //
2272  // We also perform a consistency check at this point: the values returned by
2273  // the cpuid instruction for any thread bound to a given package had better
2274  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
2275  nPackages = 1;
2276  nCoresPerPkg = 1;
2277  __kmp_nThreadsPerCore = 1;
2278  unsigned nCores = 1;
2279 
2280  unsigned pkgCt = 1; // to determine radii
2281  unsigned lastPkgId = threadInfo[0].pkgId;
2282  unsigned coreCt = 1;
2283  unsigned lastCoreId = threadInfo[0].coreId;
2284  unsigned threadCt = 1;
2285  unsigned lastThreadId = threadInfo[0].threadId;
2286 
2287  // intra-pkg consistency checks
2288  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
2289  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
2290 
2291  for (i = 1; i < nApics; i++) {
2292  if (threadInfo[i].pkgId != lastPkgId) {
2293  nCores++;
2294  pkgCt++;
2295  lastPkgId = threadInfo[i].pkgId;
2296  if ((int)coreCt > nCoresPerPkg)
2297  nCoresPerPkg = coreCt;
2298  coreCt = 1;
2299  lastCoreId = threadInfo[i].coreId;
2300  if ((int)threadCt > __kmp_nThreadsPerCore)
2301  __kmp_nThreadsPerCore = threadCt;
2302  threadCt = 1;
2303  lastThreadId = threadInfo[i].threadId;
2304 
2305  // This is a different package, so go on to the next iteration without
2306  // doing any consistency checks. Reset the consistency check vars, though.
2307  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
2308  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
2309  continue;
2310  }
2311 
2312  if (threadInfo[i].coreId != lastCoreId) {
2313  nCores++;
2314  coreCt++;
2315  lastCoreId = threadInfo[i].coreId;
2316  if ((int)threadCt > __kmp_nThreadsPerCore)
2317  __kmp_nThreadsPerCore = threadCt;
2318  threadCt = 1;
2319  lastThreadId = threadInfo[i].threadId;
2320  } else if (threadInfo[i].threadId != lastThreadId) {
2321  threadCt++;
2322  lastThreadId = threadInfo[i].threadId;
2323  } else {
2324  __kmp_free(threadInfo);
2325  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2326  return false;
2327  }
2328 
2329  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
2330  // fields agree between all the threads bound to a given package.
2331  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
2332  (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
2333  __kmp_free(threadInfo);
2334  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
2335  return false;
2336  }
2337  }
2338  // When affinity is off, this routine will still be called to set
2339  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2340  // Make sure all these vars are set correctly
2341  nPackages = pkgCt;
2342  if ((int)coreCt > nCoresPerPkg)
2343  nCoresPerPkg = coreCt;
2344  if ((int)threadCt > __kmp_nThreadsPerCore)
2345  __kmp_nThreadsPerCore = threadCt;
2346  __kmp_ncores = nCores;
2347  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
2348 
2349  // Now that we've determined the number of packages, the number of cores per
2350  // package, and the number of threads per core, we can construct the data
2351  // structure that is to be returned.
2352  int idx = 0;
2353  int pkgLevel = 0;
2354  int coreLevel = 1;
2355  int threadLevel = 2;
2356  //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2357  int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
2358  kmp_hw_t types[3];
2359  if (pkgLevel >= 0)
2360  types[idx++] = KMP_HW_SOCKET;
2361  if (coreLevel >= 0)
2362  types[idx++] = KMP_HW_CORE;
2363  if (threadLevel >= 0)
2364  types[idx++] = KMP_HW_THREAD;
2365 
2366  KMP_ASSERT(depth > 0);
2367  __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
2368 
2369  for (i = 0; i < nApics; ++i) {
2370  idx = 0;
2371  unsigned os = threadInfo[i].osId;
2372  kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2373  hw_thread.clear();
2374 
2375  if (pkgLevel >= 0) {
2376  hw_thread.ids[idx++] = threadInfo[i].pkgId;
2377  }
2378  if (coreLevel >= 0) {
2379  hw_thread.ids[idx++] = threadInfo[i].coreId;
2380  }
2381  if (threadLevel >= 0) {
2382  hw_thread.ids[idx++] = threadInfo[i].threadId;
2383  }
2384  hw_thread.os_id = os;
2385  }
2386 
2387  __kmp_free(threadInfo);
2388  __kmp_topology->sort_ids();
2389  if (!__kmp_topology->check_ids()) {
2390  kmp_topology_t::deallocate(__kmp_topology);
2391  __kmp_topology = nullptr;
2392  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2393  return false;
2394  }
2395  return true;
2396 }
2397 
2398 // Hybrid cpu detection using CPUID.1A
2399 // Thread should be pinned to processor already
2400 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
2401  unsigned *native_model_id) {
2402  kmp_cpuid buf;
2403  __kmp_x86_cpuid(0x1a, 0, &buf);
2404  *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
2405  switch (*type) {
2406  case KMP_HW_CORE_TYPE_ATOM:
2407  *efficiency = 0;
2408  break;
2409  case KMP_HW_CORE_TYPE_CORE:
2410  *efficiency = 1;
2411  break;
2412  default:
2413  *efficiency = 0;
2414  }
2415  *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
2416 }
2417 
2418 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
2419 // architectures support a newer interface for specifying the x2APIC Ids,
2420 // based on CPUID.B or CPUID.1F
2421 /*
2422  * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
2423  Bits Bits Bits Bits
2424  31-16 15-8 7-4 4-0
2425 ---+-----------+--------------+-------------+-----------------+
2426 EAX| reserved | reserved | reserved | Bits to Shift |
2427 ---+-----------|--------------+-------------+-----------------|
2428 EBX| reserved | Num logical processors at level (16 bits) |
2429 ---+-----------|--------------+-------------------------------|
2430 ECX| reserved | Level Type | Level Number (8 bits) |
2431 ---+-----------+--------------+-------------------------------|
2432 EDX| X2APIC ID (32 bits) |
2433 ---+----------------------------------------------------------+
2434 */
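// For instance (illustrative), a sub-leaf returning EAX = 0x1, EBX = 0x2 and
// ECX = 0x0100 describes the SMT level: level type 1 (SMT), 2 logical
// processors at this level, and a 1-bit shift to strip the thread id from the
// x2APIC id.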
2435 
2436 enum {
2437  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
2438  INTEL_LEVEL_TYPE_SMT = 1,
2439  INTEL_LEVEL_TYPE_CORE = 2,
2440  INTEL_LEVEL_TYPE_MODULE = 3,
2441  INTEL_LEVEL_TYPE_TILE = 4,
2442  INTEL_LEVEL_TYPE_DIE = 5,
2443  INTEL_LEVEL_TYPE_LAST = 6,
2444 };
2445 
2446 struct cpuid_level_info_t {
2447  unsigned level_type, mask, mask_width, nitems, cache_mask;
2448 };
2449 
2450 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
2451  switch (intel_type) {
2452  case INTEL_LEVEL_TYPE_INVALID:
2453  return KMP_HW_SOCKET;
2454  case INTEL_LEVEL_TYPE_SMT:
2455  return KMP_HW_THREAD;
2456  case INTEL_LEVEL_TYPE_CORE:
2457  return KMP_HW_CORE;
2458  case INTEL_LEVEL_TYPE_TILE:
2459  return KMP_HW_TILE;
2460  case INTEL_LEVEL_TYPE_MODULE:
2461  return KMP_HW_MODULE;
2462  case INTEL_LEVEL_TYPE_DIE:
2463  return KMP_HW_DIE;
2464  }
2465  return KMP_HW_UNKNOWN;
2466 }
2467 
2468 // This function takes the topology leaf, a levels array to store the levels
2469 // detected and a bitmap of the known levels.
2470 // Returns the number of levels in the topology
2471 static unsigned
2472 __kmp_x2apicid_get_levels(int leaf,
2473  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
2474  kmp_uint64 known_levels) {
2475  unsigned level, levels_index;
2476  unsigned level_type, mask_width, nitems;
2477  kmp_cpuid buf;
2478 
2479  // The algorithm below lets each known topology layer absorb any unknown
2480  // topology layers directly above it.
2481  // e.g., suppose the layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X>, <Y>,
2482  // <Z> are unknown topology layers. Then SMT takes on the characteristics of
2483  // (SMT x <X>) and CORE takes on the characteristics of (CORE x <Y> x <Z>).
2484  // This eliminates unknown portions of the topology while still keeping the
2485  // correct structure.
2486  level = levels_index = 0;
2487  do {
2488  __kmp_x86_cpuid(leaf, level, &buf);
2489  level_type = __kmp_extract_bits<8, 15>(buf.ecx);
2490  mask_width = __kmp_extract_bits<0, 4>(buf.eax);
2491  nitems = __kmp_extract_bits<0, 15>(buf.ebx);
2492  if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
2493  return 0;
2494 
2495  if (known_levels & (1ull << level_type)) {
2496  // Add a new level to the topology
2497  KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
2498  levels[levels_index].level_type = level_type;
2499  levels[levels_index].mask_width = mask_width;
2500  levels[levels_index].nitems = nitems;
2501  levels_index++;
2502  } else {
2503  // If it is an unknown level, then logically move the previous layer up
2504  if (levels_index > 0) {
2505  levels[levels_index - 1].mask_width = mask_width;
2506  levels[levels_index - 1].nitems = nitems;
2507  }
2508  }
2509  level++;
2510  } while (level_type != INTEL_LEVEL_TYPE_INVALID);
2511 
2512  // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
2513  if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
2514  return 0;
2515 
2516  // Set the masks to & with apicid
2517  for (unsigned i = 0; i < levels_index; ++i) {
2518  if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
2519  levels[i].mask = ~((-1) << levels[i].mask_width);
2520  levels[i].cache_mask = (-1) << levels[i].mask_width;
2521  for (unsigned j = 0; j < i; ++j)
2522  levels[i].mask ^= levels[j].mask;
2523  } else {
2524  KMP_DEBUG_ASSERT(i > 0);
2525  levels[i].mask = (-1) << levels[i - 1].mask_width;
2526  levels[i].cache_mask = 0;
2527  }
2528  }
2529  return levels_index;
2530 }
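// Worked example (illustrative): on a package with 4 cores x 2 hardware
// threads, the SMT sub-leaf typically reports mask_width 1 and the CORE
// sub-leaf mask_width 3, so the loop above yields
//   levels[0] = SMT,     mask 0x1   (thread id bits)
//   levels[1] = CORE,    mask 0x6   (core id bits with the SMT bits removed)
//   levels[2] = INVALID, mask ~0x7  (remaining bits, i.e. the package id)
// and the map-building code below recovers each id as (apic_id & mask)
// shifted right by the previous level's mask_width.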
2531 
2532 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
2533 
2534  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
2535  kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
2536  unsigned levels_index;
2537  kmp_cpuid buf;
2538  kmp_uint64 known_levels;
2539  int topology_leaf, highest_leaf, apic_id;
2540  int num_leaves;
2541  static int leaves[] = {0, 0};
2542 
2543  kmp_i18n_id_t leaf_message_id;
2544 
2545  KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
2546 
2547  *msg_id = kmp_i18n_null;
2548  if (__kmp_affinity.flags.verbose) {
2549  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
2550  }
2551 
2552  // Figure out the known topology levels
2553  known_levels = 0ull;
2554  for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
2555  if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
2556  known_levels |= (1ull << i);
2557  }
2558  }
2559 
2560  // Get the highest cpuid leaf supported
2561  __kmp_x86_cpuid(0, 0, &buf);
2562  highest_leaf = buf.eax;
2563 
2564  // If a specific topology method was requested, only allow that specific leaf
2565  // otherwise, try both leaves 31 and 11 in that order
2566  num_leaves = 0;
2567  if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
2568  num_leaves = 1;
2569  leaves[0] = 11;
2570  leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2571  } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
2572  num_leaves = 1;
2573  leaves[0] = 31;
2574  leaf_message_id = kmp_i18n_str_NoLeaf31Support;
2575  } else {
2576  num_leaves = 2;
2577  leaves[0] = 31;
2578  leaves[1] = 11;
2579  leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2580  }
2581 
2582  // Check to see if cpuid leaf 31 or 11 is supported.
2583  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2584  topology_leaf = -1;
2585  for (int i = 0; i < num_leaves; ++i) {
2586  int leaf = leaves[i];
2587  if (highest_leaf < leaf)
2588  continue;
2589  __kmp_x86_cpuid(leaf, 0, &buf);
2590  if (buf.ebx == 0)
2591  continue;
2592  topology_leaf = leaf;
2593  levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
2594  if (levels_index == 0)
2595  continue;
2596  break;
2597  }
2598  if (topology_leaf == -1 || levels_index == 0) {
2599  *msg_id = leaf_message_id;
2600  return false;
2601  }
2602  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
2603 
2604  // The algorithm used starts by setting the affinity to each available thread
2605  // and retrieving info from the cpuid instruction, so if we are not capable of
2606  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
2607  // we need to do something else - use the defaults that we calculated from
2608  // issuing cpuid without binding to each proc.
2609  if (!KMP_AFFINITY_CAPABLE()) {
2610  // Hack to try and infer the machine topology using only the data
2611  // available from cpuid on the current thread, and __kmp_xproc.
2612  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2613  for (unsigned i = 0; i < levels_index; ++i) {
2614  if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
2615  __kmp_nThreadsPerCore = levels[i].nitems;
2616  } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
2617  nCoresPerPkg = levels[i].nitems;
2618  }
2619  }
2620  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
2621  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2622  return true;
2623  }
2624 
2625  // Allocate the data structure to be returned.
2626  int depth = levels_index;
2627  for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2628  types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
2629  __kmp_topology =
2630  kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
2631 
2632  // Insert equivalent cache types if they exist
2633  kmp_cache_info_t cache_info;
2634  for (size_t i = 0; i < cache_info.get_depth(); ++i) {
2635  const kmp_cache_info_t::info_t &info = cache_info[i];
2636  unsigned cache_mask = info.mask;
2637  unsigned cache_level = info.level;
2638  for (unsigned j = 0; j < levels_index; ++j) {
2639  unsigned hw_cache_mask = levels[j].cache_mask;
2640  kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2641  if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2642  kmp_hw_t type =
2643  __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2644  __kmp_topology->set_equivalent_type(cache_type, type);
2645  }
2646  }
2647  }
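// Worked example (illustrative): an L2 cache shared by the two hardware
// threads of a core has mask (-1) << 1 from leaf 4, which matches the SMT
// level's cache_mask computed above, so KMP_HW_L2 is recorded as equivalent
// to the next level up, KMP_HW_CORE.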
2648 
2649  // From here on, we can assume that it is safe to call
2650  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2651  // __kmp_affinity.type = affinity_none.
2652 
2653  // Save the affinity mask for the current thread.
2654  kmp_affinity_raii_t previous_affinity;
2655 
2656  // Run through each of the available contexts, binding the current thread
2657  // to it, and obtaining the pertinent information using the cpuid instr.
2658  unsigned int proc;
2659  int hw_thread_index = 0;
2660  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2661  cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2662  unsigned my_levels_index;
2663 
2664  // Skip this proc if it is not included in the machine model.
2665  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2666  continue;
2667  }
2668  KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2669 
2670  __kmp_affinity_dispatch->bind_thread(proc);
2671 
2672  // New algorithm
2673  __kmp_x86_cpuid(topology_leaf, 0, &buf);
2674  apic_id = buf.edx;
2675  kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2676  my_levels_index =
2677  __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2678  if (my_levels_index == 0 || my_levels_index != levels_index) {
2679  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2680  return false;
2681  }
2682  hw_thread.clear();
2683  hw_thread.os_id = proc;
2684  // Put in topology information
2685  for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2686  hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2687  if (j > 0) {
2688  hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2689  }
2690  }
2691  // Hybrid information
2692  if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
2693  kmp_hw_core_type_t type;
2694  unsigned native_model_id;
2695  int efficiency;
2696  __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
2697  hw_thread.attrs.set_core_type(type);
2698  hw_thread.attrs.set_core_eff(efficiency);
2699  }
2700  hw_thread_index++;
2701  }
2702  KMP_ASSERT(hw_thread_index > 0);
2703  __kmp_topology->sort_ids();
2704  if (!__kmp_topology->check_ids()) {
2705  kmp_topology_t::deallocate(__kmp_topology);
2706  __kmp_topology = nullptr;
2707  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2708  return false;
2709  }
2710  return true;
2711 }
2712 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2713 
2714 #define osIdIndex 0
2715 #define threadIdIndex 1
2716 #define coreIdIndex 2
2717 #define pkgIdIndex 3
2718 #define nodeIdIndex 4
2719 
2720 typedef unsigned *ProcCpuInfo;
2721 static unsigned maxIndex = pkgIdIndex;
2722 
2723 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2724  const void *b) {
2725  unsigned i;
2726  const unsigned *aa = *(unsigned *const *)a;
2727  const unsigned *bb = *(unsigned *const *)b;
2728  for (i = maxIndex;; i--) {
2729  if (aa[i] < bb[i])
2730  return -1;
2731  if (aa[i] > bb[i])
2732  return 1;
2733  if (i == osIdIndex)
2734  break;
2735  }
2736  return 0;
2737 }
2738 
2739 #if KMP_USE_HIER_SCHED
2740 // Set the array sizes for the hierarchy layers
2741 static void __kmp_dispatch_set_hierarchy_values() {
2742  // Set the maximum number of L1's to number of cores
2743  // Set the maximum number of L2's to either the number of cores / 2 for the
2744  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing,
2745  // or the number of cores for Intel(R) Xeon(R) processors
2746  // Set the maximum number of NUMA nodes and L3's to number of packages
2747  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2748  nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2749  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2750 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2751  KMP_MIC_SUPPORTED
2752  if (__kmp_mic_type >= mic3)
2753  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2754  else
2755 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && KMP_MIC_SUPPORTED
2756  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2757  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2758  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2759  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2760  // Set the number of threads per unit
2761  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2762  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2763  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2764  __kmp_nThreadsPerCore;
2765 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2766  KMP_MIC_SUPPORTED
2767  if (__kmp_mic_type >= mic3)
2768  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2769  2 * __kmp_nThreadsPerCore;
2770  else
2771 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && KMP_MIC_SUPPORTED
2772  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2773  __kmp_nThreadsPerCore;
2774  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2775  nCoresPerPkg * __kmp_nThreadsPerCore;
2776  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2777  nCoresPerPkg * __kmp_nThreadsPerCore;
2778  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2779  nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2780 }
2781 
2782 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2783 // i.e., this thread's L1 or this thread's L2, etc.
2784 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2785  int index = type + 1;
2786  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2787  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2788  if (type == kmp_hier_layer_e::LAYER_THREAD)
2789  return tid;
2790  else if (type == kmp_hier_layer_e::LAYER_LOOP)
2791  return 0;
2792  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2793  if (tid >= num_hw_threads)
2794  tid = tid % num_hw_threads;
2795  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2796 }
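// Worked example (illustrative): on a 2-package x 8-core x 2-thread machine,
// __kmp_hier_max_units[LAYER_L1 + 1] == 16 and
// __kmp_hier_threads_per[LAYER_L1 + 1] == 2, so tid 5 maps to L1 index
// (5 / 2) % 16 == 2, i.e. the third core.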
2797 
2798 // Return the number of t1's per t2
2799 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2800  int i1 = t1 + 1;
2801  int i2 = t2 + 1;
2802  KMP_DEBUG_ASSERT(i1 <= i2);
2803  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2804  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2805  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2806  // (nthreads/t2) / (nthreads/t1) = t1 / t2
2807  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2808 }
2809 #endif // KMP_USE_HIER_SCHED
2810 
2811 static inline const char *__kmp_cpuinfo_get_filename() {
2812  const char *filename;
2813  if (__kmp_cpuinfo_file != nullptr)
2814  filename = __kmp_cpuinfo_file;
2815  else
2816  filename = "/proc/cpuinfo";
2817  return filename;
2818 }
2819 
2820 static inline const char *__kmp_cpuinfo_get_envvar() {
2821  const char *envvar = nullptr;
2822  if (__kmp_cpuinfo_file != nullptr)
2823  envvar = "KMP_CPUINFO_FILE";
2824  return envvar;
2825 }
2826 
2827 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2828 // affinity map.
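// A record in that format looks roughly like this (illustrative; the exact
// fields vary by architecture and kernel):
//
//   processor   : 0
//   physical id : 0
//   core id     : 0
//   ...
//
// Records are separated by blank lines. The parser below keys off the
// "processor", "physical id", "core id", "thread id" and "node_<n> id"
// leading tokens and ignores everything else.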
2829 static bool __kmp_affinity_create_cpuinfo_map(int *line,
2830  kmp_i18n_id_t *const msg_id) {
2831  const char *filename = __kmp_cpuinfo_get_filename();
2832  const char *envvar = __kmp_cpuinfo_get_envvar();
2833  *msg_id = kmp_i18n_null;
2834 
2835  if (__kmp_affinity.flags.verbose) {
2836  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2837  }
2838 
2839  kmp_safe_raii_file_t f(filename, "r", envvar);
2840 
2841  // Scan the file and count the number of "processor" (osId) fields,
2842  // and find the highest value of <n> for a node_<n> field.
2843  char buf[256];
2844  unsigned num_records = 0;
2845  while (!feof(f)) {
2846  buf[sizeof(buf) - 1] = 1;
2847  if (!fgets(buf, sizeof(buf), f)) {
2848  // Read errors presumably because of EOF
2849  break;
2850  }
2851 
2852  char s1[] = "processor";
2853  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2854  num_records++;
2855  continue;
2856  }
2857 
2858  // FIXME - this will match "node_<n> <garbage>"
2859  unsigned level;
2860  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2861  // validate the input first:
2862  if (level > (unsigned)__kmp_xproc) { // level is too big
2863  level = __kmp_xproc;
2864  }
2865  if (nodeIdIndex + level >= maxIndex) {
2866  maxIndex = nodeIdIndex + level;
2867  }
2868  continue;
2869  }
2870  }
2871 
2872  // Check for empty file / no valid processor records, or too many. The number
2873  // of records can't exceed the number of valid bits in the affinity mask.
2874  if (num_records == 0) {
2875  *msg_id = kmp_i18n_str_NoProcRecords;
2876  return false;
2877  }
2878  if (num_records > (unsigned)__kmp_xproc) {
2879  *msg_id = kmp_i18n_str_TooManyProcRecords;
2880  return false;
2881  }
2882 
2883  // Set the file pointer back to the beginning, so that we can scan the file
2884  // again, this time performing a full parse of the data. Allocate a vector of
2885  // ProcCpuInfo objects, where we will place the data. Adding an extra element
2886  // at the end allows us to remove a lot of extra checks for termination
2887  // conditions.
2888  if (fseek(f, 0, SEEK_SET) != 0) {
2889  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2890  return false;
2891  }
2892 
2893  // Allocate the array of records to store the proc info in. The dummy
2894  // element at the end makes the logic in filling them out easier to code.
2895  unsigned **threadInfo =
2896  (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2897  unsigned i;
2898  for (i = 0; i <= num_records; i++) {
2899  threadInfo[i] =
2900  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2901  }
2902 
2903 #define CLEANUP_THREAD_INFO \
2904  for (i = 0; i <= num_records; i++) { \
2905  __kmp_free(threadInfo[i]); \
2906  } \
2907  __kmp_free(threadInfo);
2908 
2909  // A value of UINT_MAX means that we didn't find the field
2910  unsigned __index;
2911 
2912 #define INIT_PROC_INFO(p) \
2913  for (__index = 0; __index <= maxIndex; __index++) { \
2914  (p)[__index] = UINT_MAX; \
2915  }
2916 
2917  for (i = 0; i <= num_records; i++) {
2918  INIT_PROC_INFO(threadInfo[i]);
2919  }
2920 
2921  unsigned num_avail = 0;
2922  *line = 0;
2923  while (!feof(f)) {
2924  // Create an inner scoping level, so that all the goto targets at the end of
2925  // the loop appear in an outer scoping level. This avoids warnings about
2926  // jumping past an initialization to a target in the same block.
2927  {
2928  buf[sizeof(buf) - 1] = 1;
2929  bool long_line = false;
2930  if (!fgets(buf, sizeof(buf), f)) {
2931  // Read errors presumably because of EOF
2932  // If there is valid data in threadInfo[num_avail], then fake
2933  // a blank line to ensure that the last record gets parsed.
2934  bool valid = false;
2935  for (i = 0; i <= maxIndex; i++) {
2936  if (threadInfo[num_avail][i] != UINT_MAX) {
2937  valid = true;
2938  }
2939  }
2940  if (!valid) {
2941  break;
2942  }
2943  buf[0] = 0;
2944  } else if (!buf[sizeof(buf) - 1]) {
2945  // The line is longer than the buffer. Set a flag and don't
2946  // emit an error if we were going to ignore the line, anyway.
2947  long_line = true;
2948 
2949 #define CHECK_LINE \
2950  if (long_line) { \
2951  CLEANUP_THREAD_INFO; \
2952  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2953  return false; \
2954  }
2955  }
2956  (*line)++;
2957 
2958 #if KMP_ARCH_LOONGARCH64
2959  // The parsing logic of /proc/cpuinfo in this function highly depends on
2960  // the blank lines between each processor info block. But on LoongArch a
2961  // blank line exists before the first processor info block (i.e. after the
2962  // "system type" line). This blank line was added because the "system
2963  // type" line is unrelated to any of the CPUs. We must skip this line so
2964  // that the original logic works on LoongArch.
2965  if (*buf == '\n' && *line == 2)
2966  continue;
2967 #endif
2968 
2969  char s1[] = "processor";
2970  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2971  CHECK_LINE;
2972  char *p = strchr(buf + sizeof(s1) - 1, ':');
2973  unsigned val;
2974  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2975  goto no_val;
2976  if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
2977 #if KMP_ARCH_AARCH64
2978  // Handle the old AArch64 /proc/cpuinfo layout differently:
2979  // it lists all of the 'processor' entries in a single
2980  // 'Processor' section, so the normal check for duplicates
2981  // in that section will always fail.
2982  num_avail++;
2983 #else
2984  goto dup_field;
2985 #endif
2986  threadInfo[num_avail][osIdIndex] = val;
2987 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2988  char path[256];
2989  KMP_SNPRINTF(
2990  path, sizeof(path),
2991  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2992  threadInfo[num_avail][osIdIndex]);
2993  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2994 
2995  KMP_SNPRINTF(path, sizeof(path),
2996  "/sys/devices/system/cpu/cpu%u/topology/core_id",
2997  threadInfo[num_avail][osIdIndex]);
2998  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2999  continue;
3000 #else
3001  }
3002  char s2[] = "physical id";
3003  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
3004  CHECK_LINE;
3005  char *p = strchr(buf + sizeof(s2) - 1, ':');
3006  unsigned val;
3007  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3008  goto no_val;
3009  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
3010  goto dup_field;
3011  threadInfo[num_avail][pkgIdIndex] = val;
3012  continue;
3013  }
3014  char s3[] = "core id";
3015  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
3016  CHECK_LINE;
3017  char *p = strchr(buf + sizeof(s3) - 1, ':');
3018  unsigned val;
3019  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3020  goto no_val;
3021  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
3022  goto dup_field;
3023  threadInfo[num_avail][coreIdIndex] = val;
3024  continue;
3025 #endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3026  }
3027  char s4[] = "thread id";
3028  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
3029  CHECK_LINE;
3030  char *p = strchr(buf + sizeof(s4) - 1, ':');
3031  unsigned val;
3032  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3033  goto no_val;
3034  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
3035  goto dup_field;
3036  threadInfo[num_avail][threadIdIndex] = val;
3037  continue;
3038  }
3039  unsigned level;
3040  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
3041  CHECK_LINE;
3042  char *p = strchr(buf + sizeof(s4) - 1, ':');
3043  unsigned val;
3044  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3045  goto no_val;
3046  // validate the input before using level:
3047  if (level > (unsigned)__kmp_xproc) { // level is too big
3048  level = __kmp_xproc;
3049  }
3050  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
3051  goto dup_field;
3052  threadInfo[num_avail][nodeIdIndex + level] = val;
3053  continue;
3054  }
3055 
3056  // We didn't recognize the leading token on the line. There are lots of
3057  // leading tokens that we don't recognize - if the line isn't empty, go on
3058  // to the next line.
3059  if ((*buf != 0) && (*buf != '\n')) {
3060  // If the line is longer than the buffer, read characters
3061  // until we find a newline.
3062  if (long_line) {
3063  int ch;
3064  while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
3065  ;
3066  }
3067  continue;
3068  }
3069 
3070  // A newline has signalled the end of the processor record.
3071  // Check that there aren't too many procs specified.
3072  if ((int)num_avail == __kmp_xproc) {
3073  CLEANUP_THREAD_INFO;
3074  *msg_id = kmp_i18n_str_TooManyEntries;
3075  return false;
3076  }
3077 
3078  // Check for missing fields. The osId field must be there, and we
3079  // currently require that the physical id field is specified, also.
3080  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
3081  CLEANUP_THREAD_INFO;
3082  *msg_id = kmp_i18n_str_MissingProcField;
3083  return false;
3084  }
3085  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
3086  CLEANUP_THREAD_INFO;
3087  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
3088  return false;
3089  }
3090 
3091  // Skip this proc if it is not included in the machine model.
3092  if (KMP_AFFINITY_CAPABLE() &&
3093  !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
3094  __kmp_affin_fullMask)) {
3095  INIT_PROC_INFO(threadInfo[num_avail]);
3096  continue;
3097  }
3098 
3099  // We have a successful parse of this proc's info.
3100  // Increment the counter, and prepare for the next proc.
3101  num_avail++;
3102  KMP_ASSERT(num_avail <= num_records);
3103  INIT_PROC_INFO(threadInfo[num_avail]);
3104  }
3105  continue;
3106 
3107  no_val:
3108  CLEANUP_THREAD_INFO;
3109  *msg_id = kmp_i18n_str_MissingValCpuinfo;
3110  return false;
3111 
3112  dup_field:
3113  CLEANUP_THREAD_INFO;
3114  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
3115  return false;
3116  }
3117  *line = 0;
3118 
3119 #if KMP_MIC && REDUCE_TEAM_SIZE
3120  unsigned teamSize = 0;
3121 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3122 
3123  // check for num_records == __kmp_xproc ???
3124 
3125  // If it is configured to omit the package level when there is only a single
3126  // package, the logic at the end of this routine won't work if there is only a
3127  // single thread
3128  KMP_ASSERT(num_avail > 0);
3129  KMP_ASSERT(num_avail <= num_records);
3130 
3131  // Sort the threadInfo table by physical Id.
3132  qsort(threadInfo, num_avail, sizeof(*threadInfo),
3133  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
3134 
3135  // The table is now sorted by pkgId / coreId / threadId, but we really don't
3136  // know the radix of any of the fields. pkgId's may be sparsely assigned among
3137  // the chips on a system. Although coreId's are usually assigned
3138  // [0 .. coresPerPkg-1] and threadId's are usually assigned
3139  // [0..threadsPerCore-1], we don't want to make any such assumptions.
3140  //
3141  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
3142  // total # packages) are at this point - we want to determine that now. We
3143  // only have an upper bound on the first two figures.
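// Worked example (illustrative): on a 2-package x 2-core x 2-thread box the
// sorted table yields totals[pkgIdIndex] == 2, totals[coreIdIndex] == 4,
// maxCt[coreIdIndex] == 2 and maxCt[threadIdIndex] == 2, which the code below
// turns into nPackages = 2, nCoresPerPkg = 2, __kmp_nThreadsPerCore = 2 and
// __kmp_ncores = 4.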
3144  unsigned *counts =
3145  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3146  unsigned *maxCt =
3147  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3148  unsigned *totals =
3149  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3150  unsigned *lastId =
3151  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3152 
3153  bool assign_thread_ids = false;
3154  unsigned threadIdCt;
3155  unsigned index;
3156 
3157 restart_radix_check:
3158  threadIdCt = 0;
3159 
3160  // Initialize the counter arrays with data from threadInfo[0].
3161  if (assign_thread_ids) {
3162  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
3163  threadInfo[0][threadIdIndex] = threadIdCt++;
3164  } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
3165  threadIdCt = threadInfo[0][threadIdIndex] + 1;
3166  }
3167  }
3168  for (index = 0; index <= maxIndex; index++) {
3169  counts[index] = 1;
3170  maxCt[index] = 1;
3171  totals[index] = 1;
3172  lastId[index] = threadInfo[0][index];
3174  }
3175 
3176  // Run through the rest of the OS procs.
3177  for (i = 1; i < num_avail; i++) {
3178  // Find the most significant index whose id differs from the id for the
3179  // previous OS proc.
3180  for (index = maxIndex; index >= threadIdIndex; index--) {
3181  if (assign_thread_ids && (index == threadIdIndex)) {
3182  // Auto-assign the thread id field if it wasn't specified.
3183  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3184  threadInfo[i][threadIdIndex] = threadIdCt++;
3185  }
3186  // Apparently the thread id field was specified for some entries and not
3187  // others. Start the thread id counter off at the next higher thread id.
3188  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3189  threadIdCt = threadInfo[i][threadIdIndex] + 1;
3190  }
3191  }
3192  if (threadInfo[i][index] != lastId[index]) {
3193  // Run through all indices which are less significant, and reset the
3194  // counts to 1. At all levels up to and including index, we need to
3195  // increment the totals and record the last id.
3196  unsigned index2;
3197  for (index2 = threadIdIndex; index2 < index; index2++) {
3198  totals[index2]++;
3199  if (counts[index2] > maxCt[index2]) {
3200  maxCt[index2] = counts[index2];
3201  }
3202  counts[index2] = 1;
3203  lastId[index2] = threadInfo[i][index2];
3204  }
3205  counts[index]++;
3206  totals[index]++;
3207  lastId[index] = threadInfo[i][index];
3208 
3209  if (assign_thread_ids && (index > threadIdIndex)) {
3210 
3211 #if KMP_MIC && REDUCE_TEAM_SIZE
3212  // The default team size is the total #threads in the machine
3213  // minus 1 thread for every core that has 3 or more threads.
3214  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3215 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3216 
3217  // Restart the thread counter, as we are on a new core.
3218  threadIdCt = 0;
3219 
3220  // Auto-assign the thread id field if it wasn't specified.
3221  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3222  threadInfo[i][threadIdIndex] = threadIdCt++;
3223  }
3224 
3225  // Apparently the thread id field was specified for some entries and
3226  // not others. Start the thread id counter off at the next higher
3227  // thread id.
3228  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3229  threadIdCt = threadInfo[i][threadIdIndex] + 1;
3230  }
3231  }
3232  break;
3233  }
3234  }
3235  if (index < threadIdIndex) {
3236  // If thread ids were specified, it is an error if they are not unique.
3237  // Also, check that we haven't already restarted the loop (to be safe -
3238  // shouldn't need to).
3239  if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
3240  __kmp_free(lastId);
3241  __kmp_free(totals);
3242  __kmp_free(maxCt);
3243  __kmp_free(counts);
3244  CLEANUP_THREAD_INFO;
3245  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3246  return false;
3247  }
3248 
3249  // If the thread ids were not specified and we see entries that
3250  // are duplicates, start the loop over and assign the thread ids manually.
3251  assign_thread_ids = true;
3252  goto restart_radix_check;
3253  }
3254  }
3255 
3256 #if KMP_MIC && REDUCE_TEAM_SIZE
3257  // The default team size is the total #threads in the machine
3258  // minus 1 thread for every core that has 3 or more threads.
3259  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3260 #endif // KMP_MIC && REDUCE_TEAM_SIZE
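  // Worked example of the formula above (hypothetical numbers): on a machine
  // with 60 cores of 4 threads each, every core contributes 4 - 1 = 3, so
  // teamSize would be 180, i.e. the 240 hardware threads minus one thread for
  // each core carrying 3 or more threads.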
3261 
3262  for (index = threadIdIndex; index <= maxIndex; index++) {
3263  if (counts[index] > maxCt[index]) {
3264  maxCt[index] = counts[index];
3265  }
3266  }
3267 
3268  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
3269  nCoresPerPkg = maxCt[coreIdIndex];
3270  nPackages = totals[pkgIdIndex];
3271 
3272  // When affinity is off, this routine will still be called to set
3273  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
3274  // Make sure all these vars are set correctly, and return now if affinity is
3275  // not enabled.
3276  __kmp_ncores = totals[coreIdIndex];
3277  if (!KMP_AFFINITY_CAPABLE()) {
3278  KMP_ASSERT(__kmp_affinity.type == affinity_none);
3279  return true;
3280  }
3281 
3282 #if KMP_MIC && REDUCE_TEAM_SIZE
3283  // Set the default team size.
3284  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
3285  __kmp_dflt_team_nth = teamSize;
3286  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
3287  "__kmp_dflt_team_nth = %d\n",
3288  __kmp_dflt_team_nth));
3289  }
3290 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3291 
3292  KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
3293 
3294  // Count the number of levels which have more nodes at that level than at the
3295  // parent's level (with there being an implicit root node of the top level).
3296  // This is equivalent to saying that there is at least one node at this level
3297  // which has a sibling. These levels are in the map, and the package level is
3298  // always in the map.
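  // For example (hypothetical totals of 8 threads / 4 cores / 2 packages,
  // with only package, core and thread fields present): every level has more
  // nodes than its parent, so every level lands in the map and depth becomes
  // 3. With one thread per core, totals[threadIdIndex] would equal
  // totals[coreIdIndex] and the thread level would drop out by the general
  // rule, but the package, core and thread levels are unconditionally forced
  // back into the map below.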
3299  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
3300  for (index = threadIdIndex; index < maxIndex; index++) {
3301  KMP_ASSERT(totals[index] >= totals[index + 1]);
3302  inMap[index] = (totals[index] > totals[index + 1]);
3303  }
3304  inMap[maxIndex] = (totals[maxIndex] > 1);
3305  inMap[pkgIdIndex] = true;
3306  inMap[coreIdIndex] = true;
3307  inMap[threadIdIndex] = true;
3308 
3309  int depth = 0;
3310  int idx = 0;
3311  kmp_hw_t types[KMP_HW_LAST];
3312  int pkgLevel = -1;
3313  int coreLevel = -1;
3314  int threadLevel = -1;
3315  for (index = threadIdIndex; index <= maxIndex; index++) {
3316  if (inMap[index]) {
3317  depth++;
3318  }
3319  }
3320  if (inMap[pkgIdIndex]) {
3321  pkgLevel = idx;
3322  types[idx++] = KMP_HW_SOCKET;
3323  }
3324  if (inMap[coreIdIndex]) {
3325  coreLevel = idx;
3326  types[idx++] = KMP_HW_CORE;
3327  }
3328  if (inMap[threadIdIndex]) {
3329  threadLevel = idx;
3330  types[idx++] = KMP_HW_THREAD;
3331  }
3332  KMP_ASSERT(depth > 0);
3333 
3334  // Construct the data structure that is to be returned.
3335  __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
3336 
3337  for (i = 0; i < num_avail; ++i) {
3338  unsigned os = threadInfo[i][osIdIndex];
3339  int src_index;
3340  kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3341  hw_thread.clear();
3342  hw_thread.os_id = os;
3343 
3344  idx = 0;
3345  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
3346  if (!inMap[src_index]) {
3347  continue;
3348  }
3349  if (src_index == pkgIdIndex) {
3350  hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
3351  } else if (src_index == coreIdIndex) {
3352  hw_thread.ids[coreLevel] = threadInfo[i][src_index];
3353  } else if (src_index == threadIdIndex) {
3354  hw_thread.ids[threadLevel] = threadInfo[i][src_index];
3355  }
3356  }
3357  }
3358 
3359  __kmp_free(inMap);
3360  __kmp_free(lastId);
3361  __kmp_free(totals);
3362  __kmp_free(maxCt);
3363  __kmp_free(counts);
3364  CLEANUP_THREAD_INFO;
3365  __kmp_topology->sort_ids();
3366  if (!__kmp_topology->check_ids()) {
3367  kmp_topology_t::deallocate(__kmp_topology);
3368  __kmp_topology = nullptr;
3369  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3370  return false;
3371  }
3372  return true;
3373 }
3374 
3375 // Create and return a table of affinity masks, indexed by OS thread ID.
3376 // This routine handles OR'ing together all the affinity masks of threads
3377 // that are sufficiently close, if granularity > fine.
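// For example (hypothetical OS proc numbering): with granularity=core on a
// machine whose OS procs 0 and 1 are the two hardware threads of one core,
// the entries for OS ids 0 and 1 would both hold the mask {0,1}; with
// granularity=fine (gran_levels == 0) each entry holds only its own OS id.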
3378 static void __kmp_create_os_id_masks(unsigned *numUnique,
3379  kmp_affinity_t &affinity) {
3380  // First form a table of affinity masks in order of OS thread id.
3381  int maxOsId;
3382  int i;
3383  int numAddrs = __kmp_topology->get_num_hw_threads();
3384  int depth = __kmp_topology->get_depth();
3385  const char *env_var = affinity.env_var;
3386  KMP_ASSERT(numAddrs);
3387  KMP_ASSERT(depth);
3388 
3389  maxOsId = 0;
3390  for (i = numAddrs - 1;; --i) {
3391  int osId = __kmp_topology->at(i).os_id;
3392  if (osId > maxOsId) {
3393  maxOsId = osId;
3394  }
3395  if (i == 0)
3396  break;
3397  }
3398  affinity.num_os_id_masks = maxOsId + 1;
3399  KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
3400  KMP_ASSERT(affinity.gran_levels >= 0);
3401  if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
3402  KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
3403  }
3404  if (affinity.gran_levels >= (int)depth) {
3405  KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);
3406  }
3407 
3408  // Run through the table, forming the masks for all threads on each core.
3409  // Threads on the same core will have identical kmp_hw_thread_t objects, not
3410  // considering the last level, which must be the thread id. All threads on a
3411  // core will appear consecutively.
3412  int unique = 0;
3413  int j = 0; // index of 1st thread on core
3414  int leader = 0;
3415  kmp_affin_mask_t *sum;
3416  KMP_CPU_ALLOC_ON_STACK(sum);
3417  KMP_CPU_ZERO(sum);
3418  KMP_CPU_SET(__kmp_topology->at(0).os_id, sum);
3419  for (i = 1; i < numAddrs; i++) {
3420  // If this thread is sufficiently close to the leader (within the
3421  // granularity setting), then set the bit for this os thread in the
3422  // affinity mask for this group, and go on to the next thread.
3423  if (__kmp_topology->is_close(leader, i, affinity.gran_levels)) {
3424  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3425  continue;
3426  }
3427 
3428  // For every thread in this group, copy the mask to the thread's entry in
3429  // the OS Id mask table. Mark the first address as a leader.
3430  for (; j < i; j++) {
3431  int osId = __kmp_topology->at(j).os_id;
3432  KMP_DEBUG_ASSERT(osId <= maxOsId);
3433  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3434  KMP_CPU_COPY(mask, sum);
3435  __kmp_topology->at(j).leader = (j == leader);
3436  }
3437  unique++;
3438 
3439  // Start a new mask.
3440  leader = i;
3441  KMP_CPU_ZERO(sum);
3442  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3443  }
3444 
3445  // For every thread in last group, copy the mask to the thread's
3446  // entry in the OS Id mask table.
3447  for (; j < i; j++) {
3448  int osId = __kmp_topology->at(j).os_id;
3449  KMP_DEBUG_ASSERT(osId <= maxOsId);
3450  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3451  KMP_CPU_COPY(mask, sum);
3452  __kmp_topology->at(j).leader = (j == leader);
3453  }
3454  unique++;
3455  KMP_CPU_FREE_FROM_STACK(sum);
3456 
3457  *numUnique = unique;
3458 }
3459 
3460 // Shared state for the affinity proclist parsers. It's easier to declare
3461 // these vars as file-static than to try to pass them through the calling
3462 // sequence of the recursive-descent OMP_PLACES parser.
3463 static kmp_affin_mask_t *newMasks;
3464 static int numNewMasks;
3465 static int nextNewMask;
3466 
3467 #define ADD_MASK(_mask) \
3468  { \
3469  if (nextNewMask >= numNewMasks) { \
3470  int i; \
3471  numNewMasks *= 2; \
3472  kmp_affin_mask_t *temp; \
3473  KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
3474  for (i = 0; i < numNewMasks / 2; i++) { \
3475  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \
3476  kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \
3477  KMP_CPU_COPY(dest, src); \
3478  } \
3479  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \
3480  newMasks = temp; \
3481  } \
3482  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
3483  nextNewMask++; \
3484  }
3485 
3486 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
3487  { \
3488  if (((_osId) > _maxOsId) || \
3489  (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
3490  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \
3491  } else { \
3492  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
3493  } \
3494  }
3495 
3496 // Re-parse the proclist (for the explicit affinity type), and form the list
3497 // of affinity newMasks indexed by gtid.
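// For example, an explicit proclist such as "3,0-2,{7,10}" (hypothetical OS
// proc ids, assumed valid on the machine) would yield one mask per item, in
// list order: {3}, {0}, {1}, {2}, and a single union mask {7,10} for the
// braced set.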
3498 static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
3499  int i;
3500  kmp_affin_mask_t **out_masks = &affinity.masks;
3501  unsigned *out_numMasks = &affinity.num_masks;
3502  const char *proclist = affinity.proclist;
3503  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3504  int maxOsId = affinity.num_os_id_masks - 1;
3505  const char *scan = proclist;
3506  const char *next = proclist;
3507 
3508  // The temporary mask vector starts small and is grown by doubling (see the
3509  // ADD_MASK macro above) whenever it fills up.
3510  numNewMasks = 2;
3511  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3512  nextNewMask = 0;
3513  kmp_affin_mask_t *sumMask;
3514  KMP_CPU_ALLOC(sumMask);
3515  int setSize = 0;
3516 
3517  for (;;) {
3518  int start, end, stride;
3519 
3520  SKIP_WS(scan);
3521  next = scan;
3522  if (*next == '\0') {
3523  break;
3524  }
3525 
3526  if (*next == '{') {
3527  int num;
3528  setSize = 0;
3529  next++; // skip '{'
3530  SKIP_WS(next);
3531  scan = next;
3532 
3533  // Read the first integer in the set.
3534  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
3535  SKIP_DIGITS(next);
3536  num = __kmp_str_to_int(scan, *next);
3537  KMP_ASSERT2(num >= 0, "bad explicit proc list");
3538 
3539  // Copy the mask for that osId to the sum (union) mask.
3540  if ((num > maxOsId) ||
3541  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3542  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3543  KMP_CPU_ZERO(sumMask);
3544  } else {
3545  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3546  setSize = 1;
3547  }
3548 
3549  for (;;) {
3550  // Check for end of set.
3551  SKIP_WS(next);
3552  if (*next == '}') {
3553  next++; // skip '}'
3554  break;
3555  }
3556 
3557  // Skip optional comma.
3558  if (*next == ',') {
3559  next++;
3560  }
3561  SKIP_WS(next);
3562 
3563  // Read the next integer in the set.
3564  scan = next;
3565  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3566 
3567  SKIP_DIGITS(next);
3568  num = __kmp_str_to_int(scan, *next);
3569  KMP_ASSERT2(num >= 0, "bad explicit proc list");
3570 
3571  // Add the mask for that osId to the sum mask.
3572  if ((num > maxOsId) ||
3573  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3574  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3575  } else {
3576  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3577  setSize++;
3578  }
3579  }
3580  if (setSize > 0) {
3581  ADD_MASK(sumMask);
3582  }
3583 
3584  SKIP_WS(next);
3585  if (*next == ',') {
3586  next++;
3587  }
3588  scan = next;
3589  continue;
3590  }
3591 
3592  // Read the first integer.
3593  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3594  SKIP_DIGITS(next);
3595  start = __kmp_str_to_int(scan, *next);
3596  KMP_ASSERT2(start >= 0, "bad explicit proc list");
3597  SKIP_WS(next);
3598 
3599  // If this isn't a range, then add a mask to the list and go on.
3600  if (*next != '-') {
3601  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3602 
3603  // Skip optional comma.
3604  if (*next == ',') {
3605  next++;
3606  }
3607  scan = next;
3608  continue;
3609  }
3610 
3611  // This is a range. Skip over the '-' and read in the 2nd int.
3612  next++; // skip '-'
3613  SKIP_WS(next);
3614  scan = next;
3615  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3616  SKIP_DIGITS(next);
3617  end = __kmp_str_to_int(scan, *next);
3618  KMP_ASSERT2(end >= 0, "bad explicit proc list");
3619 
3620  // Check for a stride parameter
3621  stride = 1;
3622  SKIP_WS(next);
3623  if (*next == ':') {
3624  // A stride is specified. Skip over the ':' and read the 3rd int.
3625  int sign = +1;
3626  next++; // skip ':'
3627  SKIP_WS(next);
3628  scan = next;
3629  if (*next == '-') {
3630  sign = -1;
3631  next++;
3632  SKIP_WS(next);
3633  scan = next;
3634  }
3635  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3636  SKIP_DIGITS(next);
3637  stride = __kmp_str_to_int(scan, *next);
3638  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3639  stride *= sign;
3640  }
3641 
3642  // Do some range checks.
3643  KMP_ASSERT2(stride != 0, "bad explicit proc list");
3644  if (stride > 0) {
3645  KMP_ASSERT2(start <= end, "bad explicit proc list");
3646  } else {
3647  KMP_ASSERT2(start >= end, "bad explicit proc list");
3648  }
3649  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3650 
3651  // Add the mask for each OS proc # to the list.
3652  if (stride > 0) {
3653  do {
3654  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3655  start += stride;
3656  } while (start <= end);
3657  } else {
3658  do {
3659  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3660  start += stride;
3661  } while (start >= end);
3662  }
3663 
3664  // Skip optional comma.
3665  SKIP_WS(next);
3666  if (*next == ',') {
3667  next++;
3668  }
3669  scan = next;
3670  }
3671 
3672  *out_numMasks = nextNewMask;
3673  if (nextNewMask == 0) {
3674  *out_masks = NULL;
3675  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
  KMP_CPU_FREE(sumMask); // also release the scratch union mask on this early return
3676  return;
3677  }
3678  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3679  for (i = 0; i < nextNewMask; i++) {
3680  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3681  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3682  KMP_CPU_COPY(dest, src);
3683  }
3684  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3685  KMP_CPU_FREE(sumMask);
3686 }
3687 
3688 /*-----------------------------------------------------------------------------
3689 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3690 places. Again, here is the grammar:
3691 
3692 place_list := place
3693 place_list := place , place_list
3694 place := num
3695 place := place : num
3696 place := place : num : signed
3697 place := { subplace_list }
3698 place := ! place // (lowest priority)
3699 subplace_list := subplace
3700 subplace_list := subplace , subplace_list
3701 subplace := num
3702 subplace := num : num
3703 subplace := num : num : signed
3704 signed := num
3705 signed := + signed
3706 signed := - signed
3707 -----------------------------------------------------------------------------*/
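// For example, the (hypothetical) places string "{0,1},{2,3}:2:4" is a
// subplace list {0,1} followed by the place {2,3} expanded with count 2 and
// stride 4; assuming all of those OS proc ids exist, the resulting places are
// {0,1}, {2,3} and {6,7}.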
3708 static void __kmp_process_subplace_list(const char **scan,
3709  kmp_affinity_t &affinity, int maxOsId,
3710  kmp_affin_mask_t *tempMask,
3711  int *setSize) {
3712  const char *next;
3713  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3714 
3715  for (;;) {
3716  int start, count, stride, i;
3717 
3718  // Read in the starting proc id
3719  SKIP_WS(*scan);
3720  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3721  next = *scan;
3722  SKIP_DIGITS(next);
3723  start = __kmp_str_to_int(*scan, *next);
3724  KMP_ASSERT(start >= 0);
3725  *scan = next;
3726 
3727  // valid follow sets are ',' ':' and '}'
3728  SKIP_WS(*scan);
3729  if (**scan == '}' || **scan == ',') {
3730  if ((start > maxOsId) ||
3731  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3732  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3733  } else {
3734  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3735  (*setSize)++;
3736  }
3737  if (**scan == '}') {
3738  break;
3739  }
3740  (*scan)++; // skip ','
3741  continue;
3742  }
3743  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3744  (*scan)++; // skip ':'
3745 
3746  // Read count parameter
3747  SKIP_WS(*scan);
3748  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3749  next = *scan;
3750  SKIP_DIGITS(next);
3751  count = __kmp_str_to_int(*scan, *next);
3752  KMP_ASSERT(count >= 0);
3753  *scan = next;
3754 
3755  // valid follow sets are ',' ':' and '}'
3756  SKIP_WS(*scan);
3757  if (**scan == '}' || **scan == ',') {
3758  for (i = 0; i < count; i++) {
3759  if ((start > maxOsId) ||
3760  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3761  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3762  break; // don't proliferate warnings for large count
3763  } else {
3764  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3765  start++;
3766  (*setSize)++;
3767  }
3768  }
3769  if (**scan == '}') {
3770  break;
3771  }
3772  (*scan)++; // skip ','
3773  continue;
3774  }
3775  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3776  (*scan)++; // skip ':'
3777 
3778  // Read stride parameter
3779  int sign = +1;
3780  for (;;) {
3781  SKIP_WS(*scan);
3782  if (**scan == '+') {
3783  (*scan)++; // skip '+'
3784  continue;
3785  }
3786  if (**scan == '-') {
3787  sign *= -1;
3788  (*scan)++; // skip '-'
3789  continue;
3790  }
3791  break;
3792  }
3793  SKIP_WS(*scan);
3794  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3795  next = *scan;
3796  SKIP_DIGITS(next);
3797  stride = __kmp_str_to_int(*scan, *next);
3798  KMP_ASSERT(stride >= 0);
3799  *scan = next;
3800  stride *= sign;
3801 
3802  // valid follow sets are ',' and '}'
3803  SKIP_WS(*scan);
3804  if (**scan == '}' || **scan == ',') {
3805  for (i = 0; i < count; i++) {
3806  if ((start > maxOsId) ||
3807  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3808  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3809  break; // don't proliferate warnings for large count
3810  } else {
3811  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3812  start += stride;
3813  (*setSize)++;
3814  }
3815  }
3816  if (**scan == '}') {
3817  break;
3818  }
3819  (*scan)++; // skip ','
3820  continue;
3821  }
3822 
3823  KMP_ASSERT2(0, "bad explicit places list");
3824  }
3825 }
3826 
3827 static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
3828  int maxOsId, kmp_affin_mask_t *tempMask,
3829  int *setSize) {
3830  const char *next;
3831  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3832 
3833  // valid follow sets are '{' '!' and num
3834  SKIP_WS(*scan);
3835  if (**scan == '{') {
3836  (*scan)++; // skip '{'
3837  __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
3838  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3839  (*scan)++; // skip '}'
3840  } else if (**scan == '!') {
3841  (*scan)++; // skip '!'
3842  __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
3843  KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3844  } else if ((**scan >= '0') && (**scan <= '9')) {
3845  next = *scan;
3846  SKIP_DIGITS(next);
3847  int num = __kmp_str_to_int(*scan, *next);
3848  KMP_ASSERT(num >= 0);
3849  if ((num > maxOsId) ||
3850  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3851  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3852  } else {
3853  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3854  (*setSize)++;
3855  }
3856  *scan = next; // skip num
3857  } else {
3858  KMP_ASSERT2(0, "bad explicit places list");
3859  }
3860 }
3861 
3862 // static void
3863 void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
3864  int i, j, count, stride, sign;
3865  kmp_affin_mask_t **out_masks = &affinity.masks;
3866  unsigned *out_numMasks = &affinity.num_masks;
3867  const char *placelist = affinity.proclist;
3868  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3869  int maxOsId = affinity.num_os_id_masks - 1;
3870  const char *scan = placelist;
3871  const char *next = placelist;
3872 
3873  numNewMasks = 2;
3874  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3875  nextNewMask = 0;
3876 
3877  // tempMask is modified based on the previous or initial place to form the
3878  // current place; previousMask contains the previous place.
3880  kmp_affin_mask_t *tempMask;
3881  kmp_affin_mask_t *previousMask;
3882  KMP_CPU_ALLOC(tempMask);
3883  KMP_CPU_ZERO(tempMask);
3884  KMP_CPU_ALLOC(previousMask);
3885  KMP_CPU_ZERO(previousMask);
3886  int setSize = 0;
3887 
3888  for (;;) {
3889  __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);
3890 
3891  // valid follow sets are ',' ':' and EOL
3892  SKIP_WS(scan);
3893  if (*scan == '\0' || *scan == ',') {
3894  if (setSize > 0) {
3895  ADD_MASK(tempMask);
3896  }
3897  KMP_CPU_ZERO(tempMask);
3898  setSize = 0;
3899  if (*scan == '\0') {
3900  break;
3901  }
3902  scan++; // skip ','
3903  continue;
3904  }
3905 
3906  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3907  scan++; // skip ':'
3908 
3909  // Read count parameter
3910  SKIP_WS(scan);
3911  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3912  next = scan;
3913  SKIP_DIGITS(next);
3914  count = __kmp_str_to_int(scan, *next);
3915  KMP_ASSERT(count >= 0);
3916  scan = next;
3917 
3918  // valid follow sets are ',' ':' and EOL
3919  SKIP_WS(scan);
3920  if (*scan == '\0' || *scan == ',') {
3921  stride = +1;
3922  } else {
3923  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3924  scan++; // skip ':'
3925 
3926  // Read stride parameter
3927  sign = +1;
3928  for (;;) {
3929  SKIP_WS(scan);
3930  if (*scan == '+') {
3931  scan++; // skip '+'
3932  continue;
3933  }
3934  if (*scan == '-') {
3935  sign *= -1;
3936  scan++; // skip '-'
3937  continue;
3938  }
3939  break;
3940  }
3941  SKIP_WS(scan);
3942  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3943  next = scan;
3944  SKIP_DIGITS(next);
3945  stride = __kmp_str_to_int(scan, *next);
3946  KMP_DEBUG_ASSERT(stride >= 0);
3947  scan = next;
3948  stride *= sign;
3949  }
3950 
3951  // Add places determined by initial_place : count : stride
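    // Trace of one step (hypothetical values): if the current place is {2,3}
    // with count 2 and stride 4, iteration 0 records {2,3} and builds the
    // next tempMask {6,7} by shifting every set bit by the stride;
    // iteration 1 then records {6,7}.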
3952  for (i = 0; i < count; i++) {
3953  if (setSize == 0) {
3954  break;
3955  }
3956  // Add the current place, then build the next place (tempMask) from that
3957  KMP_CPU_COPY(previousMask, tempMask);
3958  ADD_MASK(previousMask);
3959  KMP_CPU_ZERO(tempMask);
3960  setSize = 0;
3961  KMP_CPU_SET_ITERATE(j, previousMask) {
3962  if (!KMP_CPU_ISSET(j, previousMask)) {
3963  continue;
3964  }
3965  if ((j + stride > maxOsId) || (j + stride < 0) ||
3966  (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3967  (!KMP_CPU_ISSET(j + stride,
3968  KMP_CPU_INDEX(osId2Mask, j + stride)))) {
3969  if (i < count - 1) {
3970  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
3971  }
3972  continue;
3973  }
3974  KMP_CPU_SET(j + stride, tempMask);
3975  setSize++;
3976  }
3977  }
3978  KMP_CPU_ZERO(tempMask);
3979  setSize = 0;
3980 
3981  // valid follow sets are ',' and EOL
3982  SKIP_WS(scan);
3983  if (*scan == '\0') {
3984  break;
3985  }
3986  if (*scan == ',') {
3987  scan++; // skip ','
3988  continue;
3989  }
3990 
3991  KMP_ASSERT2(0, "bad explicit places list");
3992  }
3993 
3994  *out_numMasks = nextNewMask;
3995  if (nextNewMask == 0) {
3996  *out_masks = NULL;
3997  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
  KMP_CPU_FREE(tempMask); // also release the scratch masks on this early return
  KMP_CPU_FREE(previousMask);
3998  return;
3999  }
4000  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
4001  KMP_CPU_FREE(tempMask);
4002  KMP_CPU_FREE(previousMask);
4003  for (i = 0; i < nextNewMask; i++) {
4004  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
4005  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
4006  KMP_CPU_COPY(dest, src);
4007  }
4008  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4009 }
4010 
4011 #undef ADD_MASK
4012 #undef ADD_MASK_OSID
4013 
4014 // This function figures out the deepest level at which there is at least one
4015 // cluster/core with more than one processing unit bound to it.
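// Example (hypothetical 3-level socket/core/thread topology, bottom_level 2):
// any hardware thread whose thread id (ids[2]) is nonzero pushes core_level
// to 1, marking cores as the deepest units holding more than one processing
// unit; with one thread per core, core_level stays 0, i.e. the package level.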
4016 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
4017  int core_level = 0;
4018 
4019  for (int i = 0; i < nprocs; i++) {
4020  const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
4021  for (int j = bottom_level; j > 0; j--) {
4022  if (hw_thread.ids[j] > 0) {
4023  if (core_level < (j - 1)) {
4024  core_level = j - 1;
4025  }
4026  }
4027  }
4028  }
4029  return core_level;
4030 }
4031 
4032 // This function counts the number of clusters/cores at a given level.
4033 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
4034  int core_level) {
4035  return __kmp_topology->get_count(core_level);
4036 }
4037 // This function finds the cluster/core to which a given processing unit is bound.
4038 static int __kmp_affinity_find_core(int proc, int bottom_level,
4039  int core_level) {
4040  int core = 0;
4041  KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
4042  for (int i = 0; i <= proc; ++i) {
4043  if (i + 1 <= proc) {
4044  for (int j = 0; j <= core_level; ++j) {
4045  if (__kmp_topology->at(i + 1).sub_ids[j] !=
4046  __kmp_topology->at(i).sub_ids[j]) {
4047  core++;
4048  break;
4049  }
4050  }
4051  }
4052  }
4053  return core;
4054 }
4055 
4056 // This function finds the maximal number of processing units bound to a
4057 // cluster/core at a given level.
4058 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
4059  int core_level) {
4060  if (core_level >= bottom_level)
4061  return 1;
4062  int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
4063  return __kmp_topology->calculate_ratio(thread_level, core_level);
4064 }
4065 
4066 static int *procarr = NULL;
4067 static int __kmp_aff_depth = 0;
4068 static int *__kmp_osid_to_hwthread_map = NULL;
4069 
4070 static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
4071  kmp_affinity_ids_t &ids,
4072  kmp_affinity_attrs_t &attrs) {
4073  if (!KMP_AFFINITY_CAPABLE())
4074  return;
4075 
4076  // Initialize ids and attrs thread data
4077  for (int i = 0; i < KMP_HW_LAST; ++i)
4078  ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
4079  attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4080 
4081  // Iterate through each os id within the mask and determine
4082  // the topology id and attribute information
4083  int cpu;
4084  int depth = __kmp_topology->get_depth();
4085  KMP_CPU_SET_ITERATE(cpu, mask) {
4086  int osid_idx = __kmp_osid_to_hwthread_map[cpu];
4087  const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
4088  for (int level = 0; level < depth; ++level) {
4089  kmp_hw_t type = __kmp_topology->get_type(level);
4090  int id = hw_thread.sub_ids[level];
4091  if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
4092  ids[type] = id;
4093  } else {
4094  // This mask spans across multiple topology units, set it as such
4095  // and mark every level below as such as well.
4096  ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4097  for (; level < depth; ++level) {
4098  kmp_hw_t type = __kmp_topology->get_type(level);
4099  ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4100  }
4101  }
4102  }
4103  if (!attrs.valid) {
4104  attrs.core_type = hw_thread.attrs.get_core_type();
4105  attrs.core_eff = hw_thread.attrs.get_core_eff();
4106  attrs.valid = 1;
4107  } else {
4108  // This mask spans across multiple attributes, set it as such
4109  if (attrs.core_type != hw_thread.attrs.get_core_type())
4110  attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
4111  if (attrs.core_eff != hw_thread.attrs.get_core_eff())
4112  attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
4113  }
4114  }
4115 }
4116 
4117 static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
4118  if (!KMP_AFFINITY_CAPABLE())
4119  return;
4120  const kmp_affin_mask_t *mask = th->th.th_affin_mask;
4121  kmp_affinity_ids_t &ids = th->th.th_topology_ids;
4122  kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
4123  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4124 }
4125 
4126 // Assign the topology information to each place in the place list.
4127 // A thread can then grab not only its affinity mask, but also the topology
4128 // information associated with that mask, e.g., which socket the thread is on.
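// For example (hypothetical mask): a place covering OS procs that all sit on
// socket 0 gets that socket's id recorded at the KMP_HW_SOCKET slot, while a
// place spanning two sockets is marked MULTIPLE_ID at the socket level and at
// every level below it.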
4129 static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
4130  if (!KMP_AFFINITY_CAPABLE())
4131  return;
4132  if (affinity.type != affinity_none) {
4133  KMP_ASSERT(affinity.num_os_id_masks);
4134  KMP_ASSERT(affinity.os_id_masks);
4135  }
4136  KMP_ASSERT(affinity.num_masks);
4137  KMP_ASSERT(affinity.masks);
4138  KMP_ASSERT(__kmp_affin_fullMask);
4139 
4140  int max_cpu = __kmp_affin_fullMask->get_max_cpu();
4141  int num_hw_threads = __kmp_topology->get_num_hw_threads();
4142 
4143  // Allocate thread topology information
4144  if (!affinity.ids) {
4145  affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
4146  sizeof(kmp_affinity_ids_t) * affinity.num_masks);
4147  }
4148  if (!affinity.attrs) {
4149  affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
4150  sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
4151  }
4152  if (!__kmp_osid_to_hwthread_map) {
4153  // Want the +1 because max_cpu should be a valid index into the map
4154  __kmp_osid_to_hwthread_map =
4155  (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
4156  }
4157 
4158  // Create the OS proc to hardware thread map
4159  for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread)
4160  __kmp_osid_to_hwthread_map[__kmp_topology->at(hw_thread).os_id] = hw_thread;
4161 
4162  for (unsigned i = 0; i < affinity.num_masks; ++i) {
4163  kmp_affinity_ids_t &ids = affinity.ids[i];
4164  kmp_affinity_attrs_t &attrs = affinity.attrs[i];
4165  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
4166  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4167  }
4168 }
4169 
4170 // Create a one element mask array (set of places) which only contains the
4171 // initial process's affinity mask
4172 static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
4173  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4174  KMP_ASSERT(affinity.type == affinity_none);
4175  affinity.num_masks = 1;
4176  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4177  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
4178  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4179  __kmp_affinity_get_topology_info(affinity);
4180 }
4181 
4182 static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
4183  // Create the "full" mask - this defines all of the processors that we
4184  // consider to be in the machine model. If respect is set, then it is the
4185  // initialization thread's affinity mask. Otherwise, it is all processors that
4186  // we know about on the machine.
4187  int verbose = affinity.flags.verbose;
4188  const char *env_var = affinity.env_var;
4189 
4190  // Already initialized
4191  if (__kmp_affin_fullMask && __kmp_affin_origMask)
4192  return;
4193 
4194  if (__kmp_affin_fullMask == NULL) {
4195  KMP_CPU_ALLOC(__kmp_affin_fullMask);
4196  }
4197  if (__kmp_affin_origMask == NULL) {
4198  KMP_CPU_ALLOC(__kmp_affin_origMask);
4199  }
4200  if (KMP_AFFINITY_CAPABLE()) {
4201  __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4202  // Make a copy before possibly expanding to the entire machine mask
4203  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4204  if (affinity.flags.respect) {
4205  // Count the number of available processors.
4206  unsigned i;
4207  __kmp_avail_proc = 0;
4208  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4209  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4210  continue;
4211  }
4212  __kmp_avail_proc++;
4213  }
4214  if (__kmp_avail_proc > __kmp_xproc) {
4215  KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4216  affinity.type = affinity_none;
4217  KMP_AFFINITY_DISABLE();
4218  return;
4219  }
4220 
4221  if (verbose) {
4222  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4223  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4224  __kmp_affin_fullMask);
4225  KMP_INFORM(InitOSProcSetRespect, env_var, buf);
4226  }
4227  } else {
4228  if (verbose) {
4229  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4230  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4231  __kmp_affin_fullMask);
4232  KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
4233  }
4234  __kmp_avail_proc =
4235  __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4236 #if KMP_OS_WINDOWS
4237  if (__kmp_num_proc_groups <= 1) {
4238  // Copy expanded full mask if topology has single processor group
4239  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4240  }
4241  // Set the process affinity mask since threads' affinity
4242  // masks must be a subset of the process mask in Windows* OS
4243  __kmp_affin_fullMask->set_process_affinity(true);
4244 #endif
4245  }
4246  }
4247 }
4248 
4249 static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
4250  bool success = false;
4251  const char *env_var = affinity.env_var;
4252  kmp_i18n_id_t msg_id = kmp_i18n_null;
4253  int verbose = affinity.flags.verbose;
4254 
4255  // For backward compatibility, setting KMP_CPUINFO_FILE =>
4256  // KMP_TOPOLOGY_METHOD=cpuinfo
4257  if ((__kmp_cpuinfo_file != NULL) &&
4258  (__kmp_affinity_top_method == affinity_top_method_all)) {
4259  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4260  }
4261 
4262  if (__kmp_affinity_top_method == affinity_top_method_all) {
4263 // In the default code path, errors are not fatal - we just try using
4264 // another method. We only emit a warning message if affinity is on, or the
4265 // verbose flag is set, and the nowarnings flag was not set.
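// The fallback order tried below is: hwloc (when it is the active affinity
// API), then the x2APIC-id and legacy APIC-id methods on x86, then
// /proc/cpuinfo on Linux, then Windows processor groups, and finally the flat
// map, which is expected to always succeed.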
4266 #if KMP_USE_HWLOC
4267  if (!success &&
4268  __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4269  if (!__kmp_hwloc_error) {
4270  success = __kmp_affinity_create_hwloc_map(&msg_id);
4271  if (!success && verbose) {
4272  KMP_INFORM(AffIgnoringHwloc, env_var);
4273  }
4274  } else if (verbose) {
4275  KMP_INFORM(AffIgnoringHwloc, env_var);
4276  }
4277  }
4278 #endif
4279 
4280 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4281  if (!success) {
4282  success = __kmp_affinity_create_x2apicid_map(&msg_id);
4283  if (!success && verbose && msg_id != kmp_i18n_null) {
4284  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4285  }
4286  }
4287  if (!success) {
4288  success = __kmp_affinity_create_apicid_map(&msg_id);
4289  if (!success && verbose && msg_id != kmp_i18n_null) {
4290  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4291  }
4292  }
4293 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4294 
4295 #if KMP_OS_LINUX
4296  if (!success) {
4297  int line = 0;
4298  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4299  if (!success && verbose && msg_id != kmp_i18n_null) {
4300  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4301  }
4302  }
4303 #endif /* KMP_OS_LINUX */
4304 
4305 #if KMP_GROUP_AFFINITY
4306  if (!success && (__kmp_num_proc_groups > 1)) {
4307  success = __kmp_affinity_create_proc_group_map(&msg_id);
4308  if (!success && verbose && msg_id != kmp_i18n_null) {
4309  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4310  }
4311  }
4312 #endif /* KMP_GROUP_AFFINITY */
4313 
4314  if (!success) {
4315  success = __kmp_affinity_create_flat_map(&msg_id);
4316  if (!success && verbose && msg_id != kmp_i18n_null) {
4317  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4318  }
4319  KMP_ASSERT(success);
4320  }
4321  }
4322 
4323 // If the user has specified that a particular topology discovery method is to be
4324 // used, then we abort if that method fails. The exception is group affinity,
4325 // which might have been implicitly set.
4326 #if KMP_USE_HWLOC
4327  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4328  KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4329  success = __kmp_affinity_create_hwloc_map(&msg_id);
4330  if (!success) {
4331  KMP_ASSERT(msg_id != kmp_i18n_null);
4332  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4333  }
4334  }
4335 #endif // KMP_USE_HWLOC
4336 
4337 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4338  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
4339  __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
4340  success = __kmp_affinity_create_x2apicid_map(&msg_id);
4341  if (!success) {
4342  KMP_ASSERT(msg_id != kmp_i18n_null);
4343  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4344  }
4345  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4346  success = __kmp_affinity_create_apicid_map(&msg_id);
4347  if (!success) {
4348  KMP_ASSERT(msg_id != kmp_i18n_null);
4349  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4350  }
4351  }
4352 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4353 
4354  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4355  int line = 0;
4356  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4357  if (!success) {
4358  KMP_ASSERT(msg_id != kmp_i18n_null);
4359  const char *filename = __kmp_cpuinfo_get_filename();
4360  if (line > 0) {
4361  KMP_FATAL(FileLineMsgExiting, filename, line,
4362  __kmp_i18n_catgets(msg_id));
4363  } else {
4364  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4365  }
4366  }
4367  }
4368 
4369 #if KMP_GROUP_AFFINITY
4370  else if (__kmp_affinity_top_method == affinity_top_method_group) {
4371  success = __kmp_affinity_create_proc_group_map(&msg_id);
4373  if (!success) {
4374  KMP_ASSERT(msg_id != kmp_i18n_null);
4375  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4376  }
4377  }
4378 #endif /* KMP_GROUP_AFFINITY */
4379 
4380  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4381  success = __kmp_affinity_create_flat_map(&msg_id);
4382  // should not fail
4383  KMP_ASSERT(success);
4384  }
4385 
4386  // Early exit if topology could not be created
4387  if (!__kmp_topology) {
4388  if (KMP_AFFINITY_CAPABLE()) {
4389  KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4390  }
4391  if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
4392  __kmp_ncores > 0) {
4393  __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
4394  __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
4395  __kmp_nThreadsPerCore, __kmp_ncores);
4396  if (verbose) {
4397  __kmp_topology->print(env_var);
4398  }
4399  }
4400  return false;
4401  }
4402 
4403  // Canonicalize, print (if requested), apply KMP_HW_SUBSET
4404  __kmp_topology->canonicalize();
4405  if (verbose)
4406  __kmp_topology->print(env_var);
4407  bool filtered = __kmp_topology->filter_hw_subset();
4408  if (filtered) {
4409 #if KMP_OS_WINDOWS
4410  // Copy filtered full mask if topology has single processor group
4411  if (__kmp_num_proc_groups <= 1)
4412 #endif
4413  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4414  }
4415  if (filtered && verbose)
4416  __kmp_topology->print("KMP_HW_SUBSET");
4417  return success;
4418 }
4419 
4420 static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
4421  bool is_regular_affinity = (&affinity == &__kmp_affinity);
4422  bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
4423  const char *env_var = affinity.env_var;
4424 
4425  if (affinity.flags.initialized) {
4426  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4427  return;
4428  }
4429 
4430  if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
4431  __kmp_aux_affinity_initialize_masks(affinity);
4432 
4433  if (is_regular_affinity && !__kmp_topology) {
4434  bool success = __kmp_aux_affinity_initialize_topology(affinity);
4435  if (success) {
4436  // Initialize other data structures which depend on the topology
4437  machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
4438  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4439  } else {
4440  affinity.type = affinity_none;
4441  KMP_AFFINITY_DISABLE();
4442  }
4443  }
4444 
4445  // If KMP_AFFINITY=none, then only create the single "none" place, which is
4446  // either the process's initial affinity mask or the full set of hardware
4447  // threads, depending on respect/norespect.
4448  if (affinity.type == affinity_none) {
4449  __kmp_create_affinity_none_places(affinity);
4450 #if KMP_USE_HIER_SCHED
4451  __kmp_dispatch_set_hierarchy_values();
4452 #endif
4453  affinity.flags.initialized = TRUE;
4454  return;
4455  }
4456 
4457  __kmp_topology->set_granularity(affinity);
4458  int depth = __kmp_topology->get_depth();
4459 
4460  // Create the table of masks, indexed by thread Id.
4461  unsigned numUnique;
4462  __kmp_create_os_id_masks(&numUnique, affinity);
4463  if (affinity.gran_levels == 0) {
4464  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4465  }
4466 
4467  switch (affinity.type) {
4468 
4469  case affinity_explicit:
4470  KMP_DEBUG_ASSERT(affinity.proclist != NULL);
4471  if (is_hidden_helper_affinity ||
4472  __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
4473  __kmp_affinity_process_proclist(affinity);
4474  } else {
4475  __kmp_affinity_process_placelist(affinity);
4476  }
4477  if (affinity.num_masks == 0) {
4478  KMP_AFF_WARNING(affinity, AffNoValidProcID);
4479  affinity.type = affinity_none;
4480  __kmp_create_affinity_none_places(affinity);
4481  affinity.flags.initialized = TRUE;
4482  return;
4483  }
4484  break;
4485 
4486  // The other affinity types rely on sorting the hardware threads according to
4487  // some permutation of the machine topology tree. Set affinity.compact
4488  // and affinity.offset appropriately, then jump to a common code
4489  // fragment to do the sort and create the array of affinity masks.
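  // Roughly speaking (hypothetical 2 sockets x 4 cores x 2 threads, depth 3):
  // affinity_compact with compact == 0 keeps the natural socket/core/thread
  // sort order, so consecutive places fill one core, then one socket, before
  // moving on; affinity_scatter inverts the key (compact = depth - 1 - compact)
  // so consecutive places land on different sockets first.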
4490  case affinity_logical:
4491  affinity.compact = 0;
4492  if (affinity.offset) {
4493  affinity.offset =
4494  __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4495  }
4496  goto sortTopology;
4497 
4498  case affinity_physical:
4499  if (__kmp_nThreadsPerCore > 1) {
4500  affinity.compact = 1;
4501  if (affinity.compact >= depth) {
4502  affinity.compact = 0;
4503  }
4504  } else {
4505  affinity.compact = 0;
4506  }
4507  if (affinity.offset) {
4508  affinity.offset =
4509  __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4510  }
4511  goto sortTopology;
4512 
4513  case affinity_scatter:
4514  if (affinity.compact >= depth) {
4515  affinity.compact = 0;
4516  } else {
4517  affinity.compact = depth - 1 - affinity.compact;
4518  }
4519  goto sortTopology;
4520 
4521  case affinity_compact:
4522  if (affinity.compact >= depth) {
4523  affinity.compact = depth - 1;
4524  }
4525  goto sortTopology;
4526 
4527  case affinity_balanced:
4528  if (depth <= 1 || is_hidden_helper_affinity) {
4529  KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4530  affinity.type = affinity_none;
4531  __kmp_create_affinity_none_places(affinity);
4532  affinity.flags.initialized = TRUE;
4533  return;
4534  } else if (!__kmp_topology->is_uniform()) {
4535  // Save the depth for further usage
4536  __kmp_aff_depth = depth;
4537 
4538  int core_level =
4539  __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
4540  int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
4541  core_level);
4542  int maxprocpercore = __kmp_affinity_max_proc_per_core(
4543  __kmp_avail_proc, depth - 1, core_level);
4544 
4545  int nproc = ncores * maxprocpercore;
4546  if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4547  KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4548  affinity.type = affinity_none;
4549  __kmp_create_affinity_none_places(affinity);
4550  affinity.flags.initialized = TRUE;
4551  return;
4552  }
4553 
4554  procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4555  for (int i = 0; i < nproc; i++) {
4556  procarr[i] = -1;
4557  }
4558 
4559  int lastcore = -1;
4560  int inlastcore = 0;
4561  for (int i = 0; i < __kmp_avail_proc; i++) {
4562  int proc = __kmp_topology->at(i).os_id;
4563  int core = __kmp_affinity_find_core(i, depth - 1, core_level);
4564 
4565  if (core == lastcore) {
4566  inlastcore++;
4567  } else {
4568  inlastcore = 0;
4569  }
4570  lastcore = core;
4571 
4572  procarr[core * maxprocpercore + inlastcore] = proc;
4573  }
4574  }
4575  if (affinity.compact >= depth) {
4576  affinity.compact = depth - 1;
4577  }
4578 
4579  sortTopology:
4580  // Allocate the gtid->affinity mask table.
4581  if (affinity.flags.dups) {
4582  affinity.num_masks = __kmp_avail_proc;
4583  } else {
4584  affinity.num_masks = numUnique;
4585  }
4586 
4587  if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4588  (__kmp_affinity_num_places > 0) &&
4589  ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
4590  !is_hidden_helper_affinity) {
4591  affinity.num_masks = __kmp_affinity_num_places;
4592  }
4593 
4594  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4595 
4596  // Sort the topology table according to the current setting of
4597  // affinity.compact, then fill out affinity.masks.
4598  __kmp_topology->sort_compact(affinity);
4599  {
4600  int i;
4601  unsigned j;
4602  int num_hw_threads = __kmp_topology->get_num_hw_threads();
4603  for (i = 0, j = 0; i < num_hw_threads; i++) {
4604  if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
4605  continue;
4606  }
4607  int osId = __kmp_topology->at(i).os_id;
4608 
4609  kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
4610  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
4611  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4612  KMP_CPU_COPY(dest, src);
4613  if (++j >= affinity.num_masks) {
4614  break;
4615  }
4616  }
4617  KMP_DEBUG_ASSERT(j == affinity.num_masks);
4618  }
4619  // Sort the topology back using ids
4620  __kmp_topology->sort_ids();
4621  break;
4622 
4623  default:
4624  KMP_ASSERT2(0, "Unexpected affinity setting");
4625  }
4626  __kmp_affinity_get_topology_info(affinity);
4627  affinity.flags.initialized = TRUE;
4628 }
4629 
4630 void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
4631  // Much of the code above was written assuming that if a machine was not
4632  // affinity capable, then affinity type == affinity_none.
4633  // We now explicitly represent this as affinity type == affinity_disabled.
4634  // There are too many checks for affinity type == affinity_none in this code.
4635  // Instead of trying to change them all, check if
4636  // affinity type == affinity_disabled, and if so, slam it with affinity_none,
4637  // call the real initialization routine, then restore affinity type to
4638  // affinity_disabled.
4639  int disabled = (affinity.type == affinity_disabled);
4640  if (!KMP_AFFINITY_CAPABLE())
4641  KMP_ASSERT(disabled);
4642  if (disabled)
4643  affinity.type = affinity_none;
4644  __kmp_aux_affinity_initialize(affinity);
4645  if (disabled)
4646  affinity.type = affinity_disabled;
4647 }
4648 
4649 void __kmp_affinity_uninitialize(void) {
4650  for (kmp_affinity_t *affinity : __kmp_affinities) {
4651  if (affinity->masks != NULL)
4652  KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
4653  if (affinity->os_id_masks != NULL)
4654  KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
4655  if (affinity->proclist != NULL)
4656  __kmp_free(affinity->proclist);
4657  if (affinity->ids != NULL)
4658  __kmp_free(affinity->ids);
4659  if (affinity->attrs != NULL)
4660  __kmp_free(affinity->attrs);
4661  *affinity = KMP_AFFINITY_INIT(affinity->env_var);
4662  }
4663  if (__kmp_affin_origMask != NULL) {
4664  if (KMP_AFFINITY_CAPABLE()) {
4665  __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
4666  }
4667  KMP_CPU_FREE(__kmp_affin_origMask);
4668  __kmp_affin_origMask = NULL;
4669  }
4670  __kmp_affinity_num_places = 0;
4671  if (procarr != NULL) {
4672  __kmp_free(procarr);
4673  procarr = NULL;
4674  }
4675  if (__kmp_osid_to_hwthread_map) {
4676  __kmp_free(__kmp_osid_to_hwthread_map);
4677  __kmp_osid_to_hwthread_map = NULL;
4678  }
4679 #if KMP_USE_HWLOC
4680  if (__kmp_hwloc_topology != NULL) {
4681  hwloc_topology_destroy(__kmp_hwloc_topology);
4682  __kmp_hwloc_topology = NULL;
4683  }
4684 #endif
4685  if (__kmp_hw_subset) {
4686  kmp_hw_subset_t::deallocate(__kmp_hw_subset);
4687  __kmp_hw_subset = nullptr;
4688  }
4689  if (__kmp_topology) {
4690  kmp_topology_t::deallocate(__kmp_topology);
4691  __kmp_topology = nullptr;
4692  }
4693  KMPAffinity::destroy_api();
4694 }
4695 
4696 static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
4697  int *place, kmp_affin_mask_t **mask) {
4698  int mask_idx;
4699  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4700  if (is_hidden_helper)
4701  // The first gtid is the regular primary thread, the second gtid is the main
4702  // thread of the hidden team, which does not participate in task execution.
4703  mask_idx = gtid - 2;
4704  else
4705  mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
4706  KMP_DEBUG_ASSERT(affinity->num_masks > 0);
4707  *place = (mask_idx + affinity->offset) % affinity->num_masks;
4708  *mask = KMP_CPU_INDEX(affinity->masks, *place);
4709 }
4710 
4711 // This function initializes the per-thread data concerning affinity including
4712 // the mask and topology information
4713 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4714 
4715  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4716 
4717  // Set the thread topology information to default of unknown
4718  for (int id = 0; id < KMP_HW_LAST; ++id)
4719  th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
4720  th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4721 
4722  if (!KMP_AFFINITY_CAPABLE()) {
4723  return;
4724  }
4725 
4726  if (th->th.th_affin_mask == NULL) {
4727  KMP_CPU_ALLOC(th->th.th_affin_mask);
4728  } else {
4729  KMP_CPU_ZERO(th->th.th_affin_mask);
4730  }
4731 
4732  // Copy the thread mask to the kmp_info_t structure. If
4733  // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e.
4734  // one that has all of the OS proc ids set, or if
4735  // __kmp_affinity.flags.respect is set, then the full mask is the
4736  // same as the mask of the initialization thread.
4737  kmp_affin_mask_t *mask;
4738  int i;
4739  const kmp_affinity_t *affinity;
4740  const char *env_var;
4741  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4742 
4743  if (is_hidden_helper)
4744  affinity = &__kmp_hh_affinity;
4745  else
4746  affinity = &__kmp_affinity;
4747  env_var = affinity->env_var;
4748 
4749  if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
4750  if ((affinity->type == affinity_none) ||
4751  (affinity->type == affinity_balanced) ||
4752  KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
4753 #if KMP_GROUP_AFFINITY
4754  if (__kmp_num_proc_groups > 1) {
4755  return;
4756  }
4757 #endif
4758  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4759  i = 0;
4760  mask = __kmp_affin_fullMask;
4761  } else {
4762  __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
4763  }
4764  } else {
4765  if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
4766 #if KMP_GROUP_AFFINITY
4767  if (__kmp_num_proc_groups > 1) {
4768  return;
4769  }
4770 #endif
4771  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4772  i = KMP_PLACE_ALL;
4773  mask = __kmp_affin_fullMask;
4774  } else {
4775  __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
4776  }
4777  }
4778 
4779  th->th.th_current_place = i;
4780  if (isa_root && !is_hidden_helper) {
4781  th->th.th_new_place = i;
4782  th->th.th_first_place = 0;
4783  th->th.th_last_place = affinity->num_masks - 1;
4784  } else if (KMP_AFFINITY_NON_PROC_BIND) {
4785  // When using a Non-OMP_PROC_BIND affinity method,
4786  // set all threads' place-partition-var to the entire place list
4787  th->th.th_first_place = 0;
4788  th->th.th_last_place = affinity->num_masks - 1;
4789  }
4790  // Copy topology information associated with the place
4791  if (i >= 0) {
4792  th->th.th_topology_ids = __kmp_affinity.ids[i];
4793  th->th.th_topology_attrs = __kmp_affinity.attrs[i];
4794  }
4795 
4796  if (i == KMP_PLACE_ALL) {
4797  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4798  gtid));
4799  } else {
4800  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4801  gtid, i));
4802  }
4803 
4804  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4805 
4806  /* to avoid duplicate printing (will be correctly printed on barrier) */
4807  if (affinity->flags.verbose &&
4808  (affinity->type == affinity_none ||
4809  (i != KMP_PLACE_ALL && affinity->type != affinity_balanced)) &&
4810  !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
4811  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4812  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4813  th->th.th_affin_mask);
4814  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
4815  gtid, buf);
4816  }
4817 
4818 #if KMP_OS_WINDOWS
4819  // On Windows* OS, the process affinity mask might have changed. If the user
4820  // didn't request affinity and this call fails, just continue silently.
4821  // See CQ171393.
4822  if (affinity->type == affinity_none) {
4823  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4824  } else
4825 #endif
4826  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4827 }
4828 
4829 void __kmp_affinity_set_place(int gtid) {
4830  // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
4831  if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
4832  return;
4833  }
4834 
4835  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4836 
4837  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
4838  "place = %d)\n",
4839  gtid, th->th.th_new_place, th->th.th_current_place));
4840 
4841  // Check that the new place is within this thread's partition.
4842  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4843  KMP_ASSERT(th->th.th_new_place >= 0);
4844  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
4845  if (th->th.th_first_place <= th->th.th_last_place) {
4846  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4847  (th->th.th_new_place <= th->th.th_last_place));
4848  } else {
4849  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4850  (th->th.th_new_place >= th->th.th_last_place));
4851  }
4852 
4853  // Copy the thread mask to the kmp_info_t structure,
4854  // and set this thread's affinity.
4855  kmp_affin_mask_t *mask =
4856  KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
4857  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4858  th->th.th_current_place = th->th.th_new_place;
4859  // Copy topology information associated with the place
4860  th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4861  th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4862 
4863  if (__kmp_affinity.flags.verbose) {
4864  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4865  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4866  th->th.th_affin_mask);
4867  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4868  __kmp_gettid(), gtid, buf);
4869  }
4870  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4871 }
4872 
4873 int __kmp_aux_set_affinity(void **mask) {
4874  int gtid;
4875  kmp_info_t *th;
4876  int retval;
4877 
4878  if (!KMP_AFFINITY_CAPABLE()) {
4879  return -1;
4880  }
4881 
4882  gtid = __kmp_entry_gtid();
4883  KA_TRACE(
4884  1000, (""); {
4885  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4886  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4887  (kmp_affin_mask_t *)(*mask));
4888  __kmp_debug_printf(
4889  "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4890  gtid, buf);
4891  });
4892 
4893  if (__kmp_env_consistency_check) {
4894  if ((mask == NULL) || (*mask == NULL)) {
4895  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4896  } else {
4897  unsigned proc;
4898  int num_procs = 0;
4899 
4900  KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4901  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4902  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4903  }
4904  if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4905  continue;
4906  }
4907  num_procs++;
4908  }
4909  if (num_procs == 0) {
4910  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4911  }
4912 
4913 #if KMP_GROUP_AFFINITY
4914  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4915  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4916  }
4917 #endif /* KMP_GROUP_AFFINITY */
4918  }
4919  }
4920 
4921  th = __kmp_threads[gtid];
4922  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4923  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4924  if (retval == 0) {
4925  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4926  }
4927 
4928  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4929  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4930  th->th.th_first_place = 0;
4931  th->th.th_last_place = __kmp_affinity.num_masks - 1;
4932 
4933  // Turn off OpenMP 4.0 affinity for the current thread at this parallel level.
4934  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4935 
4936  return retval;
4937 }
4938 
4939 int __kmp_aux_get_affinity(void **mask) {
4940  int gtid;
4941  int retval;
4942 #if KMP_OS_WINDOWS || KMP_DEBUG
4943  kmp_info_t *th;
4944 #endif
4945  if (!KMP_AFFINITY_CAPABLE()) {
4946  return -1;
4947  }
4948 
4949  gtid = __kmp_entry_gtid();
4950 #if KMP_OS_WINDOWS || KMP_DEBUG
4951  th = __kmp_threads[gtid];
4952 #else
4953  (void)gtid; // unused variable
4954 #endif
4955  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4956 
4957  KA_TRACE(
4958  1000, (""); {
4959  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4960  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4961  th->th.th_affin_mask);
4962  __kmp_printf(
4963  "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
4964  buf);
4965  });
4966 
4967  if (__kmp_env_consistency_check) {
4968  if ((mask == NULL) || (*mask == NULL)) {
4969  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4970  }
4971  }
4972 
4973 #if !KMP_OS_WINDOWS
4974 
4975  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4976  KA_TRACE(
4977  1000, (""); {
4978  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4979  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4980  (kmp_affin_mask_t *)(*mask));
4981  __kmp_printf(
4982  "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
4983  buf);
4984  });
4985  return retval;
4986 
4987 #else
4988  (void)retval;
4989 
4990  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4991  return 0;
4992 
4993 #endif /* KMP_OS_WINDOWS */
4994 }
4995 
4996 int __kmp_aux_get_affinity_max_proc() {
4997  if (!KMP_AFFINITY_CAPABLE()) {
4998  return 0;
4999  }
5000 #if KMP_GROUP_AFFINITY
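// Editor's note (illustrative arithmetic, hypothetical machine): on 64-bit
// Windows with, say, 2 processor groups, the branch below reports
// 2 * sizeof(DWORD_PTR) * CHAR_BIT = 2 * 8 * 8 = 128, i.e. the number of
// mask bit positions spanned by the groups rather than the count of
// installed logical processors.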
5001  if (__kmp_num_proc_groups > 1) {
5002  return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
5003  }
5004 #endif
5005  return __kmp_xproc;
5006 }
5007 
5008 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
5009  if (!KMP_AFFINITY_CAPABLE()) {
5010  return -1;
5011  }
5012 
5013  KA_TRACE(
5014  1000, (""); {
5015  int gtid = __kmp_entry_gtid();
5016  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5017  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5018  (kmp_affin_mask_t *)(*mask));
5019  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
5020  "affinity mask for thread %d = %s\n",
5021  proc, gtid, buf);
5022  });
5023 
5024  if (__kmp_env_consistency_check) {
5025  if ((mask == NULL) || (*mask == NULL)) {
5026  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
5027  }
5028  }
5029 
5030  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5031  return -1;
5032  }
5033  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5034  return -2;
5035  }
5036 
5037  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
5038  return 0;
5039 }
5040 
5041 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
5042  if (!KMP_AFFINITY_CAPABLE()) {
5043  return -1;
5044  }
5045 
5046  KA_TRACE(
5047  1000, (""); {
5048  int gtid = __kmp_entry_gtid();
5049  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5050  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5051  (kmp_affin_mask_t *)(*mask));
5052  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
5053  "affinity mask for thread %d = %s\n",
5054  proc, gtid, buf);
5055  });
5056 
5057  if (__kmp_env_consistency_check) {
5058  if ((mask == NULL) || (*mask == NULL)) {
5059  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
5060  }
5061  }
5062 
5063  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5064  return -1;
5065  }
5066  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5067  return -2;
5068  }
5069 
5070  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5071  return 0;
5072 }
5073 
5074 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5075  if (!KMP_AFFINITY_CAPABLE()) {
5076  return -1;
5077  }
5078 
5079  KA_TRACE(
5080  1000, (""); {
5081  int gtid = __kmp_entry_gtid();
5082  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5083  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5084  (kmp_affin_mask_t *)(*mask));
5085  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5086  "affinity mask for thread %d = %s\n",
5087  proc, gtid, buf);
5088  });
5089 
5090  if (__kmp_env_consistency_check) {
5091  if ((mask == NULL) || (*mask == NULL)) {
5092  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5093  }
5094  }
5095 
5096  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5097  return -1;
5098  }
5099  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5100  return 0;
5101  }
5102 
5103  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5104 }
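// Editor's note: a minimal user-side sketch of how the helpers above are
// reached through the public kmp_* affinity extensions (assuming the
// declarations in omp.h; error handling elided, and binding to OS proc 0
// is only an example):
//
//   #include <omp.h>
//
//   kmp_affinity_mask_t mask;
//   kmp_create_affinity_mask(&mask);       // start from an empty mask
//   kmp_set_affinity_mask_proc(0, &mask);  // 0 on success, -1/-2 on error
//   if (kmp_set_affinity(&mask) != 0) {
//     // affinity not capable, or the mask was rejected
//   }
//   kmp_get_affinity(&mask);               // read back the bound mask
//   kmp_destroy_affinity_mask(&mask);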
5105 
5106 // Dynamic affinity settings - Affinity balanced
5107 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
5108  KMP_DEBUG_ASSERT(th);
5109  bool fine_gran = true;
5110  int tid = th->th.th_info.ds.ds_tid;
5111  const char *env_var = "KMP_AFFINITY";
5112 
5113  // Do not perform balanced affinity for the hidden helper threads
5114  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
5115  return;
5116 
5117  switch (__kmp_affinity.gran) {
5118  case KMP_HW_THREAD:
5119  break;
5120  case KMP_HW_CORE:
5121  if (__kmp_nThreadsPerCore > 1) {
5122  fine_gran = false;
5123  }
5124  break;
5125  case KMP_HW_SOCKET:
5126  if (nCoresPerPkg > 1) {
5127  fine_gran = false;
5128  }
5129  break;
5130  default:
5131  fine_gran = false;
5132  }
5133 
5134  if (__kmp_topology->is_uniform()) {
5135  int coreID;
5136  int threadID;
5137  // Number of hyper threads per core on an HT machine
5138  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5139  // Number of cores
5140  int ncores = __kmp_ncores;
5141  if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5142  __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5143  ncores = nPackages;
5144  }
5145  // How many threads will be bound to each core
5146  int chunk = nthreads / ncores;
5147  // How many cores will have an additional thread bound to them ("big cores")
5148  int big_cores = nthreads % ncores;
5149  // Number of threads on the big cores
5150  int big_nth = (chunk + 1) * big_cores;
5151  if (tid < big_nth) {
5152  coreID = tid / (chunk + 1);
5153  threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5154  } else { // tid >= big_nth
5155  coreID = (tid - big_cores) / chunk;
5156  threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5157  }
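// Editor's note (worked example with hypothetical numbers): for
// nthreads = 10, ncores = 4 and __kmp_nth_per_core = 2, chunk = 2,
// big_cores = 2 and big_nth = 6. Threads 0-5 map to the two "big" cores
// (coreID = tid / 3), threads 6-9 to the remaining cores
// (coreID = (tid - 2) / 2), and threadID selects the hardware thread
// within the core modulo __kmp_nth_per_core.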
5158  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5159  "Illegal set affinity operation when not capable");
5160 
5161  kmp_affin_mask_t *mask = th->th.th_affin_mask;
5162  KMP_CPU_ZERO(mask);
5163 
5164  if (fine_gran) {
5165  int osID =
5166  __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
5167  KMP_CPU_SET(osID, mask);
5168  } else {
5169  for (int i = 0; i < __kmp_nth_per_core; i++) {
5170  int osID;
5171  osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
5172  KMP_CPU_SET(osID, mask);
5173  }
5174  }
5175  if (__kmp_affinity.flags.verbose) {
5176  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5177  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5178  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5179  tid, buf);
5180  }
5181  __kmp_affinity_get_thread_topology_info(th);
5182  __kmp_set_system_affinity(mask, TRUE);
5183  } else { // Non-uniform topology
5184 
5185  kmp_affin_mask_t *mask = th->th.th_affin_mask;
5186  KMP_CPU_ZERO(mask);
5187 
5188  int core_level =
5189  __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
5190  int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
5191  __kmp_aff_depth - 1, core_level);
5192  int nth_per_core = __kmp_affinity_max_proc_per_core(
5193  __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5194 
5195  // For performance gain consider the special case nthreads ==
5196  // __kmp_avail_proc
5197  if (nthreads == __kmp_avail_proc) {
5198  if (fine_gran) {
5199  int osID = __kmp_topology->at(tid).os_id;
5200  KMP_CPU_SET(osID, mask);
5201  } else {
5202  int core =
5203  __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
5204  for (int i = 0; i < __kmp_avail_proc; i++) {
5205  int osID = __kmp_topology->at(i).os_id;
5206  if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
5207  core) {
5208  KMP_CPU_SET(osID, mask);
5209  }
5210  }
5211  }
5212  } else if (nthreads <= ncores) {
5213 
5214  int core = 0;
5215  for (int i = 0; i < ncores; i++) {
5216  // Check if this core from procarr[] is in the mask
5217  int in_mask = 0;
5218  for (int j = 0; j < nth_per_core; j++) {
5219  if (procarr[i * nth_per_core + j] != -1) {
5220  in_mask = 1;
5221  break;
5222  }
5223  }
5224  if (in_mask) {
5225  if (tid == core) {
5226  for (int j = 0; j < nth_per_core; j++) {
5227  int osID = procarr[i * nth_per_core + j];
5228  if (osID != -1) {
5229  KMP_CPU_SET(osID, mask);
5230  // For fine granularity it is enough to set the first available
5231  // osID for this core
5232  if (fine_gran) {
5233  break;
5234  }
5235  }
5236  }
5237  break;
5238  } else {
5239  core++;
5240  }
5241  }
5242  }
5243  } else { // nthreads > ncores
5244  // Array to save the number of processors at each core
5245  int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5246  // Array to save the number of cores with "x" available processors
5247  int *ncores_with_x_procs =
5248  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5249  // Array to save the number of cores with # procs from x to nth_per_core
5250  int *ncores_with_x_to_max_procs =
5251  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5252 
5253  for (int i = 0; i <= nth_per_core; i++) {
5254  ncores_with_x_procs[i] = 0;
5255  ncores_with_x_to_max_procs[i] = 0;
5256  }
5257 
5258  for (int i = 0; i < ncores; i++) {
5259  int cnt = 0;
5260  for (int j = 0; j < nth_per_core; j++) {
5261  if (procarr[i * nth_per_core + j] != -1) {
5262  cnt++;
5263  }
5264  }
5265  nproc_at_core[i] = cnt;
5266  ncores_with_x_procs[cnt]++;
5267  }
5268 
5269  for (int i = 0; i <= nth_per_core; i++) {
5270  for (int j = i; j <= nth_per_core; j++) {
5271  ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5272  }
5273  }
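// Editor's note (worked example with hypothetical numbers): with
// nth_per_core = 2 and four cores exposing {2, 1, 2, 0} usable procs in
// procarr[], nproc_at_core is {2, 1, 2, 0}, ncores_with_x_procs becomes
// {1, 1, 2} (one core with 0 procs, one with 1, two with 2), and
// ncores_with_x_to_max_procs becomes {4, 3, 2} (cores with at least 0, 1
// and 2 procs, respectively).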
5274 
5275  // Max number of processors
5276  int nproc = nth_per_core * ncores;
5277  // An array to keep the number of threads for each context
5278  int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5279  for (int i = 0; i < nproc; i++) {
5280  newarr[i] = 0;
5281  }
5282 
5283  int nth = nthreads;
5284  int flag = 0;
5285  while (nth > 0) {
5286  for (int j = 1; j <= nth_per_core; j++) {
5287  int cnt = ncores_with_x_to_max_procs[j];
5288  for (int i = 0; i < ncores; i++) {
5289  // Skip cores with 0 processors
5290  if (nproc_at_core[i] == 0) {
5291  continue;
5292  }
5293  for (int k = 0; k < nth_per_core; k++) {
5294  if (procarr[i * nth_per_core + k] != -1) {
5295  if (newarr[i * nth_per_core + k] == 0) {
5296  newarr[i * nth_per_core + k] = 1;
5297  cnt--;
5298  nth--;
5299  break;
5300  } else {
5301  if (flag != 0) {
5302  newarr[i * nth_per_core + k]++;
5303  cnt--;
5304  nth--;
5305  break;
5306  }
5307  }
5308  }
5309  }
5310  if (cnt == 0 || nth == 0) {
5311  break;
5312  }
5313  }
5314  if (nth == 0) {
5315  break;
5316  }
5317  }
5318  flag = 1;
5319  }
5320  int sum = 0;
5321  for (int i = 0; i < nproc; i++) {
5322  sum += newarr[i];
5323  if (sum > tid) {
5324  if (fine_gran) {
5325  int osID = procarr[i];
5326  KMP_CPU_SET(osID, mask);
5327  } else {
5328  int coreID = i / nth_per_core;
5329  for (int ii = 0; ii < nth_per_core; ii++) {
5330  int osID = procarr[coreID * nth_per_core + ii];
5331  if (osID != -1) {
5332  KMP_CPU_SET(osID, mask);
5333  }
5334  }
5335  }
5336  break;
5337  }
5338  }
5339  __kmp_free(newarr);
5340  }
5341 
5342  if (__kmp_affinity.flags.verbose) {
5343  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5344  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5345  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5346  tid, buf);
5347  }
5348  __kmp_affinity_get_thread_topology_info(th);
5349  __kmp_set_system_affinity(mask, TRUE);
5350  }
5351 }
5352 
5353 #if KMP_OS_LINUX || KMP_OS_FREEBSD
5354 // We don't need this entry for Windows because
5355 // the GetProcessAffinityMask() API is available there.
5356 //
5357 // The intended usage is indicated by these steps:
5358 // 1) The user gets the current affinity mask
5359 // 2) Then sets the affinity by calling this function
5360 // 3) Error check the return value
5361 // 4) Use non-OpenMP parallelization
5362 // 5) Reset the affinity to what was stored in step 1)
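// A user-side sketch of these steps (editor's illustration; assumes
// Linux/glibc with _GNU_SOURCE so that pthread_getaffinity_np() and
// pthread_setaffinity_np() are available, an extern "C" declaration of
// kmp_set_thread_affinity_mask_initial() since it may not be in omp.h, and
// a hypothetical do_non_openmp_work() placeholder):
//
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // 1)
//   int rc = kmp_set_thread_affinity_mask_initial();               // 2)
//   if (rc != 0) { /* 3) -1: cannot bind, >0: errno from binding */ }
//   do_non_openmp_work();                                          // 4)
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // 5)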
5363 #ifdef __cplusplus
5364 extern "C"
5365 #endif
5366  int
5367  kmp_set_thread_affinity_mask_initial()
5368 // The function returns 0 on success,
5369 // -1 if the thread cannot be bound, or
5370 // >0 (an errno value) if an error happened during binding
5371 {
5372  int gtid = __kmp_get_gtid();
5373  if (gtid < 0) {
5374  // Do not touch non-omp threads
5375  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5376  "non-omp thread, returning\n"));
5377  return -1;
5378  }
5379  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5380  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5381  "affinity not initialized, returning\n"));
5382  return -1;
5383  }
5384  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5385  "set full mask for thread %d\n",
5386  gtid));
5387  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5388  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5389 }
5390 #endif
5391 
5392 #endif // KMP_AFFINITY_SUPPORTED