LLVM OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 #include "kmp_str.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
21 #endif
22 #if KMP_USE_HWLOC
23 // Copied from hwloc
24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25 #define HWLOC_GROUP_KIND_INTEL_TILE 103
26 #define HWLOC_GROUP_KIND_INTEL_DIE 104
27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28 #endif
29 #include <ctype.h>
30 
31 // The machine topology
32 kmp_topology_t *__kmp_topology = nullptr;
33 // KMP_HW_SUBSET environment variable
34 kmp_hw_subset_t *__kmp_hw_subset = nullptr;
35 
36 // Store the real or imagined machine hierarchy here
37 static hierarchy_info machine_hierarchy;
38 
39 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
40 
41 #if KMP_AFFINITY_SUPPORTED
42 // Helper class to see if place lists further restrict the fullMask
43 class kmp_full_mask_modifier_t {
44  kmp_affin_mask_t *mask;
45 
46 public:
47  kmp_full_mask_modifier_t() {
48  KMP_CPU_ALLOC(mask);
49  KMP_CPU_ZERO(mask);
50  }
51  ~kmp_full_mask_modifier_t() {
52  KMP_CPU_FREE(mask);
53  mask = nullptr;
54  }
55  void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
56  // If the new full mask is different from the current full mask,
57  // then switch them. Returns true if full mask was affected, false otherwise.
58  bool restrict_to_mask() {
59  // See if the new mask further restricts or changes the full mask
60  if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
61  return false;
62  return __kmp_topology->restrict_to_mask(mask);
63  }
64 };
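// Typical usage (sketch): default-construct, call include() with each place's
// mask, then call restrict_to_mask(); the full mask is only replaced when the
// accumulated union is non-empty and differs from the current
// __kmp_affin_fullMask.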
65 
66 static inline const char *
67 __kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
68  bool for_binding = false) {
69  if (affinity.flags.omp_places) {
70  if (for_binding)
71  return "OMP_PROC_BIND";
72  return "OMP_PLACES";
73  }
74  return affinity.env_var;
75 }
76 #endif // KMP_AFFINITY_SUPPORTED
77 
78 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
79  kmp_uint32 depth;
80  // The test below is true if affinity is available, but set to "none". Need to
81  // init on first use of hierarchical barrier.
82  if (TCR_1(machine_hierarchy.uninitialized))
83  machine_hierarchy.init(nproc);
84 
85  // Adjust the hierarchy in case num threads exceeds original
86  if (nproc > machine_hierarchy.base_num_threads)
87  machine_hierarchy.resize(nproc);
88 
89  depth = machine_hierarchy.depth;
90  KMP_DEBUG_ASSERT(depth > 0);
91 
92  thr_bar->depth = depth;
93  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
94  &(thr_bar->base_leaf_kids));
95  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
96 }
97 
98 static int nCoresPerPkg, nPackages;
99 static int __kmp_nThreadsPerCore;
100 #ifndef KMP_DFLT_NTH_CORES
101 static int __kmp_ncores;
102 #endif
103 
104 const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
105  switch (type) {
106  case KMP_HW_SOCKET:
107  return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
108  case KMP_HW_DIE:
109  return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
110  case KMP_HW_MODULE:
111  return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
112  case KMP_HW_TILE:
113  return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
114  case KMP_HW_NUMA:
115  return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
116  case KMP_HW_L3:
117  return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
118  case KMP_HW_L2:
119  return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
120  case KMP_HW_L1:
121  return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
122  case KMP_HW_LLC:
123  return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
124  case KMP_HW_CORE:
125  return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
126  case KMP_HW_THREAD:
127  return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
128  case KMP_HW_PROC_GROUP:
129  return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
130  }
131  return KMP_I18N_STR(Unknown);
132 }
133 
134 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
135  switch (type) {
136  case KMP_HW_SOCKET:
137  return ((plural) ? "sockets" : "socket");
138  case KMP_HW_DIE:
139  return ((plural) ? "dice" : "die");
140  case KMP_HW_MODULE:
141  return ((plural) ? "modules" : "module");
142  case KMP_HW_TILE:
143  return ((plural) ? "tiles" : "tile");
144  case KMP_HW_NUMA:
145  return ((plural) ? "numa_domains" : "numa_domain");
146  case KMP_HW_L3:
147  return ((plural) ? "l3_caches" : "l3_cache");
148  case KMP_HW_L2:
149  return ((plural) ? "l2_caches" : "l2_cache");
150  case KMP_HW_L1:
151  return ((plural) ? "l1_caches" : "l1_cache");
152  case KMP_HW_LLC:
153  return ((plural) ? "ll_caches" : "ll_cache");
154  case KMP_HW_CORE:
155  return ((plural) ? "cores" : "core");
156  case KMP_HW_THREAD:
157  return ((plural) ? "threads" : "thread");
158  case KMP_HW_PROC_GROUP:
159  return ((plural) ? "proc_groups" : "proc_group");
160  }
161  return ((plural) ? "unknowns" : "unknown");
162 }
163 
164 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
165  switch (type) {
166  case KMP_HW_CORE_TYPE_UNKNOWN:
167  return "unknown";
168 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
169  case KMP_HW_CORE_TYPE_ATOM:
170  return "Intel Atom(R) processor";
171  case KMP_HW_CORE_TYPE_CORE:
172  return "Intel(R) Core(TM) processor";
173 #endif
174  }
175  return "unknown";
176 }
177 
178 #if KMP_AFFINITY_SUPPORTED
179 // If affinity is supported, check the affinity
180 // verbose and warning flags before printing warning
181 #define KMP_AFF_WARNING(s, ...) \
182  if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \
183  KMP_WARNING(__VA_ARGS__); \
184  }
185 #else
186 #define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
187 #endif
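// Example (as used in filter_hw_subset below):
//   KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
// which only prints when verbose or warnings are enabled and the affinity
// type is not affinity_none.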
188 
190 // kmp_hw_thread_t methods
191 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
192  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
193  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
194  int depth = __kmp_topology->get_depth();
195  for (int level = 0; level < depth; ++level) {
196  if (ahwthread->ids[level] < bhwthread->ids[level])
197  return -1;
198  else if (ahwthread->ids[level] > bhwthread->ids[level])
199  return 1;
200  }
201  if (ahwthread->os_id < bhwthread->os_id)
202  return -1;
203  else if (ahwthread->os_id > bhwthread->os_id)
204  return 1;
205  return 0;
206 }
207 
208 #if KMP_AFFINITY_SUPPORTED
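// Comparator for sorting hardware threads by sub IDs under the 'compact'
// setting: the innermost 'compact' levels are the primary sort keys (deepest
// level first), followed by the remaining outer levels in order. With
// compact == 0 this reduces to a plain outermost-to-innermost sort.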
209 int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
210  int i;
211  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
212  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
213  int depth = __kmp_topology->get_depth();
214  int compact = __kmp_topology->compact;
215  KMP_DEBUG_ASSERT(compact >= 0);
216  KMP_DEBUG_ASSERT(compact <= depth);
217  for (i = 0; i < compact; i++) {
218  int j = depth - i - 1;
219  if (aa->sub_ids[j] < bb->sub_ids[j])
220  return -1;
221  if (aa->sub_ids[j] > bb->sub_ids[j])
222  return 1;
223  }
224  for (; i < depth; i++) {
225  int j = i - compact;
226  if (aa->sub_ids[j] < bb->sub_ids[j])
227  return -1;
228  if (aa->sub_ids[j] > bb->sub_ids[j])
229  return 1;
230  }
231  return 0;
232 }
233 #endif
234 
235 void kmp_hw_thread_t::print() const {
236  int depth = __kmp_topology->get_depth();
237  printf("%4d ", os_id);
238  for (int i = 0; i < depth; ++i) {
239  printf("%4d ", ids[i]);
240  }
241  if (attrs) {
242  if (attrs.is_core_type_valid())
243  printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type()));
244  if (attrs.is_core_eff_valid())
245  printf(" (eff=%d)", attrs.get_core_eff());
246  }
247  if (leader)
248  printf(" (leader)");
249  printf("\n");
250 }
251 
253 // kmp_topology_t methods
254 
255 // Add a layer to the topology based on the ids. Assume the topology
256 // is perfectly nested (i.e., no object has more than one parent).
257 void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
258  // Figure out where the layer should go by comparing the ids of the current
259  // layers with the new ids
260  int target_layer;
261  int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
262  int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
263 
264  // Start from the highest layer and work down to find target layer
265  // If new layer is equal to another layer then put the new layer above
266  for (target_layer = 0; target_layer < depth; ++target_layer) {
267  bool layers_equal = true;
268  bool strictly_above_target_layer = false;
269  for (int i = 0; i < num_hw_threads; ++i) {
270  int id = hw_threads[i].ids[target_layer];
271  int new_id = ids[i];
272  if (id != previous_id && new_id == previous_new_id) {
273  // Found the layer we are strictly above
274  strictly_above_target_layer = true;
275  layers_equal = false;
276  break;
277  } else if (id == previous_id && new_id != previous_new_id) {
278  // Found a layer we are below. Move to next layer and check.
279  layers_equal = false;
280  break;
281  }
282  previous_id = id;
283  previous_new_id = new_id;
284  }
285  if (strictly_above_target_layer || layers_equal)
286  break;
287  }
288 
289  // Found the layer we are above. Now move everything to accommodate the new
290  // layer, and put the new ids and type into the topology.
291  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
292  types[j] = types[i];
293  types[target_layer] = type;
294  for (int k = 0; k < num_hw_threads; ++k) {
295  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
296  hw_threads[k].ids[j] = hw_threads[k].ids[i];
297  hw_threads[k].ids[target_layer] = ids[k];
298  }
299  equivalent[type] = type;
300  depth++;
301 }
302 
303 #if KMP_GROUP_AFFINITY
304 // Insert the Windows Processor Group structure into the topology
305 void kmp_topology_t::_insert_windows_proc_groups() {
306  // Do not insert the processor group structure for a single group
307  if (__kmp_num_proc_groups == 1)
308  return;
309  kmp_affin_mask_t *mask;
310  int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
311  KMP_CPU_ALLOC(mask);
312  for (int i = 0; i < num_hw_threads; ++i) {
313  KMP_CPU_ZERO(mask);
314  KMP_CPU_SET(hw_threads[i].os_id, mask);
315  ids[i] = __kmp_get_proc_group(mask);
316  }
317  KMP_CPU_FREE(mask);
318  _insert_layer(KMP_HW_PROC_GROUP, ids);
319  __kmp_free(ids);
320 }
321 #endif
322 
323 // Remove layers that don't add information to the topology.
324 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
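// Two adjacent layers form a radix-1 pair when every object in the upper
// layer contains exactly one object of the lower layer (e.g., one L3 per
// socket); the less-preferred of the two types is removed and recorded as an
// equivalent of the type that is kept.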
325 void kmp_topology_t::_remove_radix1_layers() {
326  int preference[KMP_HW_LAST];
327  int top_index1, top_index2;
328  // Set up preference associative array
329  preference[KMP_HW_SOCKET] = 110;
330  preference[KMP_HW_PROC_GROUP] = 100;
331  preference[KMP_HW_CORE] = 95;
332  preference[KMP_HW_THREAD] = 90;
333  preference[KMP_HW_NUMA] = 85;
334  preference[KMP_HW_DIE] = 80;
335  preference[KMP_HW_TILE] = 75;
336  preference[KMP_HW_MODULE] = 73;
337  preference[KMP_HW_L3] = 70;
338  preference[KMP_HW_L2] = 65;
339  preference[KMP_HW_L1] = 60;
340  preference[KMP_HW_LLC] = 5;
341  top_index1 = 0;
342  top_index2 = 1;
343  while (top_index1 < depth - 1 && top_index2 < depth) {
344  kmp_hw_t type1 = types[top_index1];
345  kmp_hw_t type2 = types[top_index2];
346  KMP_ASSERT_VALID_HW_TYPE(type1);
347  KMP_ASSERT_VALID_HW_TYPE(type2);
348  // Do not allow the three main topology levels (sockets, cores, threads) to
349  // be compacted down
350  if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
351  type1 == KMP_HW_SOCKET) &&
352  (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
353  type2 == KMP_HW_SOCKET)) {
354  top_index1 = top_index2++;
355  continue;
356  }
357  bool radix1 = true;
358  bool all_same = true;
359  int id1 = hw_threads[0].ids[top_index1];
360  int id2 = hw_threads[0].ids[top_index2];
361  int pref1 = preference[type1];
362  int pref2 = preference[type2];
363  for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
364  if (hw_threads[hwidx].ids[top_index1] == id1 &&
365  hw_threads[hwidx].ids[top_index2] != id2) {
366  radix1 = false;
367  break;
368  }
369  if (hw_threads[hwidx].ids[top_index2] != id2)
370  all_same = false;
371  id1 = hw_threads[hwidx].ids[top_index1];
372  id2 = hw_threads[hwidx].ids[top_index2];
373  }
374  if (radix1) {
375  // Select the layer to remove based on preference
376  kmp_hw_t remove_type, keep_type;
377  int remove_layer, remove_layer_ids;
378  if (pref1 > pref2) {
379  remove_type = type2;
380  remove_layer = remove_layer_ids = top_index2;
381  keep_type = type1;
382  } else {
383  remove_type = type1;
384  remove_layer = remove_layer_ids = top_index1;
385  keep_type = type2;
386  }
387  // If all the indexes for the second (deeper) layer are the same,
388  // e.g., all are zero, then make sure to keep the first layer's ids
389  if (all_same)
390  remove_layer_ids = top_index2;
391  // Remove radix one type by setting the equivalence, removing the id from
392  // the hw threads and removing the layer from types and depth
393  set_equivalent_type(remove_type, keep_type);
394  for (int idx = 0; idx < num_hw_threads; ++idx) {
395  kmp_hw_thread_t &hw_thread = hw_threads[idx];
396  for (int d = remove_layer_ids; d < depth - 1; ++d)
397  hw_thread.ids[d] = hw_thread.ids[d + 1];
398  }
399  for (int idx = remove_layer; idx < depth - 1; ++idx)
400  types[idx] = types[idx + 1];
401  depth--;
402  } else {
403  top_index1 = top_index2++;
404  }
405  }
406  KMP_ASSERT(depth > 0);
407 }
408 
409 void kmp_topology_t::_set_last_level_cache() {
410  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
411  set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
412  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
413  set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
414 #if KMP_MIC_SUPPORTED
415  else if (__kmp_mic_type == mic3) {
416  if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
417  set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
418  else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
419  set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
420  // L2/Tile wasn't detected so just say L1
421  else
422  set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
423  }
424 #endif
425  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
426  set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
427  // Fallback is to set last level cache to socket or core
428  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
429  if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
430  set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
431  else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
432  set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
433  }
434  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
435 }
436 
437 // Gather the count of each topology layer and the ratio
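// count[l] is the total number of objects at level l; ratio[l] is the maximum
// number of level-l objects under a single parent object. For example, a
// uniform 2-socket machine with 4 cores/socket and 2 threads/core gives
// count = {2, 8, 16} and ratio = {2, 4, 2}; since 2 * 4 * 2 == 16,
// _discover_uniformity() marks such a machine as uniform.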
438 void kmp_topology_t::_gather_enumeration_information() {
439  int previous_id[KMP_HW_LAST];
440  int max[KMP_HW_LAST];
441 
442  for (int i = 0; i < depth; ++i) {
443  previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
444  max[i] = 0;
445  count[i] = 0;
446  ratio[i] = 0;
447  }
448  int core_level = get_level(KMP_HW_CORE);
449  for (int i = 0; i < num_hw_threads; ++i) {
450  kmp_hw_thread_t &hw_thread = hw_threads[i];
451  for (int layer = 0; layer < depth; ++layer) {
452  int id = hw_thread.ids[layer];
453  if (id != previous_id[layer]) {
454  // Add an additional increment to each count
455  for (int l = layer; l < depth; ++l)
456  count[l]++;
457  // Keep track of topology layer ratio statistics
458  max[layer]++;
459  for (int l = layer + 1; l < depth; ++l) {
460  if (max[l] > ratio[l])
461  ratio[l] = max[l];
462  max[l] = 1;
463  }
464  // Figure out the number of different core types
465  // and efficiencies for hybrid CPUs
466  if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
467  if (hw_thread.attrs.is_core_eff_valid() &&
468  hw_thread.attrs.core_eff >= num_core_efficiencies) {
469  // Because efficiencies can range from 0 to max efficiency - 1,
470  // the number of efficiencies is max efficiency + 1
471  num_core_efficiencies = hw_thread.attrs.core_eff + 1;
472  }
473  if (hw_thread.attrs.is_core_type_valid()) {
474  bool found = false;
475  for (int j = 0; j < num_core_types; ++j) {
476  if (hw_thread.attrs.get_core_type() == core_types[j]) {
477  found = true;
478  break;
479  }
480  }
481  if (!found) {
482  KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES);
483  core_types[num_core_types++] = hw_thread.attrs.get_core_type();
484  }
485  }
486  }
487  break;
488  }
489  }
490  for (int layer = 0; layer < depth; ++layer) {
491  previous_id[layer] = hw_thread.ids[layer];
492  }
493  }
494  for (int layer = 0; layer < depth; ++layer) {
495  if (max[layer] > ratio[layer])
496  ratio[layer] = max[layer];
497  }
498 }
499 
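// Count the cores that carry the given attribute (core type or efficiency).
// With find_all, the whole machine is counted; otherwise the result is the
// maximum such count found under any single object at 'above_level'.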
500 int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
501  int above_level,
502  bool find_all) const {
503  int current, current_max;
504  int previous_id[KMP_HW_LAST];
505  for (int i = 0; i < depth; ++i)
506  previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
507  int core_level = get_level(KMP_HW_CORE);
508  if (find_all)
509  above_level = -1;
510  KMP_ASSERT(above_level < core_level);
511  current_max = 0;
512  current = 0;
513  for (int i = 0; i < num_hw_threads; ++i) {
514  kmp_hw_thread_t &hw_thread = hw_threads[i];
515  if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) {
516  if (current > current_max)
517  current_max = current;
518  current = hw_thread.attrs.contains(attr);
519  } else {
520  for (int level = above_level + 1; level <= core_level; ++level) {
521  if (hw_thread.ids[level] != previous_id[level]) {
522  if (hw_thread.attrs.contains(attr))
523  current++;
524  break;
525  }
526  }
527  }
528  for (int level = 0; level < depth; ++level)
529  previous_id[level] = hw_thread.ids[level];
530  }
531  if (current > current_max)
532  current_max = current;
533  return current_max;
534 }
535 
536 // Find out if the topology is uniform
537 void kmp_topology_t::_discover_uniformity() {
538  int num = 1;
539  for (int level = 0; level < depth; ++level)
540  num *= ratio[level];
541  flags.uniform = (num == count[depth - 1]);
542 }
543 
544 // Set all the sub_ids for each hardware thread
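// sub_ids are sibling-relative indices assigned in enumeration order: e.g.,
// on a socket/core/thread topology, the second thread of the third core of
// the first socket receives sub_ids {0, 2, 1}.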
545 void kmp_topology_t::_set_sub_ids() {
546  int previous_id[KMP_HW_LAST];
547  int sub_id[KMP_HW_LAST];
548 
549  for (int i = 0; i < depth; ++i) {
550  previous_id[i] = -1;
551  sub_id[i] = -1;
552  }
553  for (int i = 0; i < num_hw_threads; ++i) {
554  kmp_hw_thread_t &hw_thread = hw_threads[i];
555  // Setup the sub_id
556  for (int j = 0; j < depth; ++j) {
557  if (hw_thread.ids[j] != previous_id[j]) {
558  sub_id[j]++;
559  for (int k = j + 1; k < depth; ++k) {
560  sub_id[k] = 0;
561  }
562  break;
563  }
564  }
565  // Set previous_id
566  for (int j = 0; j < depth; ++j) {
567  previous_id[j] = hw_thread.ids[j];
568  }
569  // Set the sub_ids field
570  for (int j = 0; j < depth; ++j) {
571  hw_thread.sub_ids[j] = sub_id[j];
572  }
573  }
574 }
575 
576 void kmp_topology_t::_set_globals() {
577  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
578  int core_level, thread_level, package_level;
579  package_level = get_level(KMP_HW_SOCKET);
580 #if KMP_GROUP_AFFINITY
581  if (package_level == -1)
582  package_level = get_level(KMP_HW_PROC_GROUP);
583 #endif
584  core_level = get_level(KMP_HW_CORE);
585  thread_level = get_level(KMP_HW_THREAD);
586 
587  KMP_ASSERT(core_level != -1);
588  KMP_ASSERT(thread_level != -1);
589 
590  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
591  if (package_level != -1) {
592  nCoresPerPkg = calculate_ratio(core_level, package_level);
593  nPackages = get_count(package_level);
594  } else {
595  // assume one socket
596  nCoresPerPkg = get_count(core_level);
597  nPackages = 1;
598  }
599 #ifndef KMP_DFLT_NTH_CORES
600  __kmp_ncores = get_count(core_level);
601 #endif
602 }
603 
604 kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
605  const kmp_hw_t *types) {
606  kmp_topology_t *retval;
607  // Allocate all data in one large allocation
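  // Memory layout: [kmp_topology_t | nproc x kmp_hw_thread_t |
  //                 KMP_HW_LAST ints (types) | KMP_HW_LAST ints (ratio) |
  //                 KMP_HW_LAST ints (count)]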
608  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
609  sizeof(int) * (size_t)KMP_HW_LAST * 3;
610  char *bytes = (char *)__kmp_allocate(size);
611  retval = (kmp_topology_t *)bytes;
612  if (nproc > 0) {
613  retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
614  } else {
615  retval->hw_threads = nullptr;
616  }
617  retval->num_hw_threads = nproc;
618  retval->depth = ndepth;
619  int *arr =
620  (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
621  retval->types = (kmp_hw_t *)arr;
622  retval->ratio = arr + (size_t)KMP_HW_LAST;
623  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
624  retval->num_core_efficiencies = 0;
625  retval->num_core_types = 0;
626  retval->compact = 0;
627  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
628  retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
629  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
630  for (int i = 0; i < ndepth; ++i) {
631  retval->types[i] = types[i];
632  retval->equivalent[types[i]] = types[i];
633  }
634  return retval;
635 }
636 
637 void kmp_topology_t::deallocate(kmp_topology_t *topology) {
638  if (topology)
639  __kmp_free(topology);
640 }
641 
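// Returns true if every hardware thread has a unique tuple of ids. Relies on
// hw_threads being sorted (see compare_ids) so that duplicates are adjacent.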
642 bool kmp_topology_t::check_ids() const {
643  // Assume ids have been sorted
644  if (num_hw_threads == 0)
645  return true;
646  for (int i = 1; i < num_hw_threads; ++i) {
647  kmp_hw_thread_t &current_thread = hw_threads[i];
648  kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
649  bool unique = false;
650  for (int j = 0; j < depth; ++j) {
651  if (previous_thread.ids[j] != current_thread.ids[j]) {
652  unique = true;
653  break;
654  }
655  }
656  if (unique)
657  continue;
658  return false;
659  }
660  return true;
661 }
662 
663 void kmp_topology_t::dump() const {
664  printf("***********************\n");
665  printf("*** __kmp_topology: ***\n");
666  printf("***********************\n");
667  printf("* depth: %d\n", depth);
668 
669  printf("* types: ");
670  for (int i = 0; i < depth; ++i)
671  printf("%15s ", __kmp_hw_get_keyword(types[i]));
672  printf("\n");
673 
674  printf("* ratio: ");
675  for (int i = 0; i < depth; ++i) {
676  printf("%15d ", ratio[i]);
677  }
678  printf("\n");
679 
680  printf("* count: ");
681  for (int i = 0; i < depth; ++i) {
682  printf("%15d ", count[i]);
683  }
684  printf("\n");
685 
686  printf("* num_core_eff: %d\n", num_core_efficiencies);
687  printf("* num_core_types: %d\n", num_core_types);
688  printf("* core_types: ");
689  for (int i = 0; i < num_core_types; ++i)
690  printf("%3d ", core_types[i]);
691  printf("\n");
692 
693  printf("* equivalent map:\n");
694  KMP_FOREACH_HW_TYPE(i) {
695  const char *key = __kmp_hw_get_keyword(i);
696  const char *value = __kmp_hw_get_keyword(equivalent[i]);
697  printf("%-15s -> %-15s\n", key, value);
698  }
699 
700  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
701 
702  printf("* num_hw_threads: %d\n", num_hw_threads);
703  printf("* hw_threads:\n");
704  for (int i = 0; i < num_hw_threads; ++i) {
705  hw_threads[i].print();
706  }
707  printf("***********************\n");
708 }
709 
710 void kmp_topology_t::print(const char *env_var) const {
711  kmp_str_buf_t buf;
712  int print_types_depth;
713  __kmp_str_buf_init(&buf);
714  kmp_hw_t print_types[KMP_HW_LAST + 2];
715 
716  // Num Available Threads
717  if (num_hw_threads) {
718  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
719  } else {
720  KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc);
721  }
722 
723  // Uniform or not
724  if (is_uniform()) {
725  KMP_INFORM(Uniform, env_var);
726  } else {
727  KMP_INFORM(NonUniform, env_var);
728  }
729 
730  // Equivalent types
731  KMP_FOREACH_HW_TYPE(type) {
732  kmp_hw_t eq_type = equivalent[type];
733  if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
734  KMP_INFORM(AffEqualTopologyTypes, env_var,
735  __kmp_hw_get_catalog_string(type),
736  __kmp_hw_get_catalog_string(eq_type));
737  }
738  }
739 
740  // Quick topology
741  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
742  // Create a print types array that always guarantees printing
743  // the core and thread level
744  print_types_depth = 0;
745  for (int level = 0; level < depth; ++level)
746  print_types[print_types_depth++] = types[level];
747  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
748  // Force in the core level for quick topology
749  if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
750  // Force core before thread, e.g., 1 socket X 2 threads/socket
751  // becomes 1 socket X 1 core/socket X 2 threads/socket
752  print_types[print_types_depth - 1] = KMP_HW_CORE;
753  print_types[print_types_depth++] = KMP_HW_THREAD;
754  } else {
755  print_types[print_types_depth++] = KMP_HW_CORE;
756  }
757  }
758  // Always put threads at very end of quick topology
759  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
760  print_types[print_types_depth++] = KMP_HW_THREAD;
761 
762  __kmp_str_buf_clear(&buf);
763  kmp_hw_t numerator_type;
764  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
765  int core_level = get_level(KMP_HW_CORE);
766  int ncores = get_count(core_level);
767 
768  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
769  int c;
770  bool plural;
771  numerator_type = print_types[plevel];
772  KMP_ASSERT_VALID_HW_TYPE(numerator_type);
773  if (equivalent[numerator_type] != numerator_type)
774  c = 1;
775  else
776  c = get_ratio(level++);
777  plural = (c > 1);
778  if (plevel == 0) {
779  __kmp_str_buf_print(&buf, "%d %s", c,
780  __kmp_hw_get_catalog_string(numerator_type, plural));
781  } else {
782  __kmp_str_buf_print(&buf, " x %d %s/%s", c,
783  __kmp_hw_get_catalog_string(numerator_type, plural),
784  __kmp_hw_get_catalog_string(denominator_type));
785  }
786  denominator_type = numerator_type;
787  }
788  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
789 
790  // Hybrid topology information
791  if (__kmp_is_hybrid_cpu()) {
792  for (int i = 0; i < num_core_types; ++i) {
793  kmp_hw_core_type_t core_type = core_types[i];
794  kmp_hw_attr_t attr;
795  attr.clear();
796  attr.set_core_type(core_type);
797  int ncores = get_ncores_with_attr(attr);
798  if (ncores > 0) {
799  KMP_INFORM(TopologyHybrid, env_var, ncores,
800  __kmp_hw_get_core_type_string(core_type));
801  KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS)
802  for (int eff = 0; eff < num_core_efficiencies; ++eff) {
803  attr.set_core_eff(eff);
804  int ncores_with_eff = get_ncores_with_attr(attr);
805  if (ncores_with_eff > 0) {
806  KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff);
807  }
808  }
809  }
810  }
811  }
812 
813  if (num_hw_threads <= 0) {
814  __kmp_str_buf_free(&buf);
815  return;
816  }
817 
818  // Full OS proc to hardware thread map
819  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
820  for (int i = 0; i < num_hw_threads; i++) {
821  __kmp_str_buf_clear(&buf);
822  for (int level = 0; level < depth; ++level) {
823  kmp_hw_t type = types[level];
824  __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
825  __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
826  }
827  if (__kmp_is_hybrid_cpu())
828  __kmp_str_buf_print(
829  &buf, "(%s)",
830  __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type()));
831  KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
832  }
833 
834  __kmp_str_buf_free(&buf);
835 }
836 
837 #if KMP_AFFINITY_SUPPORTED
838 void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
839  const char *env_var = __kmp_get_affinity_env_var(affinity);
840  // If the user requested hybrid CPU attributes for granularity (via either
841  // OMP_PLACES or KMP_AFFINITY) but none exist, then reset the granularity and
842  // have the method below select a granularity and warn the user.
843  if (!__kmp_is_hybrid_cpu()) {
844  if (affinity.core_attr_gran.valid) {
845  // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
846  // instead
847  KMP_AFF_WARNING(
848  affinity, AffIgnoringNonHybrid, env_var,
849  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
850  affinity.gran = KMP_HW_CORE;
851  affinity.gran_levels = -1;
852  affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
853  affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
854  } else if (affinity.flags.core_types_gran ||
855  affinity.flags.core_effs_gran) {
856  // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
857  if (affinity.flags.omp_places) {
858  KMP_AFF_WARNING(
859  affinity, AffIgnoringNonHybrid, env_var,
860  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
861  } else {
862  // KMP_AFFINITY=granularity=core_type|core_eff,...
863  KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
864  "Intel(R) Hybrid Technology core attribute",
865  __kmp_hw_get_catalog_string(KMP_HW_CORE));
866  }
867  affinity.gran = KMP_HW_CORE;
868  affinity.gran_levels = -1;
869  affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
870  affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
871  }
872  }
873  // Set the number of affinity granularity levels
874  if (affinity.gran_levels < 0) {
875  kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
876  // Check if user's granularity request is valid
877  if (gran_type == KMP_HW_UNKNOWN) {
878  // First try core, then thread, then package
879  kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
880  for (auto g : gran_types) {
881  if (get_equivalent_type(g) != KMP_HW_UNKNOWN) {
882  gran_type = g;
883  break;
884  }
885  }
886  KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
887  // Warn user what granularity setting will be used instead
888  KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
889  __kmp_hw_get_catalog_string(affinity.gran),
890  __kmp_hw_get_catalog_string(gran_type));
891  affinity.gran = gran_type;
892  }
893 #if KMP_GROUP_AFFINITY
894  // If more than one processor group exists, and the level of
895  // granularity specified by the user is too coarse, then the
896  // granularity must be adjusted "down" to processor group affinity
897  // because threads can only exist within one processor group.
898  // For example, if a user sets granularity=socket and there are two
899  // processor groups that cover a socket, then the runtime must
900  // restrict the granularity down to the processor group level.
901  if (__kmp_num_proc_groups > 1) {
902  int gran_depth = get_level(gran_type);
903  int proc_group_depth = get_level(KMP_HW_PROC_GROUP);
904  if (gran_depth >= 0 && proc_group_depth >= 0 &&
905  gran_depth < proc_group_depth) {
906  KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var,
907  __kmp_hw_get_catalog_string(affinity.gran));
908  affinity.gran = gran_type = KMP_HW_PROC_GROUP;
909  }
910  }
911 #endif
912  affinity.gran_levels = 0;
913  for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
914  affinity.gran_levels++;
915  }
916 }
917 #endif
918 
919 void kmp_topology_t::canonicalize() {
920 #if KMP_GROUP_AFFINITY
921  _insert_windows_proc_groups();
922 #endif
923  _remove_radix1_layers();
924  _gather_enumeration_information();
925  _discover_uniformity();
926  _set_sub_ids();
927  _set_globals();
928  _set_last_level_cache();
929 
930 #if KMP_MIC_SUPPORTED
931  // Manually Add L2 = Tile equivalence
932  if (__kmp_mic_type == mic3) {
933  if (get_level(KMP_HW_L2) != -1)
934  set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
935  else if (get_level(KMP_HW_TILE) != -1)
936  set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
937  }
938 #endif
939 
940  // Perform post canonicalization checking
941  KMP_ASSERT(depth > 0);
942  for (int level = 0; level < depth; ++level) {
943  // All counts, ratios, and types must be valid
944  KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
945  KMP_ASSERT_VALID_HW_TYPE(types[level]);
946  // Detected types must point to themselves
947  KMP_ASSERT(equivalent[types[level]] == types[level]);
948  }
949 }
950 
951 // Canonicalize an explicit packages X cores/pkg X threads/core topology
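// For example, canonicalize(2, 4, 2, 8) describes 2 packages x 4 cores/pkg x
// 2 threads/core with 8 cores in total; the thread count is taken from
// __kmp_xproc.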
952 void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
953  int nthreads_per_core, int ncores) {
954  int ndepth = 3;
955  depth = ndepth;
956  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
957  for (int level = 0; level < depth; ++level) {
958  count[level] = 0;
959  ratio[level] = 0;
960  }
961  count[0] = npackages;
962  count[1] = ncores;
963  count[2] = __kmp_xproc;
964  ratio[0] = npackages;
965  ratio[1] = ncores_per_pkg;
966  ratio[2] = nthreads_per_core;
967  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
968  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
969  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
970  types[0] = KMP_HW_SOCKET;
971  types[1] = KMP_HW_CORE;
972  types[2] = KMP_HW_THREAD;
973  //__kmp_avail_proc = __kmp_xproc;
974  _discover_uniformity();
975 }
976 
977 // Represents running sub IDs for a single core attribute where
978 // attribute values have SIZE possibilities.
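// update() must be called once per hardware thread, in topology order, before
// get_sub_id() is queried for that thread (see filter_hw_subset below).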
979 template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
980  int last_level; // last level in topology to consider for sub_ids
981  int sub_id[SIZE]; // The sub ID for a given attribute value
982  int prev_sub_id[KMP_HW_LAST];
983  IndexFunc indexer;
984 
985 public:
986  kmp_sub_ids_t(int last_level) : last_level(last_level) {
987  KMP_ASSERT(last_level < KMP_HW_LAST);
988  for (size_t i = 0; i < SIZE; ++i)
989  sub_id[i] = -1;
990  for (size_t i = 0; i < KMP_HW_LAST; ++i)
991  prev_sub_id[i] = -1;
992  }
993  void update(const kmp_hw_thread_t &hw_thread) {
994  int idx = indexer(hw_thread);
995  KMP_ASSERT(idx < (int)SIZE);
996  for (int level = 0; level <= last_level; ++level) {
997  if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
998  if (level < last_level)
999  sub_id[idx] = -1;
1000  sub_id[idx]++;
1001  break;
1002  }
1003  }
1004  for (int level = 0; level <= last_level; ++level)
1005  prev_sub_id[level] = hw_thread.sub_ids[level];
1006  }
1007  int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
1008  return sub_id[indexer(hw_thread)];
1009  }
1010 };
1011 
1012 #if KMP_AFFINITY_SUPPORTED
1013 static kmp_str_buf_t *
1014 __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
1015  bool plural) {
1016  __kmp_str_buf_init(buf);
1017  if (attr.is_core_type_valid())
1018  __kmp_str_buf_print(buf, "%s %s",
1019  __kmp_hw_get_core_type_string(attr.get_core_type()),
1020  __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
1021  else
1022  __kmp_str_buf_print(buf, "%s eff=%d",
1023  __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
1024  attr.get_core_eff());
1025  return buf;
1026 }
1027 
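// Remove every hardware thread whose OS proc is not present in 'mask',
// updating __kmp_affin_fullMask and __kmp_avail_proc, then re-derive the
// summary information. Returns true if any thread was removed.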
1028 bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
1029  // Apply the filter
1030  bool affected;
1031  int new_index = 0;
1032  for (int i = 0; i < num_hw_threads; ++i) {
1033  int os_id = hw_threads[i].os_id;
1034  if (KMP_CPU_ISSET(os_id, mask)) {
1035  if (i != new_index)
1036  hw_threads[new_index] = hw_threads[i];
1037  new_index++;
1038  } else {
1039  KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
1040  __kmp_avail_proc--;
1041  }
1042  }
1043 
1044  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
1045  affected = (num_hw_threads != new_index);
1046  num_hw_threads = new_index;
1047 
1048  // Post hardware subset canonicalization
1049  if (affected) {
1050  _gather_enumeration_information();
1051  _discover_uniformity();
1052  _set_globals();
1053  _set_last_level_cache();
1054 #if KMP_OS_WINDOWS
1055  // Copy filtered full mask if topology has single processor group
1056  if (__kmp_num_proc_groups <= 1)
1057 #endif
1058  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
1059  }
1060  return affected;
1061 }
1062 
1063 // Apply the KMP_HW_SUBSET environment variable to the topology.
1064 // Returns true if KMP_HW_SUBSET filtered any processors;
1065 // otherwise, returns false.
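// For example, KMP_HW_SUBSET=2s,4c,2t keeps 2 sockets, 4 cores per socket and
// 2 threads per core; on hybrid CPUs a core item may also carry an attribute,
// e.g., 3c:intel_core or 3c:eff1 (see the checks below).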
1066 bool kmp_topology_t::filter_hw_subset() {
1067  // If KMP_HW_SUBSET wasn't requested, then do nothing.
1068  if (!__kmp_hw_subset)
1069  return false;
1070 
1071  // First, sort the KMP_HW_SUBSET items by the machine topology
1072  __kmp_hw_subset->sort();
1073 
1074  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
1075  bool using_core_types = false;
1076  bool using_core_effs = false;
1077  int hw_subset_depth = __kmp_hw_subset->get_depth();
1078  kmp_hw_t specified[KMP_HW_LAST];
1079  int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
1080  KMP_ASSERT(hw_subset_depth > 0);
1081  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
1082  int core_level = get_level(KMP_HW_CORE);
1083  for (int i = 0; i < hw_subset_depth; ++i) {
1084  int max_count;
1085  const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
1086  int num = item.num[0];
1087  int offset = item.offset[0];
1088  kmp_hw_t type = item.type;
1089  kmp_hw_t equivalent_type = equivalent[type];
1090  int level = get_level(type);
1091  topology_levels[i] = level;
1092 
1093  // Check to see if current layer is in detected machine topology
1094  if (equivalent_type != KMP_HW_UNKNOWN) {
1095  __kmp_hw_subset->at(i).type = equivalent_type;
1096  } else {
1097  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
1098  __kmp_hw_get_catalog_string(type));
1099  return false;
1100  }
1101 
1102  // Check to see if current layer has already been
1103  // specified either directly or through an equivalent type
1104  if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
1105  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
1106  __kmp_hw_get_catalog_string(type),
1107  __kmp_hw_get_catalog_string(specified[equivalent_type]));
1108  return false;
1109  }
1110  specified[equivalent_type] = type;
1111 
1112  // Check to see if each layer's num & offset parameters are valid
1113  max_count = get_ratio(level);
1114  if (max_count < 0 ||
1115  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1116  bool plural = (num > 1);
1117  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
1118  __kmp_hw_get_catalog_string(type, plural));
1119  return false;
1120  }
1121 
1122  // Check to see if core attributes are consistent
1123  if (core_level == level) {
1124  // Determine which core attributes are specified
1125  for (int j = 0; j < item.num_attrs; ++j) {
1126  if (item.attr[j].is_core_type_valid())
1127  using_core_types = true;
1128  if (item.attr[j].is_core_eff_valid())
1129  using_core_effs = true;
1130  }
1131 
1132  // Check if using a single core attribute on non-hybrid arch.
1133  // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
1134  //
1135  // Check if using multiple core attributes on non-hybrid arch.
1136  // Ignore all of KMP_HW_SUBSET if this is the case.
1137  if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
1138  if (item.num_attrs == 1) {
1139  if (using_core_effs) {
1140  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1141  "efficiency");
1142  } else {
1143  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1144  "core_type");
1145  }
1146  using_core_effs = false;
1147  using_core_types = false;
1148  } else {
1149  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid);
1150  return false;
1151  }
1152  }
1153 
1154  // Check if using both core types and core efficiencies together
1155  if (using_core_types && using_core_effs) {
1156  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type",
1157  "efficiency");
1158  return false;
1159  }
1160 
1161  // Check that core efficiency values are valid
1162  if (using_core_effs) {
1163  for (int j = 0; j < item.num_attrs; ++j) {
1164  if (item.attr[j].is_core_eff_valid()) {
1165  int core_eff = item.attr[j].get_core_eff();
1166  if (core_eff < 0 || core_eff >= num_core_efficiencies) {
1167  kmp_str_buf_t buf;
1168  __kmp_str_buf_init(&buf);
1169  __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff());
1170  __kmp_msg(kmp_ms_warning,
1171  KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str),
1172  KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
1173  __kmp_msg_null);
1174  __kmp_str_buf_free(&buf);
1175  return false;
1176  }
1177  }
1178  }
1179  }
1180 
1181  // Check that the number of requested cores with attributes is valid
1182  if (using_core_types || using_core_effs) {
1183  for (int j = 0; j < item.num_attrs; ++j) {
1184  int num = item.num[j];
1185  int offset = item.offset[j];
1186  int level_above = core_level - 1;
1187  if (level_above >= 0) {
1188  max_count = get_ncores_with_attr_per(item.attr[j], level_above);
1189  if (max_count <= 0 ||
1190  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1191  kmp_str_buf_t buf;
1192  __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
1193  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str);
1194  __kmp_str_buf_free(&buf);
1195  return false;
1196  }
1197  }
1198  }
1199  }
1200 
1201  if ((using_core_types || using_core_effs) && item.num_attrs > 1) {
1202  for (int j = 0; j < item.num_attrs; ++j) {
1203  // Ambiguous use of specific core attribute + generic core
1204  // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
1205  if (!item.attr[j]) {
1206  kmp_hw_attr_t other_attr;
1207  for (int k = 0; k < item.num_attrs; ++k) {
1208  if (item.attr[k] != item.attr[j]) {
1209  other_attr = item.attr[k];
1210  break;
1211  }
1212  }
1213  kmp_str_buf_t buf;
1214  __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
1215  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat,
1216  __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
1217  __kmp_str_buf_free(&buf);
1218  return false;
1219  }
1220  // Allow specifying a specific core type or core eff exactly once
1221  for (int k = 0; k < j; ++k) {
1222  if (!item.attr[j] || !item.attr[k])
1223  continue;
1224  if (item.attr[k] == item.attr[j]) {
1225  kmp_str_buf_t buf;
1226  __kmp_hw_get_catalog_core_string(item.attr[j], &buf,
1227  item.num[j] > 0);
1228  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str);
1229  __kmp_str_buf_free(&buf);
1230  return false;
1231  }
1232  }
1233  }
1234  }
1235  }
1236  }
1237 
1238  struct core_type_indexer {
1239  int operator()(const kmp_hw_thread_t &t) const {
1240  switch (t.attrs.get_core_type()) {
1241 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1242  case KMP_HW_CORE_TYPE_ATOM:
1243  return 1;
1244  case KMP_HW_CORE_TYPE_CORE:
1245  return 2;
1246 #endif
1247  case KMP_HW_CORE_TYPE_UNKNOWN:
1248  return 0;
1249  }
1250  KMP_ASSERT(0);
1251  return 0;
1252  }
1253  };
1254  struct core_eff_indexer {
1255  int operator()(const kmp_hw_thread_t &t) const {
1256  return t.attrs.get_core_eff();
1257  }
1258  };
1259 
1260  kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
1261  core_level);
1262  kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
1263  core_level);
1264 
1265  // Determine which hardware threads should be filtered.
1266  int num_filtered = 0;
1267  kmp_affin_mask_t *filtered_mask;
1268  KMP_CPU_ALLOC(filtered_mask);
1269  KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
1270  for (int i = 0; i < num_hw_threads; ++i) {
1271  kmp_hw_thread_t &hw_thread = hw_threads[i];
1272  // Update type_sub_id
1273  if (using_core_types)
1274  core_type_sub_ids.update(hw_thread);
1275  if (using_core_effs)
1276  core_eff_sub_ids.update(hw_thread);
1277 
1278  // Check to see if this hardware thread should be filtered
1279  bool should_be_filtered = false;
1280  for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth;
1281  ++hw_subset_index) {
1282  const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
1283  int level = topology_levels[hw_subset_index];
1284  if (level == -1)
1285  continue;
1286  if ((using_core_effs || using_core_types) && level == core_level) {
1287  // Look for the core attribute in KMP_HW_SUBSET which corresponds
1288  // to this hardware thread's core attribute. Use this num,offset plus
1289  // the running sub_id for the particular core attribute of this hardware
1290  // thread to determine if the hardware thread should be filtered or not.
1291  int attr_idx;
1292  kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type();
1293  int core_eff = hw_thread.attrs.get_core_eff();
1294  for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) {
1295  if (using_core_types &&
1296  hw_subset_item.attr[attr_idx].get_core_type() == core_type)
1297  break;
1298  if (using_core_effs &&
1299  hw_subset_item.attr[attr_idx].get_core_eff() == core_eff)
1300  break;
1301  }
1302  // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
1303  if (attr_idx == hw_subset_item.num_attrs) {
1304  should_be_filtered = true;
1305  break;
1306  }
1307  int sub_id;
1308  int num = hw_subset_item.num[attr_idx];
1309  int offset = hw_subset_item.offset[attr_idx];
1310  if (using_core_types)
1311  sub_id = core_type_sub_ids.get_sub_id(hw_thread);
1312  else
1313  sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
1314  if (sub_id < offset ||
1315  (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1316  should_be_filtered = true;
1317  break;
1318  }
1319  } else {
1320  int num = hw_subset_item.num[0];
1321  int offset = hw_subset_item.offset[0];
1322  if (hw_thread.sub_ids[level] < offset ||
1323  (num != kmp_hw_subset_t::USE_ALL &&
1324  hw_thread.sub_ids[level] >= offset + num)) {
1325  should_be_filtered = true;
1326  break;
1327  }
1328  }
1329  }
1330  // Collect filtering information
1331  if (should_be_filtered) {
1332  KMP_CPU_CLR(hw_thread.os_id, filtered_mask);
1333  num_filtered++;
1334  }
1335  }
1336 
1337  // One last check: make sure we aren't filtering out the entire machine
1338  if (num_filtered == num_hw_threads) {
1339  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
1340  return false;
1341  }
1342 
1343  // Apply the filter
1344  restrict_to_mask(filtered_mask);
1345  return true;
1346 }
1347 
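// Two hardware threads are 'close' when their ids match on every level above
// the granularity (the innermost gran_levels levels are ignored); with
// core_types_gran or core_effs_gran set, closeness is instead decided by the
// corresponding core attribute.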
1348 bool kmp_topology_t::is_close(int hwt1, int hwt2,
1349  const kmp_affinity_t &stgs) const {
1350  int hw_level = stgs.gran_levels;
1351  if (hw_level >= depth)
1352  return true;
1353  bool retval = true;
1354  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
1355  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
1356  if (stgs.flags.core_types_gran)
1357  return t1.attrs.get_core_type() == t2.attrs.get_core_type();
1358  if (stgs.flags.core_effs_gran)
1359  return t1.attrs.get_core_eff() == t2.attrs.get_core_eff();
1360  for (int i = 0; i < (depth - hw_level); ++i) {
1361  if (t1.ids[i] != t2.ids[i])
1362  return false;
1363  }
1364  return retval;
1365 }
1366 
1368 
1369 bool KMPAffinity::picked_api = false;
1370 
1371 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
1372 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
1373 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
1374 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
1375 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
1376 void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
1377 
1378 void KMPAffinity::pick_api() {
1379  KMPAffinity *affinity_dispatch;
1380  if (picked_api)
1381  return;
1382 #if KMP_USE_HWLOC
1383  // Only use Hwloc if affinity isn't explicitly disabled and
1384  // user requests Hwloc topology method
1385  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
1386  __kmp_affinity.type != affinity_disabled) {
1387  affinity_dispatch = new KMPHwlocAffinity();
1388  } else
1389 #endif
1390  {
1391  affinity_dispatch = new KMPNativeAffinity();
1392  }
1393  __kmp_affinity_dispatch = affinity_dispatch;
1394  picked_api = true;
1395 }
1396 
1397 void KMPAffinity::destroy_api() {
1398  if (__kmp_affinity_dispatch != NULL) {
1399  delete __kmp_affinity_dispatch;
1400  __kmp_affinity_dispatch = NULL;
1401  picked_api = false;
1402  }
1403 }
1404 
1405 #define KMP_ADVANCE_SCAN(scan) \
1406  while (*scan != '\0') { \
1407  scan++; \
1408  }
1409 
1410 // Print the affinity mask to the character array in a pretty format.
1411 // The format is a comma separated list of non-negative integers or integer
1412 // ranges: e.g., 1,2,3-5,7,9-15
1413 // The format can also be the string "{<empty>}" if no bits are set in mask
1414 char *__kmp_affinity_print_mask(char *buf, int buf_len,
1415  kmp_affin_mask_t *mask) {
1416  int start = 0, finish = 0, previous = 0;
1417  bool first_range;
1418  KMP_ASSERT(buf);
1419  KMP_ASSERT(buf_len >= 40);
1420  KMP_ASSERT(mask);
1421  char *scan = buf;
1422  char *end = buf + buf_len - 1;
1423 
1424  // Check for empty set.
1425  if (mask->begin() == mask->end()) {
1426  KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
1427  KMP_ADVANCE_SCAN(scan);
1428  KMP_ASSERT(scan <= end);
1429  return buf;
1430  }
1431 
1432  first_range = true;
1433  start = mask->begin();
1434  while (1) {
1435  // Find next range
1436  // [start, previous] is inclusive range of contiguous bits in mask
1437  for (finish = mask->next(start), previous = start;
1438  finish == previous + 1 && finish != mask->end();
1439  finish = mask->next(finish)) {
1440  previous = finish;
1441  }
1442 
1443  // The first range does not need a comma printed before it, but the rest
1444  // of the ranges do need a comma beforehand
1445  if (!first_range) {
1446  KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
1447  KMP_ADVANCE_SCAN(scan);
1448  } else {
1449  first_range = false;
1450  }
1451  // Range with three or more contiguous bits in the affinity mask
1452  if (previous - start > 1) {
1453  KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
1454  } else {
1455  // Range with one or two contiguous bits in the affinity mask
1456  KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
1457  KMP_ADVANCE_SCAN(scan);
1458  if (previous - start > 0) {
1459  KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
1460  }
1461  }
1462  KMP_ADVANCE_SCAN(scan);
1463  // Start over with new start point
1464  start = finish;
1465  if (start == mask->end())
1466  break;
1467  // Check for overflow
1468  if (end - scan < 2)
1469  break;
1470  }
1471 
1472  // Check for overflow
1473  KMP_ASSERT(scan <= end);
1474  return buf;
1475 }
1476 #undef KMP_ADVANCE_SCAN
1477 
1478 // Print the affinity mask to the string buffer object in a pretty format
1479 // The format is a comma separated list of non-negative integers or integer
1480 // ranges: e.g., 1,2,3-5,7,9-15
1481 // The format can also be the string "{<empty>}" if no bits are set in mask
1482 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
1483  kmp_affin_mask_t *mask) {
1484  int start = 0, finish = 0, previous = 0;
1485  bool first_range;
1486  KMP_ASSERT(buf);
1487  KMP_ASSERT(mask);
1488 
1489  __kmp_str_buf_clear(buf);
1490 
1491  // Check for empty set.
1492  if (mask->begin() == mask->end()) {
1493  __kmp_str_buf_print(buf, "%s", "{<empty>}");
1494  return buf;
1495  }
1496 
1497  first_range = true;
1498  start = mask->begin();
1499  while (1) {
1500  // Find next range
1501  // [start, previous] is inclusive range of contiguous bits in mask
1502  for (finish = mask->next(start), previous = start;
1503  finish == previous + 1 && finish != mask->end();
1504  finish = mask->next(finish)) {
1505  previous = finish;
1506  }
1507 
1508  // The first range does not need a comma printed before it, but the rest
1509  // of the ranges do need a comma beforehand
1510  if (!first_range) {
1511  __kmp_str_buf_print(buf, "%s", ",");
1512  } else {
1513  first_range = false;
1514  }
1515  // Range with three or more contiguous bits in the affinity mask
1516  if (previous - start > 1) {
1517  __kmp_str_buf_print(buf, "%u-%u", start, previous);
1518  } else {
1519  // Range with one or two contiguous bits in the affinity mask
1520  __kmp_str_buf_print(buf, "%u", start);
1521  if (previous - start > 0) {
1522  __kmp_str_buf_print(buf, ",%u", previous);
1523  }
1524  }
1525  // Start over with new start point
1526  start = finish;
1527  if (start == mask->end())
1528  break;
1529  }
1530  return buf;
1531 }
1532 
1533 // Return (possibly empty) affinity mask representing the offline CPUs
1534 // Caller must free the mask
1535 kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
1536  kmp_affin_mask_t *offline;
1537  KMP_CPU_ALLOC(offline);
1538  KMP_CPU_ZERO(offline);
1539 #if KMP_OS_LINUX
1540  int n, begin_cpu, end_cpu;
1541  kmp_safe_raii_file_t offline_file;
1542  auto skip_ws = [](FILE *f) {
1543  int c;
1544  do {
1545  c = fgetc(f);
1546  } while (isspace(c));
1547  if (c != EOF)
1548  ungetc(c, f);
1549  };
1550  // File contains CSV of integer ranges representing the offline CPUs
1551  // e.g., 1,2,4-7,9,11-15
1552  int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
1553  if (status != 0)
1554  return offline;
1555  while (!feof(offline_file)) {
1556  skip_ws(offline_file);
1557  n = fscanf(offline_file, "%d", &begin_cpu);
1558  if (n != 1)
1559  break;
1560  skip_ws(offline_file);
1561  int c = fgetc(offline_file);
1562  if (c == EOF || c == ',') {
1563  // Just single CPU
1564  end_cpu = begin_cpu;
1565  } else if (c == '-') {
1566  // Range of CPUs
1567  skip_ws(offline_file);
1568  n = fscanf(offline_file, "%d", &end_cpu);
1569  if (n != 1)
1570  break;
1571  skip_ws(offline_file);
1572  c = fgetc(offline_file); // skip ','
1573  } else {
1574  // Syntax problem
1575  break;
1576  }
1577  // Ensure a valid range of CPUs
1578  if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
1579  end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
1580  continue;
1581  }
1582  // Insert [begin_cpu, end_cpu] into offline mask
1583  for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
1584  KMP_CPU_SET(cpu, offline);
1585  }
1586  }
1587 #endif
1588  return offline;
1589 }
1590 
1591 // Return the number of available procs
1592 int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
1593  int avail_proc = 0;
1594  KMP_CPU_ZERO(mask);
1595 
1596 #if KMP_GROUP_AFFINITY
1597 
1598  if (__kmp_num_proc_groups > 1) {
1599  int group;
1600  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
1601  for (group = 0; group < __kmp_num_proc_groups; group++) {
1602  int i;
1603  int num = __kmp_GetActiveProcessorCount(group);
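      // Processors of group 'group' occupy a contiguous block of
      // CHAR_BIT * sizeof(DWORD_PTR) bits (64 on 64-bit Windows) in the mask.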
1604  for (i = 0; i < num; i++) {
1605  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
1606  avail_proc++;
1607  }
1608  }
1609  } else
1610 
1611 #endif /* KMP_GROUP_AFFINITY */
1612 
1613  {
1614  int proc;
1615  kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
1616  for (proc = 0; proc < __kmp_xproc; proc++) {
1617  // Skip offline CPUs
1618  if (KMP_CPU_ISSET(proc, offline_cpus))
1619  continue;
1620  KMP_CPU_SET(proc, mask);
1621  avail_proc++;
1622  }
1623  KMP_CPU_FREE(offline_cpus);
1624  }
1625 
1626  return avail_proc;
1627 }
1628 
1629 // All of the __kmp_affinity_create_*_map() routines should allocate the
1630 // internal topology object and set the layer ids for it. Each routine
1631 // returns a boolean on whether it was successful at doing so.
1632 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1633 // Original mask is a subset of full mask in multiple processor groups topology
1634 kmp_affin_mask_t *__kmp_affin_origMask = NULL;
1635 
1636 #if KMP_USE_HWLOC
1637 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
1638 #if HWLOC_API_VERSION >= 0x00020000
1639  return hwloc_obj_type_is_cache(obj->type);
1640 #else
1641  return obj->type == HWLOC_OBJ_CACHE;
1642 #endif
1643 }
1644 
1645 // Returns KMP_HW_* type derived from HWLOC_* type
1646 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1647 
1648  if (__kmp_hwloc_is_cache_type(obj)) {
1649  if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1650  return KMP_HW_UNKNOWN;
1651  switch (obj->attr->cache.depth) {
1652  case 1:
1653  return KMP_HW_L1;
1654  case 2:
1655 #if KMP_MIC_SUPPORTED
1656  if (__kmp_mic_type == mic3) {
1657  return KMP_HW_TILE;
1658  }
1659 #endif
1660  return KMP_HW_L2;
1661  case 3:
1662  return KMP_HW_L3;
1663  }
1664  return KMP_HW_UNKNOWN;
1665  }
1666 
1667  switch (obj->type) {
1668  case HWLOC_OBJ_PACKAGE:
1669  return KMP_HW_SOCKET;
1670  case HWLOC_OBJ_NUMANODE:
1671  return KMP_HW_NUMA;
1672  case HWLOC_OBJ_CORE:
1673  return KMP_HW_CORE;
1674  case HWLOC_OBJ_PU:
1675  return KMP_HW_THREAD;
1676  case HWLOC_OBJ_GROUP:
1677 #if HWLOC_API_VERSION >= 0x00020000
1678  if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1679  return KMP_HW_DIE;
1680  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1681  return KMP_HW_TILE;
1682  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1683  return KMP_HW_MODULE;
1684  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1685  return KMP_HW_PROC_GROUP;
1686 #endif
1687  return KMP_HW_UNKNOWN;
1688 #if HWLOC_API_VERSION >= 0x00020100
1689  case HWLOC_OBJ_DIE:
1690  return KMP_HW_DIE;
1691 #endif
1692  }
1693  return KMP_HW_UNKNOWN;
1694 }
1695 
1696 // Returns the number of objects of type 'type' below 'obj' within the topology
1697 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1698 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
1699 // object.
1700 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1701  hwloc_obj_type_t type) {
1702  int retval = 0;
1703  hwloc_obj_t first;
1704  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1705  obj->logical_index, type, 0);
1706  first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1707  obj->type, first) == obj;
1708  first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1709  first)) {
1710  ++retval;
1711  }
1712  return retval;
1713 }
1714 
1715 // This gets the sub_id for a lower object under a higher object in the
1716 // topology tree
1717 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1718  hwloc_obj_t lower) {
1719  hwloc_obj_t obj;
1720  hwloc_obj_type_t ltype = lower->type;
1721  int lindex = lower->logical_index - 1;
1722  int sub_id = 0;
1723  // Get the previous lower object
1724  obj = hwloc_get_obj_by_type(t, ltype, lindex);
1725  while (obj && lindex >= 0 &&
1726  hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1727  if (obj->userdata) {
1728  sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1729  break;
1730  }
1731  sub_id++;
1732  lindex--;
1733  obj = hwloc_get_obj_by_type(t, ltype, lindex);
1734  }
 1735  // Store sub_id + 1 so that 0 is distinguished from NULL
1736  lower->userdata = RCAST(void *, sub_id + 1);
1737  return sub_id;
1738 }
1739 
1740 static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
1741  kmp_hw_t type;
1742  int hw_thread_index, sub_id;
1743  int depth;
1744  hwloc_obj_t pu, obj, root, prev;
1745  kmp_hw_t types[KMP_HW_LAST];
1746  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
1747 
1748  hwloc_topology_t tp = __kmp_hwloc_topology;
1749  *msg_id = kmp_i18n_null;
1750  if (__kmp_affinity.flags.verbose) {
1751  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
1752  }
1753 
1754  if (!KMP_AFFINITY_CAPABLE()) {
1755  // Hack to try and infer the machine topology using only the data
1756  // available from hwloc on the current thread, and __kmp_xproc.
1757  KMP_ASSERT(__kmp_affinity.type == affinity_none);
 1758  // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
1759  hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
1760  if (o != NULL)
1761  nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
1762  else
1763  nCoresPerPkg = 1; // no PACKAGE found
1764  o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
1765  if (o != NULL)
1766  __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
1767  else
1768  __kmp_nThreadsPerCore = 1; // no CORE found
1769  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1770  if (nCoresPerPkg == 0)
1771  nCoresPerPkg = 1; // to prevent possible division by 0
1772  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1773  return true;
1774  }
1775 
1776 #if HWLOC_API_VERSION >= 0x00020400
1777  // Handle multiple types of cores if they exist on the system
1778  int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
1779 
1780  typedef struct kmp_hwloc_cpukinds_info_t {
1781  int efficiency;
1782  kmp_hw_core_type_t core_type;
1783  hwloc_bitmap_t mask;
1784  } kmp_hwloc_cpukinds_info_t;
1785  kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;
1786 
1787  if (nr_cpu_kinds > 0) {
1788  unsigned nr_infos;
1789  struct hwloc_info_s *infos;
1790  cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
1791  sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
1792  for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
1793  cpukinds[idx].efficiency = -1;
1794  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
1795  cpukinds[idx].mask = hwloc_bitmap_alloc();
1796  if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
1797  &cpukinds[idx].efficiency, &nr_infos, &infos,
1798  0) == 0) {
1799  for (unsigned i = 0; i < nr_infos; ++i) {
1800  if (__kmp_str_match("CoreType", 8, infos[i].name)) {
1801 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1802  if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
1803  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
1804  break;
1805  } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
1806  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
1807  break;
1808  }
1809 #endif
1810  }
1811  }
1812  }
1813  }
1814  }
1815 #endif
1816 
1817  root = hwloc_get_root_obj(tp);
1818 
1819  // Figure out the depth and types in the topology
1820  depth = 0;
1821  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1822  KMP_ASSERT(pu);
1823  obj = pu;
1824  types[depth] = KMP_HW_THREAD;
1825  hwloc_types[depth] = obj->type;
1826  depth++;
1827  while (obj != root && obj != NULL) {
1828  obj = obj->parent;
1829 #if HWLOC_API_VERSION >= 0x00020000
1830  if (obj->memory_arity) {
1831  hwloc_obj_t memory;
1832  for (memory = obj->memory_first_child; memory;
1833  memory = hwloc_get_next_child(tp, obj, memory)) {
1834  if (memory->type == HWLOC_OBJ_NUMANODE)
1835  break;
1836  }
1837  if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1838  types[depth] = KMP_HW_NUMA;
1839  hwloc_types[depth] = memory->type;
1840  depth++;
1841  }
1842  }
1843 #endif
1844  type = __kmp_hwloc_type_2_topology_type(obj);
1845  if (type != KMP_HW_UNKNOWN) {
1846  types[depth] = type;
1847  hwloc_types[depth] = obj->type;
1848  depth++;
1849  }
1850  }
1851  KMP_ASSERT(depth > 0);
1852 
1853  // Get the order for the types correct
1854  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1855  hwloc_obj_type_t hwloc_temp = hwloc_types[i];
1856  kmp_hw_t temp = types[i];
1857  types[i] = types[j];
1858  types[j] = temp;
1859  hwloc_types[i] = hwloc_types[j];
1860  hwloc_types[j] = hwloc_temp;
1861  }
1862 
1863  // Allocate the data structure to be returned.
1864  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1865 
1866  hw_thread_index = 0;
1867  pu = NULL;
1868  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
1869  int index = depth - 1;
1870  bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1871  kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1872  if (included) {
1873  hw_thread.clear();
1874  hw_thread.ids[index] = pu->logical_index;
1875  hw_thread.os_id = pu->os_index;
1876  // If multiple core types, then set that attribute for the hardware thread
1877 #if HWLOC_API_VERSION >= 0x00020400
1878  if (cpukinds) {
1879  int cpukind_index = -1;
1880  for (int i = 0; i < nr_cpu_kinds; ++i) {
1881  if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
1882  cpukind_index = i;
1883  break;
1884  }
1885  }
1886  if (cpukind_index >= 0) {
1887  hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
1888  hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
1889  }
1890  }
1891 #endif
1892  index--;
1893  }
1894  obj = pu;
1895  prev = obj;
1896  while (obj != root && obj != NULL) {
1897  obj = obj->parent;
1898 #if HWLOC_API_VERSION >= 0x00020000
1899  // NUMA Nodes are handled differently since they are not within the
1900  // parent/child structure anymore. They are separate children
1901  // of obj (memory_first_child points to first memory child)
1902  if (obj->memory_arity) {
1903  hwloc_obj_t memory;
1904  for (memory = obj->memory_first_child; memory;
1905  memory = hwloc_get_next_child(tp, obj, memory)) {
1906  if (memory->type == HWLOC_OBJ_NUMANODE)
1907  break;
1908  }
1909  if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1910  sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
1911  if (included) {
1912  hw_thread.ids[index] = memory->logical_index;
1913  hw_thread.ids[index + 1] = sub_id;
1914  index--;
1915  }
1916  prev = memory;
1917  }
1918  prev = obj;
1919  }
1920 #endif
1921  type = __kmp_hwloc_type_2_topology_type(obj);
1922  if (type != KMP_HW_UNKNOWN) {
1923  sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
1924  if (included) {
1925  hw_thread.ids[index] = obj->logical_index;
1926  hw_thread.ids[index + 1] = sub_id;
1927  index--;
1928  }
1929  prev = obj;
1930  }
1931  }
1932  if (included)
1933  hw_thread_index++;
1934  }
1935 
1936 #if HWLOC_API_VERSION >= 0x00020400
1937  // Free the core types information
1938  if (cpukinds) {
1939  for (int idx = 0; idx < nr_cpu_kinds; ++idx)
1940  hwloc_bitmap_free(cpukinds[idx].mask);
1941  __kmp_free(cpukinds);
1942  }
1943 #endif
1944  __kmp_topology->sort_ids();
1945  return true;
1946 }
1947 #endif // KMP_USE_HWLOC
1948 
1949 // If we don't know how to retrieve the machine's processor topology, or
1950 // encounter an error in doing so, this routine is called to form a "flat"
1951 // mapping of os thread id's <-> processor id's.
1952 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1953  *msg_id = kmp_i18n_null;
1954  int depth = 3;
1955  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1956 
1957  if (__kmp_affinity.flags.verbose) {
1958  KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1959  }
1960 
1961  // Even if __kmp_affinity.type == affinity_none, this routine might still
1962  // be called to set __kmp_ncores, as well as
1963  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1964  if (!KMP_AFFINITY_CAPABLE()) {
1965  KMP_ASSERT(__kmp_affinity.type == affinity_none);
1966  __kmp_ncores = nPackages = __kmp_xproc;
1967  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1968  return true;
1969  }
1970 
1971  // When affinity is off, this routine will still be called to set
1972  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1973  // Make sure all these vars are set correctly, and return now if affinity is
1974  // not enabled.
1975  __kmp_ncores = nPackages = __kmp_avail_proc;
1976  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1977 
1978  // Construct the data structure to be returned.
1979  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1980  int avail_ct = 0;
1981  int i;
1982  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1983  // Skip this proc if it is not included in the machine model.
1984  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1985  continue;
1986  }
1987  kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
1988  hw_thread.clear();
1989  hw_thread.os_id = i;
1990  hw_thread.ids[0] = i;
1991  hw_thread.ids[1] = 0;
1992  hw_thread.ids[2] = 0;
1993  avail_ct++;
1994  }
1995  if (__kmp_affinity.flags.verbose) {
1996  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
1997  }
1998  return true;
1999 }
2000 
2001 #if KMP_GROUP_AFFINITY
2002 // If multiple Windows* OS processor groups exist, we can create a 2-level
2003 // topology map with the groups at level 0 and the individual procs at level 1.
2004 // This facilitates letting the threads float among all procs in a group,
2005 // if granularity=group (the default when there are multiple groups).
2006 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
2007  *msg_id = kmp_i18n_null;
2008  int depth = 3;
2009  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
 2010  static const size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
2011 
2012  if (__kmp_affinity.flags.verbose) {
2013  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
2014  }
2015 
2016  // If we aren't affinity capable, then use flat topology
2017  if (!KMP_AFFINITY_CAPABLE()) {
2018  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2019  nPackages = __kmp_num_proc_groups;
2020  __kmp_nThreadsPerCore = 1;
2021  __kmp_ncores = __kmp_xproc;
 2022  nCoresPerPkg = __kmp_ncores / nPackages;
2023  return true;
2024  }
2025 
2026  // Construct the data structure to be returned.
2027  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2028  int avail_ct = 0;
2029  int i;
2030  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2031  // Skip this proc if it is not included in the machine model.
2032  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2033  continue;
2034  }
2035  kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
2036  hw_thread.clear();
2037  hw_thread.os_id = i;
2038  hw_thread.ids[0] = i / BITS_PER_GROUP;
2039  hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
2040  }
2041  return true;
2042 }
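// For illustration, assuming 64-bit masks (BITS_PER_GROUP == 64): OS proc 70
// maps to ids {1, 6, 6}, i.e. processor group 1, and "core"/"thread" 6 within
// that group.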
2043 #endif /* KMP_GROUP_AFFINITY */
2044 
2045 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2046 
2047 template <kmp_uint32 LSB, kmp_uint32 MSB>
2048 static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
2049  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
2050  const kmp_uint32 SHIFT_RIGHT = LSB;
2051  kmp_uint32 retval = v;
2052  retval <<= SHIFT_LEFT;
2053  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
2054  return retval;
2055 }
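// Worked example (kmp_uint32 is 32 bits wide): __kmp_extract_bits<5, 7>(v)
// isolates bits 7:5 of v, right-justified. For v = 0xE0, SHIFT_LEFT = 24 and
// SHIFT_RIGHT = 5, so (0xE0 << 24) >> 29 == 0x7.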
2056 
2057 static int __kmp_cpuid_mask_width(int count) {
2058  int r = 0;
2059 
2060  while ((1 << r) < count)
2061  ++r;
2062  return r;
2063 }
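// Worked example: __kmp_cpuid_mask_width(6) == 3, since 2^3 = 8 is the
// smallest power of two >= 6; a count of 1 yields 0. The result is the number
// of APIC id bits needed to enumerate 'count' items.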
2064 
2065 class apicThreadInfo {
2066 public:
2067  unsigned osId; // param to __kmp_affinity_bind_thread
2068  unsigned apicId; // from cpuid after binding
2069  unsigned maxCoresPerPkg; // ""
2070  unsigned maxThreadsPerPkg; // ""
2071  unsigned pkgId; // inferred from above values
2072  unsigned coreId; // ""
2073  unsigned threadId; // ""
2074 };
2075 
2076 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
2077  const void *b) {
2078  const apicThreadInfo *aa = (const apicThreadInfo *)a;
2079  const apicThreadInfo *bb = (const apicThreadInfo *)b;
2080  if (aa->pkgId < bb->pkgId)
2081  return -1;
2082  if (aa->pkgId > bb->pkgId)
2083  return 1;
2084  if (aa->coreId < bb->coreId)
2085  return -1;
2086  if (aa->coreId > bb->coreId)
2087  return 1;
2088  if (aa->threadId < bb->threadId)
2089  return -1;
2090  if (aa->threadId > bb->threadId)
2091  return 1;
2092  return 0;
2093 }
2094 
2095 class kmp_cache_info_t {
2096 public:
2097  struct info_t {
2098  unsigned level, mask;
2099  };
2100  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
2101  size_t get_depth() const { return depth; }
2102  info_t &operator[](size_t index) { return table[index]; }
2103  const info_t &operator[](size_t index) const { return table[index]; }
2104 
2105  static kmp_hw_t get_topology_type(unsigned level) {
2106  KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
2107  switch (level) {
2108  case 1:
2109  return KMP_HW_L1;
2110  case 2:
2111  return KMP_HW_L2;
2112  case 3:
2113  return KMP_HW_L3;
2114  }
2115  return KMP_HW_UNKNOWN;
2116  }
2117 
2118 private:
2119  static const int MAX_CACHE_LEVEL = 3;
2120 
2121  size_t depth;
2122  info_t table[MAX_CACHE_LEVEL];
2123 
2124  void get_leaf4_levels() {
2125  unsigned level = 0;
2126  while (depth < MAX_CACHE_LEVEL) {
2127  unsigned cache_type, max_threads_sharing;
2128  unsigned cache_level, cache_mask_width;
2129  kmp_cpuid buf2;
2130  __kmp_x86_cpuid(4, level, &buf2);
2131  cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
2132  if (!cache_type)
2133  break;
2134  // Skip instruction caches
2135  if (cache_type == 2) {
2136  level++;
2137  continue;
2138  }
2139  max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
2140  cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
2141  cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
2142  table[depth].level = cache_level;
2143  table[depth].mask = ((-1) << cache_mask_width);
2144  depth++;
2145  level++;
2146  }
2147  }
2148 };
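// Worked example of get_leaf4_levels(): if cpuid(4, level) reports a data
// cache at cache_level 2 shared by up to 4 threads, then max_threads_sharing
// is 4, cache_mask_width is 2, and table[depth].mask == 0xFFFFFFFC -- APIC ids
// that are equal under this mask share that L2 cache.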
2149 
2150 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
2151 // an algorithm which cycles through the available os threads, setting
2152 // the current thread's affinity mask to that thread, and then retrieves
2153 // the Apic Id for each thread context using the cpuid instruction.
2154 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
2155  kmp_cpuid buf;
2156  *msg_id = kmp_i18n_null;
2157 
2158  if (__kmp_affinity.flags.verbose) {
2159  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
2160  }
2161 
2162  // Check if cpuid leaf 4 is supported.
2163  __kmp_x86_cpuid(0, 0, &buf);
2164  if (buf.eax < 4) {
2165  *msg_id = kmp_i18n_str_NoLeaf4Support;
2166  return false;
2167  }
2168 
2169  // The algorithm used starts by setting the affinity to each available thread
2170  // and retrieving info from the cpuid instruction, so if we are not capable of
 2171  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
2172  // need to do something else - use the defaults that we calculated from
2173  // issuing cpuid without binding to each proc.
2174  if (!KMP_AFFINITY_CAPABLE()) {
2175  // Hack to try and infer the machine topology using only the data
2176  // available from cpuid on the current thread, and __kmp_xproc.
2177  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2178 
2179  // Get an upper bound on the number of threads per package using cpuid(1).
 2180  // On some OS/chip combinations where HT is supported by the chip but is
2181  // disabled, this value will be 2 on a single core chip. Usually, it will be
2182  // 2 if HT is enabled and 1 if HT is disabled.
2183  __kmp_x86_cpuid(1, 0, &buf);
2184  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2185  if (maxThreadsPerPkg == 0) {
2186  maxThreadsPerPkg = 1;
2187  }
2188 
2189  // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
2190  // value.
2191  //
 2192  // The author of cpu_count.cpp treated this as only an upper bound on the
2193  // number of cores, but I haven't seen any cases where it was greater than
2194  // the actual number of cores, so we will treat it as exact in this block of
2195  // code.
2196  //
2197  // First, we need to check if cpuid(4) is supported on this chip. To see if
2198  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
2199  // greater.
2200  __kmp_x86_cpuid(0, 0, &buf);
2201  if (buf.eax >= 4) {
2202  __kmp_x86_cpuid(4, 0, &buf);
2203  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2204  } else {
2205  nCoresPerPkg = 1;
2206  }
2207 
2208  // There is no way to reliably tell if HT is enabled without issuing the
 2209  // cpuid instruction from every thread, and correlating the cpuid info, so
2210  // if the machine is not affinity capable, we assume that HT is off. We have
2211  // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
2212  // does not support HT.
2213  //
2214  // - Older OSes are usually found on machines with older chips, which do not
2215  // support HT.
2216  // - The performance penalty for mistakenly identifying a machine as HT when
2217  // it isn't (which results in blocktime being incorrectly set to 0) is
 2218  // greater than the penalty for mistakenly identifying a machine as
2219  // being 1 thread/core when it is really HT enabled (which results in
2220  // blocktime being incorrectly set to a positive value).
2221  __kmp_ncores = __kmp_xproc;
2222  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2223  __kmp_nThreadsPerCore = 1;
2224  return true;
2225  }
2226 
2227  // From here on, we can assume that it is safe to call
2228  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2229  // __kmp_affinity.type = affinity_none.
2230 
2231  // Save the affinity mask for the current thread.
2232  kmp_affinity_raii_t previous_affinity;
2233 
2234  // Run through each of the available contexts, binding the current thread
2235  // to it, and obtaining the pertinent information using the cpuid instr.
2236  //
2237  // The relevant information is:
2238  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
 2239  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
2240  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
2241  // of this field determines the width of the core# + thread# fields in the
2242  // Apic Id. It is also an upper bound on the number of threads per
 2243  // package, but it has been verified that situations happen where it is not
2244  // exact. In particular, on certain OS/chip combinations where Intel(R)
2245  // Hyper-Threading Technology is supported by the chip but has been
2246  // disabled, the value of this field will be 2 (for a single core chip).
2247  // On other OS/chip combinations supporting Intel(R) Hyper-Threading
2248  // Technology, the value of this field will be 1 when Intel(R)
2249  // Hyper-Threading Technology is disabled and 2 when it is enabled.
2250  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
2251  // of this field (+1) determines the width of the core# field in the Apic
2252  // Id. The comments in "cpucount.cpp" say that this value is an upper
2253  // bound, but the IA-32 architecture manual says that it is exactly the
2254  // number of cores per package, and I haven't seen any case where it
2255  // wasn't.
2256  //
2257  // From this information, deduce the package Id, core Id, and thread Id,
2258  // and set the corresponding fields in the apicThreadInfo struct.
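  // Worked example of the decomposition below: with maxThreadsPerPkg = 8 and
  // maxCoresPerPkg = 4, widthCT = 3, widthC = 2 and widthT = 1, so an Apic Id
  // of 0b101101 splits into pkgId = 0b101, coreId = 0b10, threadId = 0b1.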
2259  unsigned i;
2260  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
2261  __kmp_avail_proc * sizeof(apicThreadInfo));
2262  unsigned nApics = 0;
2263  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2264  // Skip this proc if it is not included in the machine model.
2265  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2266  continue;
2267  }
2268  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
2269 
2270  __kmp_affinity_dispatch->bind_thread(i);
2271  threadInfo[nApics].osId = i;
2272 
2273  // The apic id and max threads per pkg come from cpuid(1).
2274  __kmp_x86_cpuid(1, 0, &buf);
2275  if (((buf.edx >> 9) & 1) == 0) {
2276  __kmp_free(threadInfo);
2277  *msg_id = kmp_i18n_str_ApicNotPresent;
2278  return false;
2279  }
2280  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
2281  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2282  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
2283  threadInfo[nApics].maxThreadsPerPkg = 1;
2284  }
2285 
2286  // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
2287  // value.
2288  //
2289  // First, we need to check if cpuid(4) is supported on this chip. To see if
2290  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
2291  // or greater.
2292  __kmp_x86_cpuid(0, 0, &buf);
2293  if (buf.eax >= 4) {
2294  __kmp_x86_cpuid(4, 0, &buf);
2295  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2296  } else {
2297  threadInfo[nApics].maxCoresPerPkg = 1;
2298  }
2299 
2300  // Infer the pkgId / coreId / threadId using only the info obtained locally.
2301  int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
2302  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
2303 
2304  int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
2305  int widthT = widthCT - widthC;
2306  if (widthT < 0) {
2307  // I've never seen this one happen, but I suppose it could, if the cpuid
2308  // instruction on a chip was really screwed up. Make sure to restore the
2309  // affinity mask before the tail call.
2310  __kmp_free(threadInfo);
2311  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2312  return false;
2313  }
2314 
2315  int maskC = (1 << widthC) - 1;
2316  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
2317 
2318  int maskT = (1 << widthT) - 1;
2319  threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
2320 
2321  nApics++;
2322  }
2323 
2324  // We've collected all the info we need.
2325  // Restore the old affinity mask for this thread.
2326  previous_affinity.restore();
2327 
2328  // Sort the threadInfo table by physical Id.
2329  qsort(threadInfo, nApics, sizeof(*threadInfo),
2330  __kmp_affinity_cmp_apicThreadInfo_phys_id);
2331 
2332  // The table is now sorted by pkgId / coreId / threadId, but we really don't
2333  // know the radix of any of the fields. pkgId's may be sparsely assigned among
2334  // the chips on a system. Although coreId's are usually assigned
2335  // [0 .. coresPerPkg-1] and threadId's are usually assigned
2336  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2337  //
2338  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2339  // total # packages) are at this point - we want to determine that now. We
2340  // only have an upper bound on the first two figures.
2341  //
2342  // We also perform a consistency check at this point: the values returned by
2343  // the cpuid instruction for any thread bound to a given package had better
2344  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
2345  nPackages = 1;
2346  nCoresPerPkg = 1;
2347  __kmp_nThreadsPerCore = 1;
2348  unsigned nCores = 1;
2349 
2350  unsigned pkgCt = 1; // to determine radii
2351  unsigned lastPkgId = threadInfo[0].pkgId;
2352  unsigned coreCt = 1;
2353  unsigned lastCoreId = threadInfo[0].coreId;
2354  unsigned threadCt = 1;
2355  unsigned lastThreadId = threadInfo[0].threadId;
2356 
 2357  // intra-pkg consistency checks
2358  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
2359  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
2360 
2361  for (i = 1; i < nApics; i++) {
2362  if (threadInfo[i].pkgId != lastPkgId) {
2363  nCores++;
2364  pkgCt++;
2365  lastPkgId = threadInfo[i].pkgId;
2366  if ((int)coreCt > nCoresPerPkg)
2367  nCoresPerPkg = coreCt;
2368  coreCt = 1;
2369  lastCoreId = threadInfo[i].coreId;
2370  if ((int)threadCt > __kmp_nThreadsPerCore)
2371  __kmp_nThreadsPerCore = threadCt;
2372  threadCt = 1;
2373  lastThreadId = threadInfo[i].threadId;
2374 
2375  // This is a different package, so go on to the next iteration without
2376  // doing any consistency checks. Reset the consistency check vars, though.
2377  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
2378  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
2379  continue;
2380  }
2381 
2382  if (threadInfo[i].coreId != lastCoreId) {
2383  nCores++;
2384  coreCt++;
2385  lastCoreId = threadInfo[i].coreId;
2386  if ((int)threadCt > __kmp_nThreadsPerCore)
2387  __kmp_nThreadsPerCore = threadCt;
2388  threadCt = 1;
2389  lastThreadId = threadInfo[i].threadId;
2390  } else if (threadInfo[i].threadId != lastThreadId) {
2391  threadCt++;
2392  lastThreadId = threadInfo[i].threadId;
2393  } else {
2394  __kmp_free(threadInfo);
2395  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2396  return false;
2397  }
2398 
2399  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
 2400  // fields agree between all the threads bound to a given package.
2401  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
2402  (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
2403  __kmp_free(threadInfo);
2404  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
2405  return false;
2406  }
2407  }
2408  // When affinity is off, this routine will still be called to set
2409  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2410  // Make sure all these vars are set correctly
2411  nPackages = pkgCt;
2412  if ((int)coreCt > nCoresPerPkg)
2413  nCoresPerPkg = coreCt;
2414  if ((int)threadCt > __kmp_nThreadsPerCore)
2415  __kmp_nThreadsPerCore = threadCt;
2416  __kmp_ncores = nCores;
2417  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
2418 
2419  // Now that we've determined the number of packages, the number of cores per
2420  // package, and the number of threads per core, we can construct the data
2421  // structure that is to be returned.
2422  int idx = 0;
2423  int pkgLevel = 0;
2424  int coreLevel = 1;
2425  int threadLevel = 2;
2426  //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2427  int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
2428  kmp_hw_t types[3];
2429  if (pkgLevel >= 0)
2430  types[idx++] = KMP_HW_SOCKET;
2431  if (coreLevel >= 0)
2432  types[idx++] = KMP_HW_CORE;
2433  if (threadLevel >= 0)
2434  types[idx++] = KMP_HW_THREAD;
2435 
2436  KMP_ASSERT(depth > 0);
2437  __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
2438 
2439  for (i = 0; i < nApics; ++i) {
2440  idx = 0;
2441  unsigned os = threadInfo[i].osId;
2442  kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2443  hw_thread.clear();
2444 
2445  if (pkgLevel >= 0) {
2446  hw_thread.ids[idx++] = threadInfo[i].pkgId;
2447  }
2448  if (coreLevel >= 0) {
2449  hw_thread.ids[idx++] = threadInfo[i].coreId;
2450  }
2451  if (threadLevel >= 0) {
2452  hw_thread.ids[idx++] = threadInfo[i].threadId;
2453  }
2454  hw_thread.os_id = os;
2455  }
2456 
2457  __kmp_free(threadInfo);
2458  __kmp_topology->sort_ids();
2459  if (!__kmp_topology->check_ids()) {
2460  kmp_topology_t::deallocate(__kmp_topology);
2461  __kmp_topology = nullptr;
2462  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2463  return false;
2464  }
2465  return true;
2466 }
2467 
2468 // Hybrid cpu detection using CPUID.1A
2469 // Thread should be pinned to processor already
2470 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
2471  unsigned *native_model_id) {
2472  kmp_cpuid buf;
2473  __kmp_x86_cpuid(0x1a, 0, &buf);
2474  *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
2475  switch (*type) {
2476  case KMP_HW_CORE_TYPE_ATOM:
2477  *efficiency = 0;
2478  break;
2479  case KMP_HW_CORE_TYPE_CORE:
2480  *efficiency = 1;
2481  break;
2482  default:
2483  *efficiency = 0;
2484  }
2485  *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
2486 }
2487 
2488 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
2489 // architectures support a newer interface for specifying the x2APIC Ids,
2490 // based on CPUID.B or CPUID.1F
2491 /*
2492  * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
2493  Bits Bits Bits Bits
2494  31-16 15-8 7-4 4-0
2495 ---+-----------+--------------+-------------+-----------------+
2496 EAX| reserved | reserved | reserved | Bits to Shift |
2497 ---+-----------|--------------+-------------+-----------------|
2498 EBX| reserved | Num logical processors at level (16 bits) |
2499 ---+-----------|--------------+-------------------------------|
2500 ECX| reserved | Level Type | Level Number (8 bits) |
2501 ---+-----------+--------------+-------------------------------|
2502 EDX| X2APIC ID (32 bits) |
2503 ---+----------------------------------------------------------+
2504 */
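// Example decode (typical SMT-capable part): sub leaf 0 reports the SMT level
// (ECX[15:8] == 1) with the thread-id width in EAX[4:0]; sub leaf 1 reports
// the core level (ECX[15:8] == 2). EDX always holds the full 32-bit x2APIC id
// of the logical processor issuing the instruction.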
2505 
2506 enum {
2507  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
2508  INTEL_LEVEL_TYPE_SMT = 1,
2509  INTEL_LEVEL_TYPE_CORE = 2,
2510  INTEL_LEVEL_TYPE_MODULE = 3,
2511  INTEL_LEVEL_TYPE_TILE = 4,
2512  INTEL_LEVEL_TYPE_DIE = 5,
2513  INTEL_LEVEL_TYPE_LAST = 6,
2514 };
2515 
2516 struct cpuid_level_info_t {
2517  unsigned level_type, mask, mask_width, nitems, cache_mask;
2518 };
2519 
2520 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
2521  switch (intel_type) {
2522  case INTEL_LEVEL_TYPE_INVALID:
2523  return KMP_HW_SOCKET;
2524  case INTEL_LEVEL_TYPE_SMT:
2525  return KMP_HW_THREAD;
2526  case INTEL_LEVEL_TYPE_CORE:
2527  return KMP_HW_CORE;
2528  case INTEL_LEVEL_TYPE_TILE:
2529  return KMP_HW_TILE;
2530  case INTEL_LEVEL_TYPE_MODULE:
2531  return KMP_HW_MODULE;
2532  case INTEL_LEVEL_TYPE_DIE:
2533  return KMP_HW_DIE;
2534  }
2535  return KMP_HW_UNKNOWN;
2536 }
2537 
2538 // This function takes the topology leaf, a levels array to store the levels
2539 // detected and a bitmap of the known levels.
2540 // Returns the number of levels in the topology
2541 static unsigned
2542 __kmp_x2apicid_get_levels(int leaf,
2543  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
2544  kmp_uint64 known_levels) {
2545  unsigned level, levels_index;
2546  unsigned level_type, mask_width, nitems;
2547  kmp_cpuid buf;
2548 
 2549  // When unknown topology layers exist, each known layer takes on the
 2550  // characteristics of the unknown layers directly above it.
 2551  // e.g., Suppose the layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X>,
 2552  // <Y> and <Z> are unknown layers. Then SMT will take the characteristics of
2553  // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>).
2554  // This eliminates unknown portions of the topology while still keeping the
2555  // correct structure.
2556  level = levels_index = 0;
2557  do {
2558  __kmp_x86_cpuid(leaf, level, &buf);
2559  level_type = __kmp_extract_bits<8, 15>(buf.ecx);
2560  mask_width = __kmp_extract_bits<0, 4>(buf.eax);
2561  nitems = __kmp_extract_bits<0, 15>(buf.ebx);
2562  if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
2563  return 0;
2564 
2565  if (known_levels & (1ull << level_type)) {
2566  // Add a new level to the topology
2567  KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
2568  levels[levels_index].level_type = level_type;
2569  levels[levels_index].mask_width = mask_width;
2570  levels[levels_index].nitems = nitems;
2571  levels_index++;
2572  } else {
2573  // If it is an unknown level, then logically move the previous layer up
2574  if (levels_index > 0) {
2575  levels[levels_index - 1].mask_width = mask_width;
2576  levels[levels_index - 1].nitems = nitems;
2577  }
2578  }
2579  level++;
2580  } while (level_type != INTEL_LEVEL_TYPE_INVALID);
2581 
2582  // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
2583  if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
2584  return 0;
2585 
2586  // Set the masks to & with apicid
2587  for (unsigned i = 0; i < levels_index; ++i) {
2588  if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
2589  levels[i].mask = ~((-1) << levels[i].mask_width);
2590  levels[i].cache_mask = (-1) << levels[i].mask_width;
2591  for (unsigned j = 0; j < i; ++j)
2592  levels[i].mask ^= levels[j].mask;
2593  } else {
2594  KMP_DEBUG_ASSERT(i > 0);
2595  levels[i].mask = (-1) << levels[i - 1].mask_width;
2596  levels[i].cache_mask = 0;
2597  }
2598  }
2599  return levels_index;
2600 }
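// Worked example of the masks computed above: with an SMT level of mask_width
// 1 and a CORE level of mask_width 4, SMT gets mask 0x1 (cache_mask
// 0xFFFFFFFE), CORE gets mask 0xF ^ 0x1 == 0xE (cache_mask 0xFFFFFFF0), and
// the package (INVALID) level gets mask (-1) << 4 == 0xFFFFFFF0.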
2601 
2602 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
2603 
2604  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
2605  kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
2606  unsigned levels_index;
2607  kmp_cpuid buf;
2608  kmp_uint64 known_levels;
2609  int topology_leaf, highest_leaf, apic_id;
2610  int num_leaves;
2611  static int leaves[] = {0, 0};
2612 
2613  kmp_i18n_id_t leaf_message_id;
2614 
2615  KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
2616 
2617  *msg_id = kmp_i18n_null;
2618  if (__kmp_affinity.flags.verbose) {
2619  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
2620  }
2621 
2622  // Figure out the known topology levels
2623  known_levels = 0ull;
2624  for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
2625  if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
2626  known_levels |= (1ull << i);
2627  }
2628  }
2629 
2630  // Get the highest cpuid leaf supported
2631  __kmp_x86_cpuid(0, 0, &buf);
2632  highest_leaf = buf.eax;
2633 
 2634  // If a specific topology method was requested, only allow that specific leaf;
 2635  // otherwise, try both leaves 31 and 11 in that order.
2636  num_leaves = 0;
2637  if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
2638  num_leaves = 1;
2639  leaves[0] = 11;
2640  leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2641  } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
2642  num_leaves = 1;
2643  leaves[0] = 31;
2644  leaf_message_id = kmp_i18n_str_NoLeaf31Support;
2645  } else {
2646  num_leaves = 2;
2647  leaves[0] = 31;
2648  leaves[1] = 11;
2649  leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2650  }
2651 
2652  // Check to see if cpuid leaf 31 or 11 is supported.
2653  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2654  topology_leaf = -1;
2655  for (int i = 0; i < num_leaves; ++i) {
2656  int leaf = leaves[i];
2657  if (highest_leaf < leaf)
2658  continue;
2659  __kmp_x86_cpuid(leaf, 0, &buf);
2660  if (buf.ebx == 0)
2661  continue;
2662  topology_leaf = leaf;
2663  levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
2664  if (levels_index == 0)
2665  continue;
2666  break;
2667  }
2668  if (topology_leaf == -1 || levels_index == 0) {
2669  *msg_id = leaf_message_id;
2670  return false;
2671  }
2672  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
2673 
2674  // The algorithm used starts by setting the affinity to each available thread
2675  // and retrieving info from the cpuid instruction, so if we are not capable of
 2676  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
2677  // we need to do something else - use the defaults that we calculated from
2678  // issuing cpuid without binding to each proc.
2679  if (!KMP_AFFINITY_CAPABLE()) {
2680  // Hack to try and infer the machine topology using only the data
2681  // available from cpuid on the current thread, and __kmp_xproc.
2682  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2683  for (unsigned i = 0; i < levels_index; ++i) {
2684  if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
2685  __kmp_nThreadsPerCore = levels[i].nitems;
2686  } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
2687  nCoresPerPkg = levels[i].nitems;
2688  }
2689  }
2690  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
2691  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2692  return true;
2693  }
2694 
2695  // Allocate the data structure to be returned.
2696  int depth = levels_index;
2697  for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2698  types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
2699  __kmp_topology =
2700  kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
2701 
2702  // Insert equivalent cache types if they exist
2703  kmp_cache_info_t cache_info;
2704  for (size_t i = 0; i < cache_info.get_depth(); ++i) {
2705  const kmp_cache_info_t::info_t &info = cache_info[i];
2706  unsigned cache_mask = info.mask;
2707  unsigned cache_level = info.level;
2708  for (unsigned j = 0; j < levels_index; ++j) {
2709  unsigned hw_cache_mask = levels[j].cache_mask;
2710  kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2711  if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2712  kmp_hw_t type =
2713  __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2714  __kmp_topology->set_equivalent_type(cache_type, type);
2715  }
2716  }
2717  }
2718 
2719  // From here on, we can assume that it is safe to call
2720  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2721  // __kmp_affinity.type = affinity_none.
2722 
2723  // Save the affinity mask for the current thread.
2724  kmp_affinity_raii_t previous_affinity;
2725 
2726  // Run through each of the available contexts, binding the current thread
2727  // to it, and obtaining the pertinent information using the cpuid instr.
2728  unsigned int proc;
2729  int hw_thread_index = 0;
2730  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2731  cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2732  unsigned my_levels_index;
2733 
2734  // Skip this proc if it is not included in the machine model.
2735  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2736  continue;
2737  }
2738  KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2739 
2740  __kmp_affinity_dispatch->bind_thread(proc);
2741 
2742  // New algorithm
2743  __kmp_x86_cpuid(topology_leaf, 0, &buf);
2744  apic_id = buf.edx;
2745  kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2746  my_levels_index =
2747  __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2748  if (my_levels_index == 0 || my_levels_index != levels_index) {
2749  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2750  return false;
2751  }
2752  hw_thread.clear();
2753  hw_thread.os_id = proc;
2754  // Put in topology information
2755  for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2756  hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2757  if (j > 0) {
2758  hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2759  }
2760  }
2761  // Hybrid information
2762  if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
2763  kmp_hw_core_type_t type;
2764  unsigned native_model_id;
2765  int efficiency;
2766  __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
2767  hw_thread.attrs.set_core_type(type);
2768  hw_thread.attrs.set_core_eff(efficiency);
2769  }
2770  hw_thread_index++;
2771  }
2772  KMP_ASSERT(hw_thread_index > 0);
2773  __kmp_topology->sort_ids();
2774  if (!__kmp_topology->check_ids()) {
2775  kmp_topology_t::deallocate(__kmp_topology);
2776  __kmp_topology = nullptr;
2777  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2778  return false;
2779  }
2780  return true;
2781 }
2782 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2783 
2784 #define osIdIndex 0
2785 #define threadIdIndex 1
2786 #define coreIdIndex 2
2787 #define pkgIdIndex 3
2788 #define nodeIdIndex 4
2789 
2790 typedef unsigned *ProcCpuInfo;
2791 static unsigned maxIndex = pkgIdIndex;
2792 
2793 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2794  const void *b) {
2795  unsigned i;
2796  const unsigned *aa = *(unsigned *const *)a;
2797  const unsigned *bb = *(unsigned *const *)b;
2798  for (i = maxIndex;; i--) {
2799  if (aa[i] < bb[i])
2800  return -1;
2801  if (aa[i] > bb[i])
2802  return 1;
2803  if (i == osIdIndex)
2804  break;
2805  }
2806  return 0;
2807 }
2808 
2809 #if KMP_USE_HIER_SCHED
2810 // Set the array sizes for the hierarchy layers
2811 static void __kmp_dispatch_set_hierarchy_values() {
2812  // Set the maximum number of L1's to number of cores
2813  // Set the maximum number of L2's to either number of cores / 2 for
 2814  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
2815  // Or the number of cores for Intel(R) Xeon(R) processors
2816  // Set the maximum number of NUMA nodes and L3's to number of packages
2817  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2818  nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2819  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2820 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2821  KMP_MIC_SUPPORTED
2822  if (__kmp_mic_type >= mic3)
2823  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2824  else
 2825 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && KMP_MIC_SUPPORTED
2826  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2827  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2828  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2829  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2830  // Set the number of threads per unit
2831  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2832  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2833  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2834  __kmp_nThreadsPerCore;
2835 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2836  KMP_MIC_SUPPORTED
2837  if (__kmp_mic_type >= mic3)
2838  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2839  2 * __kmp_nThreadsPerCore;
2840  else
 2841 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && KMP_MIC_SUPPORTED
2842  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2843  __kmp_nThreadsPerCore;
2844  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2845  nCoresPerPkg * __kmp_nThreadsPerCore;
2846  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2847  nCoresPerPkg * __kmp_nThreadsPerCore;
2848  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2849  nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2850 }
2851 
2852 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2853 // i.e., this thread's L1 or this thread's L2, etc.
2854 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2855  int index = type + 1;
2856  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2857  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2858  if (type == kmp_hier_layer_e::LAYER_THREAD)
2859  return tid;
2860  else if (type == kmp_hier_layer_e::LAYER_LOOP)
2861  return 0;
2862  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2863  if (tid >= num_hw_threads)
2864  tid = tid % num_hw_threads;
2865  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2866 }
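// Example: with 2 threads per core, __kmp_dispatch_get_index(5, LAYER_L1)
// returns (5 / 2) % __kmp_ncores, so tids 4 and 5 map to the same L1 index.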
2867 
2868 // Return the number of t1's per t2
2869 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2870  int i1 = t1 + 1;
2871  int i2 = t2 + 1;
2872  KMP_DEBUG_ASSERT(i1 <= i2);
2873  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2874  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2875  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2876  // (nthreads/t2) / (nthreads/t1) = t1 / t2
2877  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2878 }
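// Example (non-MIC path): __kmp_dispatch_get_t1_per_t2(LAYER_L1, LAYER_L3)
// returns (nCoresPerPkg * __kmp_nThreadsPerCore) / __kmp_nThreadsPerCore ==
// nCoresPerPkg, i.e. the number of L1 caches (cores) under each L3.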
2879 #endif // KMP_USE_HIER_SCHED
2880 
2881 static inline const char *__kmp_cpuinfo_get_filename() {
2882  const char *filename;
2883  if (__kmp_cpuinfo_file != nullptr)
2884  filename = __kmp_cpuinfo_file;
2885  else
2886  filename = "/proc/cpuinfo";
2887  return filename;
2888 }
2889 
2890 static inline const char *__kmp_cpuinfo_get_envvar() {
2891  const char *envvar = nullptr;
2892  if (__kmp_cpuinfo_file != nullptr)
2893  envvar = "KMP_CPUINFO_FILE";
2894  return envvar;
2895 }
2896 
2897 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2898 // affinity map.
2899 static bool __kmp_affinity_create_cpuinfo_map(int *line,
2900  kmp_i18n_id_t *const msg_id) {
2901  const char *filename = __kmp_cpuinfo_get_filename();
2902  const char *envvar = __kmp_cpuinfo_get_envvar();
2903  *msg_id = kmp_i18n_null;
2904 
2905  if (__kmp_affinity.flags.verbose) {
2906  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2907  }
2908 
2909  kmp_safe_raii_file_t f(filename, "r", envvar);
2910 
 2911  // Scan the file once, counting the number of "processor" (osId) fields,
 2912  // and find the highest value of <n> for a node_<n> field.
2913  char buf[256];
2914  unsigned num_records = 0;
2915  while (!feof(f)) {
2916  buf[sizeof(buf) - 1] = 1;
2917  if (!fgets(buf, sizeof(buf), f)) {
2918  // Read errors presumably because of EOF
2919  break;
2920  }
2921 
2922  char s1[] = "processor";
2923  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2924  num_records++;
2925  continue;
2926  }
2927 
2928  // FIXME - this will match "node_<n> <garbage>"
2929  unsigned level;
2930  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
 2931  // validate the input first:
2932  if (level > (unsigned)__kmp_xproc) { // level is too big
2933  level = __kmp_xproc;
2934  }
2935  if (nodeIdIndex + level >= maxIndex) {
2936  maxIndex = nodeIdIndex + level;
2937  }
2938  continue;
2939  }
2940  }
2941 
2942  // Check for empty file / no valid processor records, or too many. The number
2943  // of records can't exceed the number of valid bits in the affinity mask.
2944  if (num_records == 0) {
2945  *msg_id = kmp_i18n_str_NoProcRecords;
2946  return false;
2947  }
2948  if (num_records > (unsigned)__kmp_xproc) {
2949  *msg_id = kmp_i18n_str_TooManyProcRecords;
2950  return false;
2951  }
2952 
2953  // Set the file pointer back to the beginning, so that we can scan the file
2954  // again, this time performing a full parse of the data. Allocate a vector of
2955  // ProcCpuInfo object, where we will place the data. Adding an extra element
2956  // at the end allows us to remove a lot of extra checks for termination
2957  // conditions.
2958  if (fseek(f, 0, SEEK_SET) != 0) {
2959  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2960  return false;
2961  }
2962 
2963  // Allocate the array of records to store the proc info in. The dummy
2964  // element at the end makes the logic in filling them out easier to code.
2965  unsigned **threadInfo =
2966  (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2967  unsigned i;
2968  for (i = 0; i <= num_records; i++) {
2969  threadInfo[i] =
2970  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2971  }
2972 
2973 #define CLEANUP_THREAD_INFO \
2974  for (i = 0; i <= num_records; i++) { \
2975  __kmp_free(threadInfo[i]); \
2976  } \
2977  __kmp_free(threadInfo);
2978 
2979  // A value of UINT_MAX means that we didn't find the field
2980  unsigned __index;
2981 
2982 #define INIT_PROC_INFO(p) \
2983  for (__index = 0; __index <= maxIndex; __index++) { \
2984  (p)[__index] = UINT_MAX; \
2985  }
2986 
2987  for (i = 0; i <= num_records; i++) {
2988  INIT_PROC_INFO(threadInfo[i]);
2989  }
2990 
2991  unsigned num_avail = 0;
2992  *line = 0;
2993  while (!feof(f)) {
2994  // Create an inner scoping level, so that all the goto targets at the end of
2995  // the loop appear in an outer scoping level. This avoids warnings about
2996  // jumping past an initialization to a target in the same block.
2997  {
2998  buf[sizeof(buf) - 1] = 1;
2999  bool long_line = false;
3000  if (!fgets(buf, sizeof(buf), f)) {
3001  // Read errors presumably because of EOF
3002  // If there is valid data in threadInfo[num_avail], then fake
 3003  // a blank line to ensure that the last address gets parsed.
3004  bool valid = false;
3005  for (i = 0; i <= maxIndex; i++) {
3006  if (threadInfo[num_avail][i] != UINT_MAX) {
3007  valid = true;
3008  }
3009  }
3010  if (!valid) {
3011  break;
3012  }
3013  buf[0] = 0;
3014  } else if (!buf[sizeof(buf) - 1]) {
3015  // The line is longer than the buffer. Set a flag and don't
3016  // emit an error if we were going to ignore the line, anyway.
3017  long_line = true;
3018 
3019 #define CHECK_LINE \
3020  if (long_line) { \
3021  CLEANUP_THREAD_INFO; \
3022  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
3023  return false; \
3024  }
3025  }
3026  (*line)++;
3027 
3028 #if KMP_ARCH_LOONGARCH64
3029  // The parsing logic of /proc/cpuinfo in this function highly depends on
3030  // the blank lines between each processor info block. But on LoongArch a
3031  // blank line exists before the first processor info block (i.e. after the
3032  // "system type" line). This blank line was added because the "system
3033  // type" line is unrelated to any of the CPUs. We must skip this line so
3034  // that the original logic works on LoongArch.
3035  if (*buf == '\n' && *line == 2)
3036  continue;
3037 #endif
3038 
3039  char s1[] = "processor";
3040  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
3041  CHECK_LINE;
3042  char *p = strchr(buf + sizeof(s1) - 1, ':');
3043  unsigned val;
3044  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3045  goto no_val;
3046  if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
3047 #if KMP_ARCH_AARCH64
3048  // Handle the old AArch64 /proc/cpuinfo layout differently,
3049  // it contains all of the 'processor' entries listed in a
3050  // single 'Processor' section, therefore the normal looking
3051  // for duplicates in that section will always fail.
3052  num_avail++;
3053 #else
3054  goto dup_field;
3055 #endif
3056  threadInfo[num_avail][osIdIndex] = val;
3057 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3058  char path[256];
3059  KMP_SNPRINTF(
3060  path, sizeof(path),
3061  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
3062  threadInfo[num_avail][osIdIndex]);
3063  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
3064 
3065  KMP_SNPRINTF(path, sizeof(path),
3066  "/sys/devices/system/cpu/cpu%u/topology/core_id",
3067  threadInfo[num_avail][osIdIndex]);
3068  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
3069  continue;
3070 #else
3071  }
3072  char s2[] = "physical id";
3073  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
3074  CHECK_LINE;
3075  char *p = strchr(buf + sizeof(s2) - 1, ':');
3076  unsigned val;
3077  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3078  goto no_val;
3079  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
3080  goto dup_field;
3081  threadInfo[num_avail][pkgIdIndex] = val;
3082  continue;
3083  }
3084  char s3[] = "core id";
3085  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
3086  CHECK_LINE;
3087  char *p = strchr(buf + sizeof(s3) - 1, ':');
3088  unsigned val;
3089  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3090  goto no_val;
3091  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
3092  goto dup_field;
3093  threadInfo[num_avail][coreIdIndex] = val;
3094  continue;
 3095 #endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3096  }
3097  char s4[] = "thread id";
3098  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
3099  CHECK_LINE;
3100  char *p = strchr(buf + sizeof(s4) - 1, ':');
3101  unsigned val;
3102  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3103  goto no_val;
3104  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
3105  goto dup_field;
3106  threadInfo[num_avail][threadIdIndex] = val;
3107  continue;
3108  }
3109  unsigned level;
3110  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
3111  CHECK_LINE;
3112  char *p = strchr(buf + sizeof(s4) - 1, ':');
3113  unsigned val;
3114  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3115  goto no_val;
3116  // validate the input before using level:
3117  if (level > (unsigned)__kmp_xproc) { // level is too big
3118  level = __kmp_xproc;
3119  }
3120  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
3121  goto dup_field;
3122  threadInfo[num_avail][nodeIdIndex + level] = val;
3123  continue;
3124  }
3125 
3126  // We didn't recognize the leading token on the line. There are lots of
3127  // leading tokens that we don't recognize - if the line isn't empty, go on
3128  // to the next line.
3129  if ((*buf != 0) && (*buf != '\n')) {
3130  // If the line is longer than the buffer, read characters
3131  // until we find a newline.
3132  if (long_line) {
3133  int ch;
3134  while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
3135  ;
3136  }
3137  continue;
3138  }
3139 
3140  // A newline has signalled the end of the processor record.
3141  // Check that there aren't too many procs specified.
3142  if ((int)num_avail == __kmp_xproc) {
3143  CLEANUP_THREAD_INFO;
3144  *msg_id = kmp_i18n_str_TooManyEntries;
3145  return false;
3146  }
3147 
3148  // Check for missing fields. The osId field must be there, and we
 3149  // currently require that the physical id field is also specified.
3150  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
3151  CLEANUP_THREAD_INFO;
3152  *msg_id = kmp_i18n_str_MissingProcField;
3153  return false;
3154  }
3155  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
3156  CLEANUP_THREAD_INFO;
3157  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
3158  return false;
3159  }
3160 
3161  // Skip this proc if it is not included in the machine model.
3162  if (KMP_AFFINITY_CAPABLE() &&
3163  !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
3164  __kmp_affin_fullMask)) {
3165  INIT_PROC_INFO(threadInfo[num_avail]);
3166  continue;
3167  }
3168 
3169  // We have a successful parse of this proc's info.
3170  // Increment the counter, and prepare for the next proc.
3171  num_avail++;
3172  KMP_ASSERT(num_avail <= num_records);
3173  INIT_PROC_INFO(threadInfo[num_avail]);
3174  }
3175  continue;
3176 
3177  no_val:
3178  CLEANUP_THREAD_INFO;
3179  *msg_id = kmp_i18n_str_MissingValCpuinfo;
3180  return false;
3181 
3182  dup_field:
3183  CLEANUP_THREAD_INFO;
3184  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
3185  return false;
3186  }
3187  *line = 0;
3188 
3189 #if KMP_MIC && REDUCE_TEAM_SIZE
3190  unsigned teamSize = 0;
3191 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3192 
3193  // check for num_records == __kmp_xproc ???
3194 
3195  // If it is configured to omit the package level when there is only a single
3196  // package, the logic at the end of this routine won't work if there is only a
3197  // single thread
3198  KMP_ASSERT(num_avail > 0);
3199  KMP_ASSERT(num_avail <= num_records);
3200 
3201  // Sort the threadInfo table by physical Id.
3202  qsort(threadInfo, num_avail, sizeof(*threadInfo),
3203  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
3204 
3205  // The table is now sorted by pkgId / coreId / threadId, but we really don't
3206  // know the radix of any of the fields. pkgId's may be sparsely assigned among
3207  // the chips on a system. Although coreId's are usually assigned
3208  // [0 .. coresPerPkg-1] and threadId's are usually assigned
3209  // [0..threadsPerCore-1], we don't want to make any such assumptions.
3210  //
3211  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
3212  // total # packages) are at this point - we want to determine that now. We
3213  // only have an upper bound on the first two figures.
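// Illustrative example (not taken from any particular machine): on a uniform
// system with 2 packages x 2 cores x 2 threads, the counting pass below ends
// with maxCt[threadIdIndex] = 2, maxCt[coreIdIndex] = 2, totals[coreIdIndex] = 4
// and totals[pkgIdIndex] = 2, which later become __kmp_nThreadsPerCore,
// nCoresPerPkg, __kmp_ncores and nPackages respectively.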
3214  unsigned *counts =
3215  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3216  unsigned *maxCt =
3217  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3218  unsigned *totals =
3219  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3220  unsigned *lastId =
3221  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3222 
3223  bool assign_thread_ids = false;
3224  unsigned threadIdCt;
3225  unsigned index;
3226 
3227 restart_radix_check:
3228  threadIdCt = 0;
3229 
3230  // Initialize the counter arrays with data from threadInfo[0].
3231  if (assign_thread_ids) {
3232  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
3233  threadInfo[0][threadIdIndex] = threadIdCt++;
3234  } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
3235  threadIdCt = threadInfo[0][threadIdIndex] + 1;
3236  }
3237  }
3238  for (index = 0; index <= maxIndex; index++) {
3239  counts[index] = 1;
3240  maxCt[index] = 1;
3241  totals[index] = 1;
3242  lastId[index] = threadInfo[0][index];
3244  }
3245 
3246  // Run through the rest of the OS procs.
3247  for (i = 1; i < num_avail; i++) {
3248  // Find the most significant index whose id differs from the id for the
3249  // previous OS proc.
3250  for (index = maxIndex; index >= threadIdIndex; index--) {
3251  if (assign_thread_ids && (index == threadIdIndex)) {
3252  // Auto-assign the thread id field if it wasn't specified.
3253  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3254  threadInfo[i][threadIdIndex] = threadIdCt++;
3255  }
3256  // Apparently the thread id field was specified for some entries and not
3257  // others. Start the thread id counter off at the next higher thread id.
3258  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3259  threadIdCt = threadInfo[i][threadIdIndex] + 1;
3260  }
3261  }
3262  if (threadInfo[i][index] != lastId[index]) {
3263  // Run through all indices which are less significant, and reset the
3264  // counts to 1. At all levels up to and including index, we need to
3265  // increment the totals and record the last id.
3266  unsigned index2;
3267  for (index2 = threadIdIndex; index2 < index; index2++) {
3268  totals[index2]++;
3269  if (counts[index2] > maxCt[index2]) {
3270  maxCt[index2] = counts[index2];
3271  }
3272  counts[index2] = 1;
3273  lastId[index2] = threadInfo[i][index2];
3274  }
3275  counts[index]++;
3276  totals[index]++;
3277  lastId[index] = threadInfo[i][index];
3278 
3279  if (assign_thread_ids && (index > threadIdIndex)) {
3280 
3281 #if KMP_MIC && REDUCE_TEAM_SIZE
3282  // The default team size is the total #threads in the machine
3283  // minus 1 thread for every core that has 3 or more threads.
3284  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3285 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3286 
3287  // Restart the thread counter, as we are on a new core.
3288  threadIdCt = 0;
3289 
3290  // Auto-assign the thread id field if it wasn't specified.
3291  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3292  threadInfo[i][threadIdIndex] = threadIdCt++;
3293  }
3294 
3295  // Apparently the thread id field was specified for some entries and
3296  // not others. Start the thread id counter off at the next higher
3297  // thread id.
3298  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3299  threadIdCt = threadInfo[i][threadIdIndex] + 1;
3300  }
3301  }
3302  break;
3303  }
3304  }
3305  if (index < threadIdIndex) {
3306  // If thread ids were specified, it is an error if they are not unique.
3307  // Also, check that we haven't already restarted the loop (to be safe -
3308  // shouldn't need to).
3309  if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
3310  __kmp_free(lastId);
3311  __kmp_free(totals);
3312  __kmp_free(maxCt);
3313  __kmp_free(counts);
3314  CLEANUP_THREAD_INFO;
3315  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3316  return false;
3317  }
3318 
3319  // If the thread ids were not specified and we see entries that
3320  // are duplicates, start the loop over and assign the thread ids manually.
3321  assign_thread_ids = true;
3322  goto restart_radix_check;
3323  }
3324  }
3325 
3326 #if KMP_MIC && REDUCE_TEAM_SIZE
3327  // The default team size is the total #threads in the machine
3328  // minus 1 thread for every core that has 3 or more threads.
3329  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3330 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3331 
3332  for (index = threadIdIndex; index <= maxIndex; index++) {
3333  if (counts[index] > maxCt[index]) {
3334  maxCt[index] = counts[index];
3335  }
3336  }
3337 
3338  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
3339  nCoresPerPkg = maxCt[coreIdIndex];
3340  nPackages = totals[pkgIdIndex];
3341 
3342  // When affinity is off, this routine will still be called to set
3343  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
3344  // Make sure all these vars are set correctly, and return now if affinity is
3345  // not enabled.
3346  __kmp_ncores = totals[coreIdIndex];
3347  if (!KMP_AFFINITY_CAPABLE()) {
3348  KMP_ASSERT(__kmp_affinity.type == affinity_none);
3349  return true;
3350  }
3351 
3352 #if KMP_MIC && REDUCE_TEAM_SIZE
3353  // Set the default team size.
3354  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
3355  __kmp_dflt_team_nth = teamSize;
3356  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
3357  "__kmp_dflt_team_nth = %d\n",
3358  __kmp_dflt_team_nth));
3359  }
3360 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3361 
3362  KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
3363 
3364  // Count the number of levels which have more nodes at that level than at the
3365  // parent's level (with an implicit root node above the top level).
3366  // This is equivalent to saying that there is at least one node at this level
3367  // which has a sibling. These levels are in the map, and the package level is
3368  // always in the map.
3369  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
3370  for (index = threadIdIndex; index < maxIndex; index++) {
3371  KMP_ASSERT(totals[index] >= totals[index + 1]);
3372  inMap[index] = (totals[index] > totals[index + 1]);
3373  }
3374  inMap[maxIndex] = (totals[maxIndex] > 1);
3375  inMap[pkgIdIndex] = true;
3376  inMap[coreIdIndex] = true;
3377  inMap[threadIdIndex] = true;
3378 
3379  int depth = 0;
3380  int idx = 0;
3381  kmp_hw_t types[KMP_HW_LAST];
3382  int pkgLevel = -1;
3383  int coreLevel = -1;
3384  int threadLevel = -1;
3385  for (index = threadIdIndex; index <= maxIndex; index++) {
3386  if (inMap[index]) {
3387  depth++;
3388  }
3389  }
3390  if (inMap[pkgIdIndex]) {
3391  pkgLevel = idx;
3392  types[idx++] = KMP_HW_SOCKET;
3393  }
3394  if (inMap[coreIdIndex]) {
3395  coreLevel = idx;
3396  types[idx++] = KMP_HW_CORE;
3397  }
3398  if (inMap[threadIdIndex]) {
3399  threadLevel = idx;
3400  types[idx++] = KMP_HW_THREAD;
3401  }
3402  KMP_ASSERT(depth > 0);
3403 
3404  // Construct the data structure that is to be returned.
3405  __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
3406 
3407  for (i = 0; i < num_avail; ++i) {
3408  unsigned os = threadInfo[i][osIdIndex];
3409  int src_index;
3410  kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3411  hw_thread.clear();
3412  hw_thread.os_id = os;
3413 
3414  idx = 0;
3415  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
3416  if (!inMap[src_index]) {
3417  continue;
3418  }
3419  if (src_index == pkgIdIndex) {
3420  hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
3421  } else if (src_index == coreIdIndex) {
3422  hw_thread.ids[coreLevel] = threadInfo[i][src_index];
3423  } else if (src_index == threadIdIndex) {
3424  hw_thread.ids[threadLevel] = threadInfo[i][src_index];
3425  }
3426  }
3427  }
3428 
3429  __kmp_free(inMap);
3430  __kmp_free(lastId);
3431  __kmp_free(totals);
3432  __kmp_free(maxCt);
3433  __kmp_free(counts);
3434  CLEANUP_THREAD_INFO;
3435  __kmp_topology->sort_ids();
3436  if (!__kmp_topology->check_ids()) {
3437  kmp_topology_t::deallocate(__kmp_topology);
3438  __kmp_topology = nullptr;
3439  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3440  return false;
3441  }
3442  return true;
3443 }
3444 
3445 // Create a table of affinity masks, indexed by OS thread ID, and store it in
3446 // affinity.os_id_masks. This routine handles OR'ing together all the affinity
3447 // masks of threads that are sufficiently close, if granularity > fine.
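// Illustrative example (assuming a topology where OS procs 0 and 1 are the two
// hardware threads of one core): with granularity=core, entries 0 and 1 of
// affinity.os_id_masks both end up holding the same mask {0,1}; with
// granularity=fine (gran_levels == 0) each entry holds only its own OS proc.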
3448 template <typename FindNextFunctionType>
3449 static void __kmp_create_os_id_masks(unsigned *numUnique,
3450  kmp_affinity_t &affinity,
3451  FindNextFunctionType find_next) {
3452  // First form a table of affinity masks in order of OS thread id.
3453  int maxOsId;
3454  int i;
3455  int numAddrs = __kmp_topology->get_num_hw_threads();
3456  int depth = __kmp_topology->get_depth();
3457  const char *env_var = __kmp_get_affinity_env_var(affinity);
3458  KMP_ASSERT(numAddrs);
3459  KMP_ASSERT(depth);
3460 
3461  i = find_next(-1);
3462  // If no HW thread location with the requested attributes could be found,
3463  // return and fall back to the incremental find_next, ignoring core attributes.
3464  if (i >= numAddrs)
3465  return;
3466 
3467  maxOsId = 0;
3468  for (i = numAddrs - 1;; --i) {
3469  int osId = __kmp_topology->at(i).os_id;
3470  if (osId > maxOsId) {
3471  maxOsId = osId;
3472  }
3473  if (i == 0)
3474  break;
3475  }
3476  affinity.num_os_id_masks = maxOsId + 1;
3477  KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
3478  KMP_ASSERT(affinity.gran_levels >= 0);
3479  if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
3480  KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
3481  }
3482  if (affinity.gran_levels >= (int)depth) {
3483  KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);
3484  }
3485 
3486  // Run through the table, forming the masks for all threads on each core.
3487  // Threads on the same core will have identical kmp_hw_thread_t objects, not
3488  // considering the last level, which must be the thread id. All threads on a
3489  // core will appear consecutively.
3490  int unique = 0;
3491  int j = 0; // index of 1st thread on core
3492  int leader = 0;
3493  kmp_affin_mask_t *sum;
3494  KMP_CPU_ALLOC_ON_STACK(sum);
3495  KMP_CPU_ZERO(sum);
3496 
3497  i = j = leader = find_next(-1);
3498  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3499  kmp_full_mask_modifier_t full_mask;
3500  for (i = find_next(i); i < numAddrs; i = find_next(i)) {
3501  // If this thread is sufficiently close to the leader (within the
3502  // granularity setting), then set the bit for this os thread in the
3503  // affinity mask for this group, and go on to the next thread.
3504  if (__kmp_topology->is_close(leader, i, affinity)) {
3505  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3506  continue;
3507  }
3508 
3509  // For every thread in this group, copy the mask to the thread's entry in
3510  // the OS Id mask table. Mark the first address as a leader.
3511  for (; j < i; j = find_next(j)) {
3512  int osId = __kmp_topology->at(j).os_id;
3513  KMP_DEBUG_ASSERT(osId <= maxOsId);
3514  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3515  KMP_CPU_COPY(mask, sum);
3516  __kmp_topology->at(j).leader = (j == leader);
3517  }
3518  unique++;
3519 
3520  // Start a new mask.
3521  leader = i;
3522  full_mask.include(sum);
3523  KMP_CPU_ZERO(sum);
3524  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3525  }
3526 
3527  // For every thread in last group, copy the mask to the thread's
3528  // entry in the OS Id mask table.
3529  for (; j < i; j = find_next(j)) {
3530  int osId = __kmp_topology->at(j).os_id;
3531  KMP_DEBUG_ASSERT(osId <= maxOsId);
3532  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3533  KMP_CPU_COPY(mask, sum);
3534  __kmp_topology->at(j).leader = (j == leader);
3535  }
3536  full_mask.include(sum);
3537  unique++;
3538  KMP_CPU_FREE_FROM_STACK(sum);
3539 
3540  // See if the OS Id mask table further restricts or changes the full mask
3541  if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
3542  __kmp_topology->print(env_var);
3543  }
3544 
3545  *numUnique = unique;
3546 }
3547 
3548 // State shared by the affinity proclist parsers. It's easier to declare these
3549 // vars as file-static than to try to pass them through the calling sequence of
3550 // the recursive-descent OMP_PLACES parser.
3551 static kmp_affin_mask_t *newMasks;
3552 static int numNewMasks;
3553 static int nextNewMask;
3554 
3555 #define ADD_MASK(_mask) \
3556  { \
3557  if (nextNewMask >= numNewMasks) { \
3558  int i; \
3559  numNewMasks *= 2; \
3560  kmp_affin_mask_t *temp; \
3561  KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
3562  for (i = 0; i < numNewMasks / 2; i++) { \
3563  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \
3564  kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \
3565  KMP_CPU_COPY(dest, src); \
3566  } \
3567  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \
3568  newMasks = temp; \
3569  } \
3570  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
3571  nextNewMask++; \
3572  }
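// The newMasks vector grows geometrically: starting from numNewMasks == 2,
// adding a third mask reallocates it to 4 entries and adding a fifth to 8
// (illustrative trace of the doubling logic above).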
3573 
3574 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
3575  { \
3576  if (((_osId) > _maxOsId) || \
3577  (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
3578  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \
3579  } else { \
3580  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
3581  } \
3582  }
3583 
3584 // Re-parse the proclist (for the explicit affinity type), and form the list
3585 // of affinity newMasks indexed by gtid.
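// Illustrative example (assuming fine granularity and valid OS proc ids): a
// proclist such as "0,2-6:2,{9,10}" produces five masks, in this order:
// {0}, {2}, {4}, {6} and {9,10}.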
3586 static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
3587  int i;
3588  kmp_affin_mask_t **out_masks = &affinity.masks;
3589  unsigned *out_numMasks = &affinity.num_masks;
3590  const char *proclist = affinity.proclist;
3591  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3592  int maxOsId = affinity.num_os_id_masks - 1;
3593  const char *scan = proclist;
3594  const char *next = proclist;
3595 
3596  // Use the internal mask allocator for the temporary mask vector; the
3597  // ADD_MASK macro grows it (by allocating a larger array and copying) as needed.
3598  numNewMasks = 2;
3599  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3600  nextNewMask = 0;
3601  kmp_affin_mask_t *sumMask;
3602  KMP_CPU_ALLOC(sumMask);
3603  int setSize = 0;
3604 
3605  for (;;) {
3606  int start, end, stride;
3607 
3608  SKIP_WS(scan);
3609  next = scan;
3610  if (*next == '\0') {
3611  break;
3612  }
3613 
3614  if (*next == '{') {
3615  int num;
3616  setSize = 0;
3617  next++; // skip '{'
3618  SKIP_WS(next);
3619  scan = next;
3620 
3621  // Read the first integer in the set.
3622  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
3623  SKIP_DIGITS(next);
3624  num = __kmp_str_to_int(scan, *next);
3625  KMP_ASSERT2(num >= 0, "bad explicit proc list");
3626 
3627  // Copy the mask for that osId to the sum (union) mask.
3628  if ((num > maxOsId) ||
3629  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3630  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3631  KMP_CPU_ZERO(sumMask);
3632  } else {
3633  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3634  setSize = 1;
3635  }
3636 
3637  for (;;) {
3638  // Check for end of set.
3639  SKIP_WS(next);
3640  if (*next == '}') {
3641  next++; // skip '}'
3642  break;
3643  }
3644 
3645  // Skip optional comma.
3646  if (*next == ',') {
3647  next++;
3648  }
3649  SKIP_WS(next);
3650 
3651  // Read the next integer in the set.
3652  scan = next;
3653  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3654 
3655  SKIP_DIGITS(next);
3656  num = __kmp_str_to_int(scan, *next);
3657  KMP_ASSERT2(num >= 0, "bad explicit proc list");
3658 
3659  // Add the mask for that osId to the sum mask.
3660  if ((num > maxOsId) ||
3661  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3662  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3663  } else {
3664  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3665  setSize++;
3666  }
3667  }
3668  if (setSize > 0) {
3669  ADD_MASK(sumMask);
3670  }
3671 
3672  SKIP_WS(next);
3673  if (*next == ',') {
3674  next++;
3675  }
3676  scan = next;
3677  continue;
3678  }
3679 
3680  // Read the first integer.
3681  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3682  SKIP_DIGITS(next);
3683  start = __kmp_str_to_int(scan, *next);
3684  KMP_ASSERT2(start >= 0, "bad explicit proc list");
3685  SKIP_WS(next);
3686 
3687  // If this isn't a range, then add a mask to the list and go on.
3688  if (*next != '-') {
3689  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3690 
3691  // Skip optional comma.
3692  if (*next == ',') {
3693  next++;
3694  }
3695  scan = next;
3696  continue;
3697  }
3698 
3699  // This is a range. Skip over the '-' and read in the 2nd int.
3700  next++; // skip '-'
3701  SKIP_WS(next);
3702  scan = next;
3703  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3704  SKIP_DIGITS(next);
3705  end = __kmp_str_to_int(scan, *next);
3706  KMP_ASSERT2(end >= 0, "bad explicit proc list");
3707 
3708  // Check for a stride parameter
3709  stride = 1;
3710  SKIP_WS(next);
3711  if (*next == ':') {
3712  // A stride is specified. Skip over the ':' and read the 3rd int.
3713  int sign = +1;
3714  next++; // skip ':'
3715  SKIP_WS(next);
3716  scan = next;
3717  if (*next == '-') {
3718  sign = -1;
3719  next++;
3720  SKIP_WS(next);
3721  scan = next;
3722  }
3723  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3724  SKIP_DIGITS(next);
3725  stride = __kmp_str_to_int(scan, *next);
3726  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3727  stride *= sign;
3728  }
3729 
3730  // Do some range checks.
3731  KMP_ASSERT2(stride != 0, "bad explicit proc list");
3732  if (stride > 0) {
3733  KMP_ASSERT2(start <= end, "bad explicit proc list");
3734  } else {
3735  KMP_ASSERT2(start >= end, "bad explicit proc list");
3736  }
3737  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3738 
3739  // Add the mask for each OS proc # to the list.
3740  if (stride > 0) {
3741  do {
3742  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3743  start += stride;
3744  } while (start <= end);
3745  } else {
3746  do {
3747  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3748  start += stride;
3749  } while (start >= end);
3750  }
3751 
3752  // Skip optional comma.
3753  SKIP_WS(next);
3754  if (*next == ',') {
3755  next++;
3756  }
3757  scan = next;
3758  }
3759 
3760  *out_numMasks = nextNewMask;
3761  if (nextNewMask == 0) {
3762  *out_masks = NULL;
3763  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3764  return;
3765  }
3766  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3767  for (i = 0; i < nextNewMask; i++) {
3768  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3769  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3770  KMP_CPU_COPY(dest, src);
3771  }
3772  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3773  KMP_CPU_FREE(sumMask);
3774 }
3775 
3776 /*-----------------------------------------------------------------------------
3777 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3778 places. Again, here is the grammar:
3779 
3780 place_list := place
3781 place_list := place , place_list
3782 place := num
3783 place := place : num
3784 place := place : num : signed
3785 place := { subplace_list }
3786 place := ! place // (lowest priority)
3787 subplace_list := subplace
3788 subplace_list := subplace , subplace_list
3789 subplace := num
3790 subplace := num : num
3791 subplace := num : num : signed
3792 signed := num
3793 signed := + signed
3794 signed := - signed
3795 -----------------------------------------------------------------------------*/
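// Illustrative example (not taken from the code below): under this grammar the
// place list "{0,1,2,3}:4:4" names four places, {0,1,2,3}, {4,5,6,7},
// {8,9,10,11} and {12,13,14,15}, assuming all of those proc ids are valid.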
3796 static void __kmp_process_subplace_list(const char **scan,
3797  kmp_affinity_t &affinity, int maxOsId,
3798  kmp_affin_mask_t *tempMask,
3799  int *setSize) {
3800  const char *next;
3801  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3802 
3803  for (;;) {
3804  int start, count, stride, i;
3805 
3806  // Read in the starting proc id
3807  SKIP_WS(*scan);
3808  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3809  next = *scan;
3810  SKIP_DIGITS(next);
3811  start = __kmp_str_to_int(*scan, *next);
3812  KMP_ASSERT(start >= 0);
3813  *scan = next;
3814 
3815  // valid follow sets are ',' ':' and '}'
3816  SKIP_WS(*scan);
3817  if (**scan == '}' || **scan == ',') {
3818  if ((start > maxOsId) ||
3819  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3820  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3821  } else {
3822  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3823  (*setSize)++;
3824  }
3825  if (**scan == '}') {
3826  break;
3827  }
3828  (*scan)++; // skip ','
3829  continue;
3830  }
3831  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3832  (*scan)++; // skip ':'
3833 
3834  // Read count parameter
3835  SKIP_WS(*scan);
3836  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3837  next = *scan;
3838  SKIP_DIGITS(next);
3839  count = __kmp_str_to_int(*scan, *next);
3840  KMP_ASSERT(count >= 0);
3841  *scan = next;
3842 
3843  // valid follow sets are ',' ':' and '}'
3844  SKIP_WS(*scan);
3845  if (**scan == '}' || **scan == ',') {
3846  for (i = 0; i < count; i++) {
3847  if ((start > maxOsId) ||
3848  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3849  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3850  break; // don't proliferate warnings for large count
3851  } else {
3852  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3853  start++;
3854  (*setSize)++;
3855  }
3856  }
3857  if (**scan == '}') {
3858  break;
3859  }
3860  (*scan)++; // skip ','
3861  continue;
3862  }
3863  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3864  (*scan)++; // skip ':'
3865 
3866  // Read stride parameter
3867  int sign = +1;
3868  for (;;) {
3869  SKIP_WS(*scan);
3870  if (**scan == '+') {
3871  (*scan)++; // skip '+'
3872  continue;
3873  }
3874  if (**scan == '-') {
3875  sign *= -1;
3876  (*scan)++; // skip '-'
3877  continue;
3878  }
3879  break;
3880  }
3881  SKIP_WS(*scan);
3882  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3883  next = *scan;
3884  SKIP_DIGITS(next);
3885  stride = __kmp_str_to_int(*scan, *next);
3886  KMP_ASSERT(stride >= 0);
3887  *scan = next;
3888  stride *= sign;
3889 
3890  // valid follow sets are ',' and '}'
3891  SKIP_WS(*scan);
3892  if (**scan == '}' || **scan == ',') {
3893  for (i = 0; i < count; i++) {
3894  if ((start > maxOsId) ||
3895  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3896  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3897  break; // don't proliferate warnings for large count
3898  } else {
3899  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3900  start += stride;
3901  (*setSize)++;
3902  }
3903  }
3904  if (**scan == '}') {
3905  break;
3906  }
3907  (*scan)++; // skip ','
3908  continue;
3909  }
3910 
3911  KMP_ASSERT2(0, "bad explicit places list");
3912  }
3913 }
3914 
3915 static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
3916  int maxOsId, kmp_affin_mask_t *tempMask,
3917  int *setSize) {
3918  const char *next;
3919  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3920 
3921  // valid follow sets are '{' '!' and num
3922  SKIP_WS(*scan);
3923  if (**scan == '{') {
3924  (*scan)++; // skip '{'
3925  __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
3926  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3927  (*scan)++; // skip '}'
3928  } else if (**scan == '!') {
3929  (*scan)++; // skip '!'
3930  __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
3931  KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3932  } else if ((**scan >= '0') && (**scan <= '9')) {
3933  next = *scan;
3934  SKIP_DIGITS(next);
3935  int num = __kmp_str_to_int(*scan, *next);
3936  KMP_ASSERT(num >= 0);
3937  if ((num > maxOsId) ||
3938  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3939  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3940  } else {
3941  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3942  (*setSize)++;
3943  }
3944  *scan = next; // skip num
3945  } else {
3946  KMP_ASSERT2(0, "bad explicit places list");
3947  }
3948 }
3949 
3951 void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
3952  int i, j, count, stride, sign;
3953  kmp_affin_mask_t **out_masks = &affinity.masks;
3954  unsigned *out_numMasks = &affinity.num_masks;
3955  const char *placelist = affinity.proclist;
3956  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3957  int maxOsId = affinity.num_os_id_masks - 1;
3958  const char *scan = placelist;
3959  const char *next = placelist;
3960 
3961  numNewMasks = 2;
3962  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3963  nextNewMask = 0;
3964 
3965  // tempMask is modified based on the previous or initial
3966  // place to form the current place
3967  // previousMask contains the previous place
3968  kmp_affin_mask_t *tempMask;
3969  kmp_affin_mask_t *previousMask;
3970  KMP_CPU_ALLOC(tempMask);
3971  KMP_CPU_ZERO(tempMask);
3972  KMP_CPU_ALLOC(previousMask);
3973  KMP_CPU_ZERO(previousMask);
3974  int setSize = 0;
3975 
3976  for (;;) {
3977  __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);
3978 
3979  // valid follow sets are ',' ':' and EOL
3980  SKIP_WS(scan);
3981  if (*scan == '\0' || *scan == ',') {
3982  if (setSize > 0) {
3983  ADD_MASK(tempMask);
3984  }
3985  KMP_CPU_ZERO(tempMask);
3986  setSize = 0;
3987  if (*scan == '\0') {
3988  break;
3989  }
3990  scan++; // skip ','
3991  continue;
3992  }
3993 
3994  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3995  scan++; // skip ':'
3996 
3997  // Read count parameter
3998  SKIP_WS(scan);
3999  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4000  next = scan;
4001  SKIP_DIGITS(next);
4002  count = __kmp_str_to_int(scan, *next);
4003  KMP_ASSERT(count >= 0);
4004  scan = next;
4005 
4006  // valid follow sets are ',' ':' and EOL
4007  SKIP_WS(scan);
4008  if (*scan == '\0' || *scan == ',') {
4009  stride = +1;
4010  } else {
4011  KMP_ASSERT2(*scan == ':', "bad explicit places list");
4012  scan++; // skip ':'
4013 
4014  // Read stride parameter
4015  sign = +1;
4016  for (;;) {
4017  SKIP_WS(scan);
4018  if (*scan == '+') {
4019  scan++; // skip '+'
4020  continue;
4021  }
4022  if (*scan == '-') {
4023  sign *= -1;
4024  scan++; // skip '-'
4025  continue;
4026  }
4027  break;
4028  }
4029  SKIP_WS(scan);
4030  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4031  next = scan;
4032  SKIP_DIGITS(next);
4033  stride = __kmp_str_to_int(scan, *next);
4034  KMP_DEBUG_ASSERT(stride >= 0);
4035  scan = next;
4036  stride *= sign;
4037  }
4038 
4039  // Add places determined by initial_place : count : stride
4040  for (i = 0; i < count; i++) {
4041  if (setSize == 0) {
4042  break;
4043  }
4044  // Add the current place, then build the next place (tempMask) from that
4045  KMP_CPU_COPY(previousMask, tempMask);
4046  ADD_MASK(previousMask);
4047  KMP_CPU_ZERO(tempMask);
4048  setSize = 0;
4049  KMP_CPU_SET_ITERATE(j, previousMask) {
4050  if (!KMP_CPU_ISSET(j, previousMask)) {
4051  continue;
4052  }
4053  if ((j + stride > maxOsId) || (j + stride < 0) ||
4054  (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
4055  (!KMP_CPU_ISSET(j + stride,
4056  KMP_CPU_INDEX(osId2Mask, j + stride)))) {
4057  if (i < count - 1) {
4058  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
4059  }
4060  continue;
4061  }
4062  KMP_CPU_SET(j + stride, tempMask);
4063  setSize++;
4064  }
4065  }
4066  KMP_CPU_ZERO(tempMask);
4067  setSize = 0;
4068 
4069  // valid follow sets are ',' and EOL
4070  SKIP_WS(scan);
4071  if (*scan == '\0') {
4072  break;
4073  }
4074  if (*scan == ',') {
4075  scan++; // skip ','
4076  continue;
4077  }
4078 
4079  KMP_ASSERT2(0, "bad explicit places list");
4080  }
4081 
4082  *out_numMasks = nextNewMask;
4083  if (nextNewMask == 0) {
4084  *out_masks = NULL;
4085  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4086  return;
4087  }
4088  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
4089  KMP_CPU_FREE(tempMask);
4090  KMP_CPU_FREE(previousMask);
4091  for (i = 0; i < nextNewMask; i++) {
4092  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
4093  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
4094  KMP_CPU_COPY(dest, src);
4095  }
4096  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4097 }
4098 
4099 #undef ADD_MASK
4100 #undef ADD_MASK_OSID
4101 
4102 // This function figures out the deepest level at which there is at least one
4103 // cluster/core with more than one processing unit bound to it.
4104 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
4105  int core_level = 0;
4106 
4107  for (int i = 0; i < nprocs; i++) {
4108  const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
4109  for (int j = bottom_level; j > 0; j--) {
4110  if (hw_thread.ids[j] > 0) {
4111  if (core_level < (j - 1)) {
4112  core_level = j - 1;
4113  }
4114  }
4115  }
4116  }
4117  return core_level;
4118 }
4119 
4120 // This function counts the number of clusters/cores at the given level.
4121 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
4122  int core_level) {
4123  return __kmp_topology->get_count(core_level);
4124 }
4125 // This function finds the cluster/core to which a given processing unit is bound.
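// Illustrative example: if topology entries 0 and 1 share one core and entries
// 2 and 3 share another (and core_level includes the core level), this returns
// 0 for entries 0-1 and 1 for entries 2-3.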
4126 static int __kmp_affinity_find_core(int proc, int bottom_level,
4127  int core_level) {
4128  int core = 0;
4129  KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
4130  for (int i = 0; i <= proc; ++i) {
4131  if (i + 1 <= proc) {
4132  for (int j = 0; j <= core_level; ++j) {
4133  if (__kmp_topology->at(i + 1).sub_ids[j] !=
4134  __kmp_topology->at(i).sub_ids[j]) {
4135  core++;
4136  break;
4137  }
4138  }
4139  }
4140  }
4141  return core;
4142 }
4143 
4144 // This function finds the maximal number of processing units bound to a
4145 // cluster/core at the given level.
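// Illustrative example: on a uniform machine with 2 hardware threads per core,
// calling this with core_level at the core level and bottom_level at the
// thread level returns 2 (the thread:core ratio).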
4146 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
4147  int core_level) {
4148  if (core_level >= bottom_level)
4149  return 1;
4150  int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
4151  return __kmp_topology->calculate_ratio(thread_level, core_level);
4152 }
4153 
4154 static int *procarr = NULL;
4155 static int __kmp_aff_depth = 0;
4156 static int *__kmp_osid_to_hwthread_map = NULL;
4157 
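// Given an affinity mask, fill in the topology ids and attributes that the
// mask corresponds to. Illustrative example: a mask covering OS procs 0-3 that
// form 2 cores x 2 threads on socket 0 yields ids[KMP_HW_SOCKET] == 0, while
// the core and thread entries become kmp_hw_thread_t::MULTIPLE_ID because the
// mask spans more than one unit at those levels.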
4158 static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
4159  kmp_affinity_ids_t &ids,
4160  kmp_affinity_attrs_t &attrs) {
4161  if (!KMP_AFFINITY_CAPABLE())
4162  return;
4163 
4164  // Initialize ids and attrs thread data
4165  for (int i = 0; i < KMP_HW_LAST; ++i)
4166  ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
4167  attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4168 
4169  // Iterate through each os id within the mask and determine
4170  // the topology id and attribute information
4171  int cpu;
4172  int depth = __kmp_topology->get_depth();
4173  KMP_CPU_SET_ITERATE(cpu, mask) {
4174  int osid_idx = __kmp_osid_to_hwthread_map[cpu];
4175  const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
4176  for (int level = 0; level < depth; ++level) {
4177  kmp_hw_t type = __kmp_topology->get_type(level);
4178  int id = hw_thread.sub_ids[level];
4179  if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
4180  ids[type] = id;
4181  } else {
4182  // This mask spans across multiple topology units, set it as such
4183  // and mark every level below as such as well.
4184  ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4185  for (; level < depth; ++level) {
4186  kmp_hw_t type = __kmp_topology->get_type(level);
4187  ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4188  }
4189  }
4190  }
4191  if (!attrs.valid) {
4192  attrs.core_type = hw_thread.attrs.get_core_type();
4193  attrs.core_eff = hw_thread.attrs.get_core_eff();
4194  attrs.valid = 1;
4195  } else {
4196  // This mask spans across multiple attributes, set it as such
4197  if (attrs.core_type != hw_thread.attrs.get_core_type())
4198  attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
4199  if (attrs.core_eff != hw_thread.attrs.get_core_eff())
4200  attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
4201  }
4202  }
4203 }
4204 
4205 static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
4206  if (!KMP_AFFINITY_CAPABLE())
4207  return;
4208  const kmp_affin_mask_t *mask = th->th.th_affin_mask;
4209  kmp_affinity_ids_t &ids = th->th.th_topology_ids;
4210  kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
4211  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4212 }
4213 
4214 // Assign the topology information to each place in the place list.
4215 // A thread can then grab not only its affinity mask, but also the topology
4216 // information associated with that mask, e.g., which socket a thread is on.
4217 static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
4218  if (!KMP_AFFINITY_CAPABLE())
4219  return;
4220  if (affinity.type != affinity_none) {
4221  KMP_ASSERT(affinity.num_os_id_masks);
4222  KMP_ASSERT(affinity.os_id_masks);
4223  }
4224  KMP_ASSERT(affinity.num_masks);
4225  KMP_ASSERT(affinity.masks);
4226  KMP_ASSERT(__kmp_affin_fullMask);
4227 
4228  int max_cpu = __kmp_affin_fullMask->get_max_cpu();
4229  int num_hw_threads = __kmp_topology->get_num_hw_threads();
4230 
4231  // Allocate thread topology information
4232  if (!affinity.ids) {
4233  affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
4234  sizeof(kmp_affinity_ids_t) * affinity.num_masks);
4235  }
4236  if (!affinity.attrs) {
4237  affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
4238  sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
4239  }
4240  if (!__kmp_osid_to_hwthread_map) {
4241  // Want the +1 because max_cpu must be a valid index into the map
4242  __kmp_osid_to_hwthread_map =
4243  (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
4244  }
4245 
4246  // Create the OS proc to hardware thread map
4247  for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
4248  int os_id = __kmp_topology->at(hw_thread).os_id;
4249  if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
4250  __kmp_osid_to_hwthread_map[os_id] = hw_thread;
4251  }
4252 
4253  for (unsigned i = 0; i < affinity.num_masks; ++i) {
4254  kmp_affinity_ids_t &ids = affinity.ids[i];
4255  kmp_affinity_attrs_t &attrs = affinity.attrs[i];
4256  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
4257  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4258  }
4259 }
4260 
4261 // Called when __kmp_topology is ready
4262 static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
4263  // Initialize other data structures which depend on the topology
4264  if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
4265  machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
4266  __kmp_affinity_get_topology_info(affinity);
4267  }
4268 }
4269 
4270 // Create a one element mask array (set of places) which only contains the
4271 // initial process's affinity mask
4272 static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
4273  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4274  KMP_ASSERT(affinity.type == affinity_none);
4275  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4276  affinity.num_masks = 1;
4277  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4278  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
4279  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4280  __kmp_aux_affinity_initialize_other_data(affinity);
4281 }
4282 
4283 static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
4284  // Create the "full" mask - this defines all of the processors that we
4285  // consider to be in the machine model. If respect is set, then it is the
4286  // initialization thread's affinity mask. Otherwise, it is all processors that
4287  // we know about on the machine.
4288  int verbose = affinity.flags.verbose;
4289  const char *env_var = affinity.env_var;
4290 
4291  // Already initialized
4292  if (__kmp_affin_fullMask && __kmp_affin_origMask)
4293  return;
4294 
4295  if (__kmp_affin_fullMask == NULL) {
4296  KMP_CPU_ALLOC(__kmp_affin_fullMask);
4297  }
4298  if (__kmp_affin_origMask == NULL) {
4299  KMP_CPU_ALLOC(__kmp_affin_origMask);
4300  }
4301  if (KMP_AFFINITY_CAPABLE()) {
4302  __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4303  // Make a copy before possible expanding to the entire machine mask
4304  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4305  if (affinity.flags.respect) {
4306  // Count the number of available processors.
4307  unsigned i;
4308  __kmp_avail_proc = 0;
4309  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4310  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4311  continue;
4312  }
4313  __kmp_avail_proc++;
4314  }
4315  if (__kmp_avail_proc > __kmp_xproc) {
4316  KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4317  affinity.type = affinity_none;
4318  KMP_AFFINITY_DISABLE();
4319  return;
4320  }
4321 
4322  if (verbose) {
4323  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4324  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4325  __kmp_affin_fullMask);
4326  KMP_INFORM(InitOSProcSetRespect, env_var, buf);
4327  }
4328  } else {
4329  if (verbose) {
4330  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4331  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4332  __kmp_affin_fullMask);
4333  KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
4334  }
4335  __kmp_avail_proc =
4336  __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4337 #if KMP_OS_WINDOWS
4338  if (__kmp_num_proc_groups <= 1) {
4339  // Copy expanded full mask if topology has single processor group
4340  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4341  }
4342  // Set the process affinity mask since threads' affinity
4343  // masks must be subset of process mask in Windows* OS
4344  __kmp_affin_fullMask->set_process_affinity(true);
4345 #endif
4346  }
4347  }
4348 }
4349 
4350 static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
4351  bool success = false;
4352  const char *env_var = affinity.env_var;
4353  kmp_i18n_id_t msg_id = kmp_i18n_null;
4354  int verbose = affinity.flags.verbose;
4355 
4356  // For backward compatibility, setting KMP_CPUINFO_FILE =>
4357  // KMP_TOPOLOGY_METHOD=cpuinfo
4358  if ((__kmp_cpuinfo_file != NULL) &&
4359  (__kmp_affinity_top_method == affinity_top_method_all)) {
4360  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4361  }
4362 
4363  if (__kmp_affinity_top_method == affinity_top_method_all) {
4364 // In the default code path, errors are not fatal - we just try using
4365 // another method. We only emit a warning message if affinity is on, or the
4366 // verbose flag is set, and the nowarnings flag was not set.
4367 #if KMP_USE_HWLOC
4368  if (!success &&
4369  __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4370  if (!__kmp_hwloc_error) {
4371  success = __kmp_affinity_create_hwloc_map(&msg_id);
4372  if (!success && verbose) {
4373  KMP_INFORM(AffIgnoringHwloc, env_var);
4374  }
4375  } else if (verbose) {
4376  KMP_INFORM(AffIgnoringHwloc, env_var);
4377  }
4378  }
4379 #endif
4380 
4381 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4382  if (!success) {
4383  success = __kmp_affinity_create_x2apicid_map(&msg_id);
4384  if (!success && verbose && msg_id != kmp_i18n_null) {
4385  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4386  }
4387  }
4388  if (!success) {
4389  success = __kmp_affinity_create_apicid_map(&msg_id);
4390  if (!success && verbose && msg_id != kmp_i18n_null) {
4391  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4392  }
4393  }
4394 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4395 
4396 #if KMP_OS_LINUX
4397  if (!success) {
4398  int line = 0;
4399  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4400  if (!success && verbose && msg_id != kmp_i18n_null) {
4401  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4402  }
4403  }
4404 #endif /* KMP_OS_LINUX */
4405 
4406 #if KMP_GROUP_AFFINITY
4407  if (!success && (__kmp_num_proc_groups > 1)) {
4408  success = __kmp_affinity_create_proc_group_map(&msg_id);
4409  if (!success && verbose && msg_id != kmp_i18n_null) {
4410  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4411  }
4412  }
4413 #endif /* KMP_GROUP_AFFINITY */
4414 
4415  if (!success) {
4416  success = __kmp_affinity_create_flat_map(&msg_id);
4417  if (!success && verbose && msg_id != kmp_i18n_null) {
4418  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4419  }
4420  KMP_ASSERT(success);
4421  }
4422  }
4423 
4424 // If the user has specified that a particular topology discovery method is to be
4425 // used, then we abort if that method fails. The exception is group affinity,
4426 // which might have been implicitly set.
4427 #if KMP_USE_HWLOC
4428  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4429  KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4430  success = __kmp_affinity_create_hwloc_map(&msg_id);
4431  if (!success) {
4432  KMP_ASSERT(msg_id != kmp_i18n_null);
4433  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4434  }
4435  }
4436 #endif // KMP_USE_HWLOC
4437 
4438 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4439  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
4440  __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
4441  success = __kmp_affinity_create_x2apicid_map(&msg_id);
4442  if (!success) {
4443  KMP_ASSERT(msg_id != kmp_i18n_null);
4444  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4445  }
4446  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4447  success = __kmp_affinity_create_apicid_map(&msg_id);
4448  if (!success) {
4449  KMP_ASSERT(msg_id != kmp_i18n_null);
4450  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4451  }
4452  }
4453 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4454 
4455  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4456  int line = 0;
4457  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4458  if (!success) {
4459  KMP_ASSERT(msg_id != kmp_i18n_null);
4460  const char *filename = __kmp_cpuinfo_get_filename();
4461  if (line > 0) {
4462  KMP_FATAL(FileLineMsgExiting, filename, line,
4463  __kmp_i18n_catgets(msg_id));
4464  } else {
4465  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4466  }
4467  }
4468  }
4469 
4470 #if KMP_GROUP_AFFINITY
4471  else if (__kmp_affinity_top_method == affinity_top_method_group) {
4472  success = __kmp_affinity_create_proc_group_map(&msg_id);
4473  KMP_ASSERT(success);
4474  if (!success) {
4475  KMP_ASSERT(msg_id != kmp_i18n_null);
4476  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4477  }
4478  }
4479 #endif /* KMP_GROUP_AFFINITY */
4480 
4481  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4482  success = __kmp_affinity_create_flat_map(&msg_id);
4483  // should not fail
4484  KMP_ASSERT(success);
4485  }
4486 
4487  // Early exit if topology could not be created
4488  if (!__kmp_topology) {
4489  if (KMP_AFFINITY_CAPABLE()) {
4490  KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4491  }
4492  if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
4493  __kmp_ncores > 0) {
4494  __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
4495  __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
4496  __kmp_nThreadsPerCore, __kmp_ncores);
4497  if (verbose) {
4498  __kmp_topology->print(env_var);
4499  }
4500  }
4501  return false;
4502  }
4503 
4504  // Canonicalize, print (if requested), apply KMP_HW_SUBSET
4505  __kmp_topology->canonicalize();
4506  if (verbose)
4507  __kmp_topology->print(env_var);
4508  bool filtered = __kmp_topology->filter_hw_subset();
4509  if (filtered && verbose)
4510  __kmp_topology->print("KMP_HW_SUBSET");
4511  return success;
4512 }
4513 
4514 static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
4515  bool is_regular_affinity = (&affinity == &__kmp_affinity);
4516  bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
4517  const char *env_var = __kmp_get_affinity_env_var(affinity);
4518 
4519  if (affinity.flags.initialized) {
4520  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4521  return;
4522  }
4523 
4524  if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
4525  __kmp_aux_affinity_initialize_masks(affinity);
4526 
4527  if (is_regular_affinity && !__kmp_topology) {
4528  bool success = __kmp_aux_affinity_initialize_topology(affinity);
4529  if (success) {
4530  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4531  } else {
4532  affinity.type = affinity_none;
4533  KMP_AFFINITY_DISABLE();
4534  }
4535  }
4536 
4537  // If KMP_AFFINITY=none, then only create the single "none" place,
4538  // which is either the process's initial affinity mask or the full set of
4539  // hardware threads, depending on respect/norespect.
4540  if (affinity.type == affinity_none) {
4541  __kmp_create_affinity_none_places(affinity);
4542 #if KMP_USE_HIER_SCHED
4543  __kmp_dispatch_set_hierarchy_values();
4544 #endif
4545  affinity.flags.initialized = TRUE;
4546  return;
4547  }
4548 
4549  __kmp_topology->set_granularity(affinity);
4550  int depth = __kmp_topology->get_depth();
4551 
4552  // Create the table of masks, indexed by thread Id.
4553  unsigned numUnique;
4554  int numAddrs = __kmp_topology->get_num_hw_threads();
4555  // If OMP_PLACES=cores:<attribute> is specified, then attempt
4556  // to build the OS Id mask table using those attributes
4557  if (affinity.core_attr_gran.valid) {
4558  __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
4559  KMP_ASSERT(idx >= -1);
4560  for (int i = idx + 1; i < numAddrs; ++i)
4561  if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
4562  return i;
4563  return numAddrs;
4564  });
4565  if (!affinity.os_id_masks) {
4566  const char *core_attribute;
4567  if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
4568  core_attribute = "core_efficiency";
4569  else
4570  core_attribute = "core_type";
4571  KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
4572  core_attribute,
4573  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true))
4574  }
4575  }
4576  // If core attributes did not work, or none were specified,
4577  // then build the OS Id mask table in the typical incremental way.
4578  if (!affinity.os_id_masks) {
4579  __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
4580  KMP_ASSERT(idx >= -1);
4581  return idx + 1;
4582  });
4583  }
4584  if (affinity.gran_levels == 0) {
4585  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4586  }
4587 
4588  switch (affinity.type) {
4589 
4590  case affinity_explicit:
4591  KMP_DEBUG_ASSERT(affinity.proclist != NULL);
4592  if (is_hidden_helper_affinity ||
4593  __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
4594  __kmp_affinity_process_proclist(affinity);
4595  } else {
4596  __kmp_affinity_process_placelist(affinity);
4597  }
4598  if (affinity.num_masks == 0) {
4599  KMP_AFF_WARNING(affinity, AffNoValidProcID);
4600  affinity.type = affinity_none;
4601  __kmp_create_affinity_none_places(affinity);
4602  affinity.flags.initialized = TRUE;
4603  return;
4604  }
4605  break;
4606 
4607  // The other affinity types rely on sorting the hardware threads according to
4608  // some permutation of the machine topology tree. Set affinity.compact
4609  // and affinity.offset appropriately, then jump to a common code
4610  // fragment to do the sort and create the array of affinity masks.
4611  case affinity_logical:
4612  affinity.compact = 0;
4613  if (affinity.offset) {
4614  affinity.offset =
4615  __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4616  }
4617  goto sortTopology;
4618 
4619  case affinity_physical:
4620  if (__kmp_nThreadsPerCore > 1) {
4621  affinity.compact = 1;
4622  if (affinity.compact >= depth) {
4623  affinity.compact = 0;
4624  }
4625  } else {
4626  affinity.compact = 0;
4627  }
4628  if (affinity.offset) {
4629  affinity.offset =
4630  __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4631  }
4632  goto sortTopology;
4633 
4634  case affinity_scatter:
4635  if (affinity.compact >= depth) {
4636  affinity.compact = 0;
4637  } else {
4638  affinity.compact = depth - 1 - affinity.compact;
4639  }
4640  goto sortTopology;
4641 
4642  case affinity_compact:
4643  if (affinity.compact >= depth) {
4644  affinity.compact = depth - 1;
4645  }
4646  goto sortTopology;
4647 
4648  case affinity_balanced:
4649  if (depth <= 1 || is_hidden_helper_affinity) {
4650  KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4651  affinity.type = affinity_none;
4652  __kmp_create_affinity_none_places(affinity);
4653  affinity.flags.initialized = TRUE;
4654  return;
4655  } else if (!__kmp_topology->is_uniform()) {
4656  // Save the depth for further usage
4657  __kmp_aff_depth = depth;
4658 
4659  int core_level =
4660  __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
4661  int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
4662  core_level);
4663  int maxprocpercore = __kmp_affinity_max_proc_per_core(
4664  __kmp_avail_proc, depth - 1, core_level);
4665 
4666  int nproc = ncores * maxprocpercore;
4667  if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4668  KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4669  affinity.type = affinity_none;
4670  __kmp_create_affinity_none_places(affinity);
4671  affinity.flags.initialized = TRUE;
4672  return;
4673  }
4674 
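// Illustrative layout of procarr (hypothetical non-uniform machine): with two
// cores where core 0 holds OS procs 0 and 1 and core 1 holds only OS proc 2,
// maxprocpercore is 2 and the loop below fills procarr = { 0, 1, 2, -1 }.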
4675  procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4676  for (int i = 0; i < nproc; i++) {
4677  procarr[i] = -1;
4678  }
4679 
4680  int lastcore = -1;
4681  int inlastcore = 0;
4682  for (int i = 0; i < __kmp_avail_proc; i++) {
4683  int proc = __kmp_topology->at(i).os_id;
4684  int core = __kmp_affinity_find_core(i, depth - 1, core_level);
4685 
4686  if (core == lastcore) {
4687  inlastcore++;
4688  } else {
4689  inlastcore = 0;
4690  }
4691  lastcore = core;
4692 
4693  procarr[core * maxprocpercore + inlastcore] = proc;
4694  }
4695  }
4696  if (affinity.compact >= depth) {
4697  affinity.compact = depth - 1;
4698  }
4699 
4700  sortTopology:
4701  // Allocate the gtid->affinity mask table.
4702  if (affinity.flags.dups) {
4703  affinity.num_masks = __kmp_avail_proc;
4704  } else {
4705  affinity.num_masks = numUnique;
4706  }
4707 
4708  if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4709  (__kmp_affinity_num_places > 0) &&
4710  ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
4711  !is_hidden_helper_affinity) {
4712  affinity.num_masks = __kmp_affinity_num_places;
4713  }
4714 
4715  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4716 
4717  // Sort the topology table according to the current setting of
4718  // affinity.compact, then fill out affinity.masks.
4719  __kmp_topology->sort_compact(affinity);
4720  {
4721  int i;
4722  unsigned j;
4723  int num_hw_threads = __kmp_topology->get_num_hw_threads();
4724  kmp_full_mask_modifier_t full_mask;
4725  for (i = 0, j = 0; i < num_hw_threads; i++) {
4726  if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
4727  continue;
4728  }
4729  int osId = __kmp_topology->at(i).os_id;
4730 
4731  kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
4732  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
4733  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4734  KMP_CPU_COPY(dest, src);
4735  full_mask.include(src);
4736  if (++j >= affinity.num_masks) {
4737  break;
4738  }
4739  }
4740  KMP_DEBUG_ASSERT(j == affinity.num_masks);
4741  // See if the places list further restricts or changes the full mask
4742  if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
4743  __kmp_topology->print(env_var);
4744  }
4745  }
4746  // Sort the topology back using ids
4747  __kmp_topology->sort_ids();
4748  break;
4749 
4750  default:
4751  KMP_ASSERT2(0, "Unexpected affinity setting");
4752  }
4753  __kmp_aux_affinity_initialize_other_data(affinity);
4754  affinity.flags.initialized = TRUE;
4755 }
4756 
4757 void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
4758  // Much of the code above was written assuming that if a machine was not
4759  // affinity capable, then affinity type == affinity_none.
4760  // We now explicitly represent this as affinity type == affinity_disabled.
4761  // There are too many checks for affinity type == affinity_none in this code.
4762  // Instead of trying to change them all, check if
4763  // affinity type == affinity_disabled, and if so, slam it with affinity_none,
4764  // call the real initialization routine, then restore affinity type to
4765  // affinity_disabled.
4766  int disabled = (affinity.type == affinity_disabled);
4767  if (!KMP_AFFINITY_CAPABLE())
4768  KMP_ASSERT(disabled);
4769  if (disabled)
4770  affinity.type = affinity_none;
4771  __kmp_aux_affinity_initialize(affinity);
4772  if (disabled)
4773  affinity.type = affinity_disabled;
4774 }
4775 
4776 void __kmp_affinity_uninitialize(void) {
4777  for (kmp_affinity_t *affinity : __kmp_affinities) {
4778  if (affinity->masks != NULL)
4779  KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
4780  if (affinity->os_id_masks != NULL)
4781  KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
4782  if (affinity->proclist != NULL)
4783  __kmp_free(affinity->proclist);
4784  if (affinity->ids != NULL)
4785  __kmp_free(affinity->ids);
4786  if (affinity->attrs != NULL)
4787  __kmp_free(affinity->attrs);
4788  *affinity = KMP_AFFINITY_INIT(affinity->env_var);
4789  }
4790  if (__kmp_affin_origMask != NULL) {
4791  if (KMP_AFFINITY_CAPABLE()) {
4792  __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
4793  }
4794  KMP_CPU_FREE(__kmp_affin_origMask);
4795  __kmp_affin_origMask = NULL;
4796  }
4797  __kmp_affinity_num_places = 0;
4798  if (procarr != NULL) {
4799  __kmp_free(procarr);
4800  procarr = NULL;
4801  }
4802  if (__kmp_osid_to_hwthread_map) {
4803  __kmp_free(__kmp_osid_to_hwthread_map);
4804  __kmp_osid_to_hwthread_map = NULL;
4805  }
4806 #if KMP_USE_HWLOC
4807  if (__kmp_hwloc_topology != NULL) {
4808  hwloc_topology_destroy(__kmp_hwloc_topology);
4809  __kmp_hwloc_topology = NULL;
4810  }
4811 #endif
4812  if (__kmp_hw_subset) {
4813  kmp_hw_subset_t::deallocate(__kmp_hw_subset);
4814  __kmp_hw_subset = nullptr;
4815  }
4816  if (__kmp_topology) {
4817  kmp_topology_t::deallocate(__kmp_topology);
4818  __kmp_topology = nullptr;
4819  }
4820  KMPAffinity::destroy_api();
4821 }
4822 
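// Illustrative example for the gtid -> place mapping below: with
// affinity->num_masks == 4 and affinity->offset == 1, regular threads whose
// adjusted indices are 0, 1, 2, 3 are assigned places 1, 2, 3, 0; hidden
// helper workers start at mask_idx = gtid - 2.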
4823 static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
4824  int *place, kmp_affin_mask_t **mask) {
4825  int mask_idx;
4826  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4827  if (is_hidden_helper)
4828  // The first gtid is the regular primary thread, the second gtid is the main
4829  // thread of the hidden team, which does not participate in task execution.
4830  mask_idx = gtid - 2;
4831  else
4832  mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
4833  KMP_DEBUG_ASSERT(affinity->num_masks > 0);
4834  *place = (mask_idx + affinity->offset) % affinity->num_masks;
4835  *mask = KMP_CPU_INDEX(affinity->masks, *place);
4836 }
4837 
4838 // This function initializes the per-thread affinity data, including the
4839 // thread's affinity mask and topology information.
4840 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4841 
4842  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4843 
4844  // Set the thread topology information to default of unknown
4845  for (int id = 0; id < KMP_HW_LAST; ++id)
4846  th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
4847  th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4848 
4849  if (!KMP_AFFINITY_CAPABLE()) {
4850  return;
4851  }
4852 
4853  if (th->th.th_affin_mask == NULL) {
4854  KMP_CPU_ALLOC(th->th.th_affin_mask);
4855  } else {
4856  KMP_CPU_ZERO(th->th.th_affin_mask);
4857  }
4858 
4859  // Copy the thread mask to the kmp_info_t structure. If
4860  // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e. one
4861  // that has all of the OS proc ids set. If __kmp_affinity.flags.respect
4862  // is set, the full mask is instead the same as the mask of the
4863  // initialization thread.
4864  kmp_affin_mask_t *mask;
4865  int i;
4866  const kmp_affinity_t *affinity;
4867  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4868 
4869  if (is_hidden_helper)
4870  affinity = &__kmp_hh_affinity;
4871  else
4872  affinity = &__kmp_affinity;
4873 
4874  if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
4875  if ((affinity->type == affinity_none) ||
4876  (affinity->type == affinity_balanced) ||
4877  KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
4878 #if KMP_GROUP_AFFINITY
4879  if (__kmp_num_proc_groups > 1) {
4880  return;
4881  }
4882 #endif
4883  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4884  i = 0;
4885  mask = __kmp_affin_fullMask;
4886  } else {
4887  __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
4888  }
4889  } else {
4890  if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
4891 #if KMP_GROUP_AFFINITY
4892  if (__kmp_num_proc_groups > 1) {
4893  return;
4894  }
4895 #endif
4896  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4897  i = KMP_PLACE_ALL;
4898  mask = __kmp_affin_fullMask;
4899  } else {
4900  __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
4901  }
4902  }
4903 
4904  th->th.th_current_place = i;
4905  if (isa_root && !is_hidden_helper) {
4906  th->th.th_new_place = i;
4907  th->th.th_first_place = 0;
4908  th->th.th_last_place = affinity->num_masks - 1;
4909  } else if (KMP_AFFINITY_NON_PROC_BIND) {
4910  // When using a Non-OMP_PROC_BIND affinity method,
4911  // set all threads' place-partition-var to the entire place list
4912  th->th.th_first_place = 0;
4913  th->th.th_last_place = affinity->num_masks - 1;
4914  }
4915  // Copy topology information associated with the place
4916  if (i >= 0) {
4917  th->th.th_topology_ids = __kmp_affinity.ids[i];
4918  th->th.th_topology_attrs = __kmp_affinity.attrs[i];
4919  }
4920 
4921  if (i == KMP_PLACE_ALL) {
4922  KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
4923  gtid));
4924  } else {
4925  KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
4926  gtid, i));
4927  }
4928 
4929  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4930 }
4931 
4932 void __kmp_affinity_bind_init_mask(int gtid) {
4933  if (!KMP_AFFINITY_CAPABLE()) {
4934  return;
4935  }
4936  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4937  const kmp_affinity_t *affinity;
4938  const char *env_var;
4939  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4940 
4941  if (is_hidden_helper)
4942  affinity = &__kmp_hh_affinity;
4943  else
4944  affinity = &__kmp_affinity;
4945  env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
4946  /* to avoid duplicate printing (will be correctly printed on barrier) */
4947  if (affinity->flags.verbose && (affinity->type == affinity_none ||
4948  (th->th.th_current_place != KMP_PLACE_ALL &&
4949  affinity->type != affinity_balanced)) &&
4950  !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
4951  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4952  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4953  th->th.th_affin_mask);
4954  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
4955  gtid, buf);
4956  }
4957 
4958 #if KMP_OS_WINDOWS
4959  // On Windows* OS, the process affinity mask might have changed. If the user
4960  // didn't request affinity and this call fails, just continue silently.
4961  // See CQ171393.
4962  if (affinity->type == affinity_none) {
4963  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4964  } else
4965 #endif
4966  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4967 }
4968 
4969 void __kmp_affinity_bind_place(int gtid) {
4970  // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
4971  if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
4972  return;
4973  }
4974 
4975  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4976 
4977  KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
4978  "place = %d)\n",
4979  gtid, th->th.th_new_place, th->th.th_current_place));
4980 
4981  // Check that the new place is within this thread's partition.
4982  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4983  KMP_ASSERT(th->th.th_new_place >= 0);
4984  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
4985  if (th->th.th_first_place <= th->th.th_last_place) {
4986  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4987  (th->th.th_new_place <= th->th.th_last_place));
4988  } else {
4989  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4990  (th->th.th_new_place >= th->th.th_last_place));
4991  }
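  // A wrapped partition is possible here: e.g. th_first_place == 6 and
  // th_last_place == 1 describes the places {6, 7, 0, 1}, so the sanity
  // check above has to accept values on either side of the wrap point.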
4992 
4993  // Copy the thread mask to the kmp_info_t structure,
4994  // and set this thread's affinity.
4995  kmp_affin_mask_t *mask =
4996  KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
4997  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4998  th->th.th_current_place = th->th.th_new_place;
4999 
5000  if (__kmp_affinity.flags.verbose) {
5001  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5002  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5003  th->th.th_affin_mask);
5004  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
5005  __kmp_gettid(), gtid, buf);
5006  }
5007  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5008 }
5009 
5010 int __kmp_aux_set_affinity(void **mask) {
5011  int gtid;
5012  kmp_info_t *th;
5013  int retval;
5014 
5015  if (!KMP_AFFINITY_CAPABLE()) {
5016  return -1;
5017  }
5018 
5019  gtid = __kmp_entry_gtid();
5020  KA_TRACE(
5021  1000, (""); {
5022  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5023  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5024  (kmp_affin_mask_t *)(*mask));
5025  __kmp_debug_printf(
5026  "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
5027  gtid, buf);
5028  });
5029 
5030  if (__kmp_env_consistency_check) {
5031  if ((mask == NULL) || (*mask == NULL)) {
5032  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5033  } else {
5034  unsigned proc;
5035  int num_procs = 0;
5036 
5037  KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
5038  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5039  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5040  }
5041  if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
5042  continue;
5043  }
5044  num_procs++;
5045  }
5046  if (num_procs == 0) {
5047  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5048  }
5049 
5050 #if KMP_GROUP_AFFINITY
5051  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
5052  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5053  }
5054 #endif /* KMP_GROUP_AFFINITY */
5055  }
5056  }
5057 
5058  th = __kmp_threads[gtid];
5059  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5060  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5061  if (retval == 0) {
5062  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
5063  }
5064 
5065  th->th.th_current_place = KMP_PLACE_UNDEFINED;
5066  th->th.th_new_place = KMP_PLACE_UNDEFINED;
5067  th->th.th_first_place = 0;
5068  th->th.th_last_place = __kmp_affinity.num_masks - 1;
5069 
5070  // Turn off 4.0 affinity for the current thread at this parallel level.
5071  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
5072 
5073  return retval;
5074 }
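// Illustrative sketch (not compiled here): how a user program might reach
// __kmp_aux_set_affinity through the kmp_* affinity API declared in omp.h.
// Error handling is elided and the helper name is made up for the example.
//
//   #include <omp.h>
//   void pin_self_to_proc(int proc) {
//     kmp_affinity_mask_t mask;
//     kmp_create_affinity_mask(&mask);         // start from an empty mask
//     kmp_set_affinity_mask_proc(proc, &mask); // add a single OS proc id
//     if (kmp_set_affinity(&mask) != 0) {
//       // proc was outside the full mask, or the OS call failed
//     }
//     kmp_destroy_affinity_mask(&mask);
//   }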
5075 
5076 int __kmp_aux_get_affinity(void **mask) {
5077  int gtid;
5078  int retval;
5079 #if KMP_OS_WINDOWS || KMP_DEBUG
5080  kmp_info_t *th;
5081 #endif
5082  if (!KMP_AFFINITY_CAPABLE()) {
5083  return -1;
5084  }
5085 
5086  gtid = __kmp_entry_gtid();
5087 #if KMP_OS_WINDOWS || KMP_DEBUG
5088  th = __kmp_threads[gtid];
5089 #else
5090  (void)gtid; // unused variable
5091 #endif
5092  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5093 
5094  KA_TRACE(
5095  1000, (""); {
5096  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5097  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5098  th->th.th_affin_mask);
5099  __kmp_printf(
5100  "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
5101  buf);
5102  });
5103 
5104  if (__kmp_env_consistency_check) {
5105  if ((mask == NULL) || (*mask == NULL)) {
5106  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
5107  }
5108  }
5109 
5110 #if !KMP_OS_WINDOWS
5111 
5112  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5113  KA_TRACE(
5114  1000, (""); {
5115  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5116  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5117  (kmp_affin_mask_t *)(*mask));
5118  __kmp_printf(
5119  "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
5120  buf);
5121  });
5122  return retval;
5123 
5124 #else
5125  (void)retval;
5126 
5127  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
5128  return 0;
5129 
5130 #endif /* KMP_OS_WINDOWS */
5131 }
5132 
5133 int __kmp_aux_get_affinity_max_proc() {
5134  if (!KMP_AFFINITY_CAPABLE()) {
5135  return 0;
5136  }
5137 #if KMP_GROUP_AFFINITY
5138  if (__kmp_num_proc_groups > 1) {
5139  return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
5140  }
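  // Illustrative arithmetic: on 64-bit Windows DWORD_PTR is 8 bytes, so with
  // __kmp_num_proc_groups == 2 the reported maximum is 2 * 8 * CHAR_BIT = 128.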
5141 #endif
5142  return __kmp_xproc;
5143 }
5144 
5145 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
5146  if (!KMP_AFFINITY_CAPABLE()) {
5147  return -1;
5148  }
5149 
5150  KA_TRACE(
5151  1000, (""); {
5152  int gtid = __kmp_entry_gtid();
5153  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5154  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5155  (kmp_affin_mask_t *)(*mask));
5156  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
5157  "affinity mask for thread %d = %s\n",
5158  proc, gtid, buf);
5159  });
5160 
5161  if (__kmp_env_consistency_check) {
5162  if ((mask == NULL) || (*mask == NULL)) {
5163  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
5164  }
5165  }
5166 
5167  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5168  return -1;
5169  }
5170  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5171  return -2;
5172  }
5173 
5174  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
5175  return 0;
5176 }
5177 
5178 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
5179  if (!KMP_AFFINITY_CAPABLE()) {
5180  return -1;
5181  }
5182 
5183  KA_TRACE(
5184  1000, (""); {
5185  int gtid = __kmp_entry_gtid();
5186  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5187  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5188  (kmp_affin_mask_t *)(*mask));
5189  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
5190  "affinity mask for thread %d = %s\n",
5191  proc, gtid, buf);
5192  });
5193 
5194  if (__kmp_env_consistency_check) {
5195  if ((mask == NULL) || (*mask == NULL)) {
5196  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
5197  }
5198  }
5199 
5200  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5201  return -1;
5202  }
5203  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5204  return -2;
5205  }
5206 
5207  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5208  return 0;
5209 }
5210 
5211 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5212  if (!KMP_AFFINITY_CAPABLE()) {
5213  return -1;
5214  }
5215 
5216  KA_TRACE(
5217  1000, (""); {
5218  int gtid = __kmp_entry_gtid();
5219  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5220  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5221  (kmp_affin_mask_t *)(*mask));
5222  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5223  "affinity mask for thread %d = %s\n",
5224  proc, gtid, buf);
5225  });
5226 
5227  if (__kmp_env_consistency_check) {
5228  if ((mask == NULL) || (*mask == NULL)) {
5229  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5230  }
5231  }
5232 
5233  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5234  return -1;
5235  }
5236  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5237  return 0;
5238  }
5239 
5240  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5241 }
5242 
5243 // Dynamic affinity settings - Affinity balanced
5244 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
5245  KMP_DEBUG_ASSERT(th);
5246  bool fine_gran = true;
5247  int tid = th->th.th_info.ds.ds_tid;
5248  const char *env_var = "KMP_AFFINITY";
5249 
5250  // Do not perform balanced affinity for the hidden helper threads
5251  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
5252  return;
5253 
5254  switch (__kmp_affinity.gran) {
5255  case KMP_HW_THREAD:
5256  break;
5257  case KMP_HW_CORE:
5258  if (__kmp_nThreadsPerCore > 1) {
5259  fine_gran = false;
5260  }
5261  break;
5262  case KMP_HW_SOCKET:
5263  if (nCoresPerPkg > 1) {
5264  fine_gran = false;
5265  }
5266  break;
5267  default:
5268  fine_gran = false;
5269  }
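  // At this point fine_gran == true means the thread will be pinned to a
  // single hardware thread below; otherwise it is bound to every context of
  // its core (or of the coarser grouping selected by the granularity).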
5270 
5271  if (__kmp_topology->is_uniform()) {
5272  int coreID;
5273  int threadID;
5274  // Number of hyper-threads per core on an HT machine
5275  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5276  // Number of cores
5277  int ncores = __kmp_ncores;
5278  if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5279  __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5280  ncores = nPackages;
5281  }
5282  // How many threads will be bound to each core
5283  int chunk = nthreads / ncores;
5284  // How many cores will have an additional thread bound to them - the "big cores"
5285  int big_cores = nthreads % ncores;
5286  // Number of threads on the big cores
5287  int big_nth = (chunk + 1) * big_cores;
5288  if (tid < big_nth) {
5289  coreID = tid / (chunk + 1);
5290  threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5291  } else { // tid >= big_nth
5292  coreID = (tid - big_cores) / chunk;
5293  threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5294  }
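  // Worked example (illustrative): nthreads == 10 on ncores == 4 gives
  // chunk == 2, big_cores == 2 and big_nth == 6, so tids 0-5 share the two
  // "big" cores (3 threads each) and tids 6-9 share the remaining two cores
  // (2 threads each).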
5295  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5296  "Illegal set affinity operation when not capable");
5297 
5298  kmp_affin_mask_t *mask = th->th.th_affin_mask;
5299  KMP_CPU_ZERO(mask);
5300 
5301  if (fine_gran) {
5302  int osID =
5303  __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
5304  KMP_CPU_SET(osID, mask);
5305  } else {
5306  for (int i = 0; i < __kmp_nth_per_core; i++) {
5307  int osID;
5308  osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
5309  KMP_CPU_SET(osID, mask);
5310  }
5311  }
5312  if (__kmp_affinity.flags.verbose) {
5313  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5314  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5315  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5316  tid, buf);
5317  }
5318  __kmp_affinity_get_thread_topology_info(th);
5319  __kmp_set_system_affinity(mask, TRUE);
5320  } else { // Non-uniform topology
5321 
5322  kmp_affin_mask_t *mask = th->th.th_affin_mask;
5323  KMP_CPU_ZERO(mask);
5324 
5325  int core_level =
5326  __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
5327  int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
5328  __kmp_aff_depth - 1, core_level);
5329  int nth_per_core = __kmp_affinity_max_proc_per_core(
5330  __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5331 
5332  // For a performance gain, handle the special case nthreads ==
5333  // __kmp_avail_proc separately
5334  if (nthreads == __kmp_avail_proc) {
5335  if (fine_gran) {
5336  int osID = __kmp_topology->at(tid).os_id;
5337  KMP_CPU_SET(osID, mask);
5338  } else {
5339  int core =
5340  __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
5341  for (int i = 0; i < __kmp_avail_proc; i++) {
5342  int osID = __kmp_topology->at(i).os_id;
5343  if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
5344  core) {
5345  KMP_CPU_SET(osID, mask);
5346  }
5347  }
5348  }
5349  } else if (nthreads <= ncores) {
5350 
5351  int core = 0;
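  // Walk the cores that still have processors listed in procarr[]; the
  // tid-th such core receives this thread.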
5352  for (int i = 0; i < ncores; i++) {
5353  // Check if this core from procarr[] is in the mask
5354  int in_mask = 0;
5355  for (int j = 0; j < nth_per_core; j++) {
5356  if (procarr[i * nth_per_core + j] != -1) {
5357  in_mask = 1;
5358  break;
5359  }
5360  }
5361  if (in_mask) {
5362  if (tid == core) {
5363  for (int j = 0; j < nth_per_core; j++) {
5364  int osID = procarr[i * nth_per_core + j];
5365  if (osID != -1) {
5366  KMP_CPU_SET(osID, mask);
5367  // For fine granularity it is enough to set the first available
5368  // osID for this core
5369  if (fine_gran) {
5370  break;
5371  }
5372  }
5373  }
5374  break;
5375  } else {
5376  core++;
5377  }
5378  }
5379  }
5380  } else { // nthreads > ncores
5381  // Array to save the number of processors at each core
5382  int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5383  // Array to save the number of cores with "x" available processors
5384  int *ncores_with_x_procs =
5385  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5386  // Array to save the number of cores with # procs from x to nth_per_core
5387  int *ncores_with_x_to_max_procs =
5388  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5389 
5390  for (int i = 0; i <= nth_per_core; i++) {
5391  ncores_with_x_procs[i] = 0;
5392  ncores_with_x_to_max_procs[i] = 0;
5393  }
5394 
5395  for (int i = 0; i < ncores; i++) {
5396  int cnt = 0;
5397  for (int j = 0; j < nth_per_core; j++) {
5398  if (procarr[i * nth_per_core + j] != -1) {
5399  cnt++;
5400  }
5401  }
5402  nproc_at_core[i] = cnt;
5403  ncores_with_x_procs[cnt]++;
5404  }
5405 
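  // Build a suffix sum: ncores_with_x_to_max_procs[i] counts the cores that
  // have at least i available processors.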
5406  for (int i = 0; i <= nth_per_core; i++) {
5407  for (int j = i; j <= nth_per_core; j++) {
5408  ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5409  }
5410  }
5411 
5412  // Max number of processors
5413  int nproc = nth_per_core * ncores;
5414  // An array to keep the number of threads assigned to each context
5415  int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5416  for (int i = 0; i < nproc; i++) {
5417  newarr[i] = 0;
5418  }
5419 
5420  int nth = nthreads;
5421  int flag = 0;
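  // Greedy distribution: on the first sweep (flag == 0) every available
  // context gets at most one thread, visiting the cores once per context
  // slot j; on later sweeps (flag == 1) the remaining threads are stacked
  // onto contexts that already have one. newarr[i] ends up holding the
  // number of threads assigned to context i.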
5422  while (nth > 0) {
5423  for (int j = 1; j <= nth_per_core; j++) {
5424  int cnt = ncores_with_x_to_max_procs[j];
5425  for (int i = 0; i < ncores; i++) {
5426  // Skip cores with no available processors
5427  if (nproc_at_core[i] == 0) {
5428  continue;
5429  }
5430  for (int k = 0; k < nth_per_core; k++) {
5431  if (procarr[i * nth_per_core + k] != -1) {
5432  if (newarr[i * nth_per_core + k] == 0) {
5433  newarr[i * nth_per_core + k] = 1;
5434  cnt--;
5435  nth--;
5436  break;
5437  } else {
5438  if (flag != 0) {
5439  newarr[i * nth_per_core + k]++;
5440  cnt--;
5441  nth--;
5442  break;
5443  }
5444  }
5445  }
5446  }
5447  if (cnt == 0 || nth == 0) {
5448  break;
5449  }
5450  }
5451  if (nth == 0) {
5452  break;
5453  }
5454  }
5455  flag = 1;
5456  }
5457  int sum = 0;
5458  for (int i = 0; i < nproc; i++) {
5459  sum += newarr[i];
5460  if (sum > tid) {
5461  if (fine_gran) {
5462  int osID = procarr[i];
5463  KMP_CPU_SET(osID, mask);
5464  } else {
5465  int coreID = i / nth_per_core;
5466  for (int ii = 0; ii < nth_per_core; ii++) {
5467  int osID = procarr[coreID * nth_per_core + ii];
5468  if (osID != -1) {
5469  KMP_CPU_SET(osID, mask);
5470  }
5471  }
5472  }
5473  break;
5474  }
5475  }
5476  __kmp_free(newarr);
5477  }
5478 
5479  if (__kmp_affinity.flags.verbose) {
5480  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5481  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5482  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5483  tid, buf);
5484  }
5485  __kmp_affinity_get_thread_topology_info(th);
5486  __kmp_set_system_affinity(mask, TRUE);
5487  }
5488 }
5489 
5490 #if KMP_OS_LINUX || KMP_OS_FREEBSD
5491 // We don't need this entry point on Windows because
5492 // the GetProcessAffinityMask() API is available there.
5493 //
5494 // The intended usage is indicated by these steps:
5495 // 1) The user gets the current affinity mask
5496 // 2) Then sets the affinity by calling this function
5497 // 3) Error check the return value
5498 // 4) Use non-OpenMP parallelization
5499 // 5) Reset the affinity to what was stored in step 1)
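//
// A sketch of that sequence on Linux (illustrative only; needs _GNU_SOURCE
// and <sched.h>, error handling is elided, and
// run_external_threaded_library() is a placeholder):
//
//   cpu_set_t saved;
//   sched_getaffinity(0, sizeof(saved), &saved);        // step 1
//   if (kmp_set_thread_affinity_mask_initial() == 0) {  // steps 2 and 3
//     run_external_threaded_library();                  // step 4
//   }
//   sched_setaffinity(0, sizeof(saved), &saved);        // step 5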
5500 #ifdef __cplusplus
5501 extern "C"
5502 #endif
5503  int
5504  kmp_set_thread_affinity_mask_initial()
5505 // the function returns 0 on success,
5506 // -1 if we cannot bind the thread
5507 // >0 (errno) if an error happened during binding
5508 {
5509  int gtid = __kmp_get_gtid();
5510  if (gtid < 0) {
5511  // Do not touch non-omp threads
5512  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5513  "non-omp thread, returning\n"));
5514  return -1;
5515  }
5516  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5517  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5518  "affinity not initialized, returning\n"));
5519  return -1;
5520  }
5521  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5522  "set full mask for thread %d\n",
5523  gtid));
5524  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5525  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5526 }
5527 #endif
5528 
5529 #endif // KMP_AFFINITY_SUPPORTED