LLVM OpenMP* Runtime Library
kmp_affinity.h
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
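// Affinity implementation layered on the hwloc library: each Mask wraps an
// hwloc_cpuset_t, and affinity is queried/applied through
// hwloc_get_cpubind()/hwloc_set_cpubind() on __kmp_hwloc_topology.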
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24  class Mask : public KMPAffinity::Mask {
25  hwloc_cpuset_t mask;
26 
27  public:
28  Mask() {
29  mask = hwloc_bitmap_alloc();
30  this->zero();
31  }
32  ~Mask() { hwloc_bitmap_free(mask); }
33  void set(int i) override { hwloc_bitmap_set(mask, i); }
34  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36  void zero() override { hwloc_bitmap_zero(mask); }
37  bool empty() const override { return hwloc_bitmap_iszero(mask); }
38  void copy(const KMPAffinity::Mask *src) override {
39  const Mask *convert = static_cast<const Mask *>(src);
40  hwloc_bitmap_copy(mask, convert->mask);
41  }
42  void bitwise_and(const KMPAffinity::Mask *rhs) override {
43  const Mask *convert = static_cast<const Mask *>(rhs);
44  hwloc_bitmap_and(mask, mask, convert->mask);
45  }
46  void bitwise_or(const KMPAffinity::Mask *rhs) override {
47  const Mask *convert = static_cast<const Mask *>(rhs);
48  hwloc_bitmap_or(mask, mask, convert->mask);
49  }
50  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
51  bool is_equal(const KMPAffinity::Mask *rhs) const override {
52  const Mask *convert = static_cast<const Mask *>(rhs);
53  return hwloc_bitmap_isequal(mask, convert->mask);
54  }
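// Set-bit iteration: begin() is the first set bit, next(previous) the
// following one; hwloc returns -1 past the last set bit, which matches end().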
55  int begin() const override { return hwloc_bitmap_first(mask); }
56  int end() const override { return -1; }
57  int next(int previous) const override {
58  return hwloc_bitmap_next(mask, previous);
59  }
60  int get_system_affinity(bool abort_on_error) override {
61  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
62  "Illegal get affinity operation when not capable");
63  long retval =
64  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
65  if (retval >= 0) {
66  return 0;
67  }
68  int error = errno;
69  if (abort_on_error) {
70  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
71  KMP_ERR(error), __kmp_msg_null);
72  }
73  return error;
74  }
75  int set_system_affinity(bool abort_on_error) const override {
76  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
77  "Illegal set affinity operation when not capable");
78  long retval =
79  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
80  if (retval >= 0) {
81  return 0;
82  }
83  int error = errno;
84  if (abort_on_error) {
85  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
86  KMP_ERR(error), __kmp_msg_null);
87  }
88  return error;
89  }
90 #if KMP_OS_WINDOWS
91  int set_process_affinity(bool abort_on_error) const override {
92  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
93  "Illegal set process affinity operation when not capable");
94  int error = 0;
95  const hwloc_topology_support *support =
96  hwloc_topology_get_support(__kmp_hwloc_topology);
97  if (support->cpubind->set_proc_cpubind) {
98  int retval;
99  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
100  HWLOC_CPUBIND_PROCESS);
101  if (retval >= 0)
102  return 0;
103  error = errno;
104  if (abort_on_error)
105  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
106  KMP_ERR(error), __kmp_msg_null);
107  }
108  return error;
109  }
110 #endif
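// Report which Windows processor group this mask falls in: the group index
// if exactly one group is covered, -1 if the mask spans several groups or is
// empty, and 1 on single-group machines.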
111  int get_proc_group() const override {
112  int group = -1;
113 #if KMP_OS_WINDOWS
114  if (__kmp_num_proc_groups == 1) {
115  return 1;
116  }
117  for (int i = 0; i < __kmp_num_proc_groups; i++) {
118  // On Windows, the long type is always 32 bits
119  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
120  unsigned long second_32_bits =
121  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
122  if (first_32_bits == 0 && second_32_bits == 0) {
123  continue;
124  }
125  if (group >= 0) {
126  return -1;
127  }
128  group = i;
129  }
130 #endif /* KMP_OS_WINDOWS */
131  return group;
132  }
133  };
134  void determine_capable(const char *var) override {
135  const hwloc_topology_support *topology_support;
136  if (__kmp_hwloc_topology == NULL) {
137  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
138  __kmp_hwloc_error = TRUE;
139  if (__kmp_affinity.flags.verbose) {
140  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
141  }
142  }
143  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
144  __kmp_hwloc_error = TRUE;
145  if (__kmp_affinity.flags.verbose) {
146  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
147  }
148  }
149  }
150  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
151  // Is the system capable of setting/getting this thread's affinity?
152  // Also, is topology discovery possible? (pu indicates ability to discover
153  // processing units). And finally, were there no errors when calling any
154  // hwloc_* API functions?
155  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
156  topology_support->cpubind->get_thisthread_cpubind &&
157  topology_support->discovery->pu && !__kmp_hwloc_error) {
158  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
159  KMP_AFFINITY_ENABLE(TRUE);
160  } else {
161  // indicate that hwloc didn't work and disable affinity
162  __kmp_hwloc_error = TRUE;
163  KMP_AFFINITY_DISABLE();
164  }
165  }
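// Bind the calling thread to OS proc 'which' by building a one-bit mask and
// installing it as the thread's system affinity.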
166  void bind_thread(int which) override {
167  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
168  "Illegal set affinity operation when not capable");
169  KMPAffinity::Mask *mask;
170  KMP_CPU_ALLOC_ON_STACK(mask);
171  KMP_CPU_ZERO(mask);
172  KMP_CPU_SET(which, mask);
173  __kmp_set_system_affinity(mask, TRUE);
174  KMP_CPU_FREE_FROM_STACK(mask);
175  }
176  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
177  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
178  KMPAffinity::Mask *allocate_mask_array(int num) override {
179  return new Mask[num];
180  }
181  void deallocate_mask_array(KMPAffinity::Mask *array) override {
182  Mask *hwloc_array = static_cast<Mask *>(array);
183  delete[] hwloc_array;
184  }
185  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
186  int index) override {
187  Mask *hwloc_array = static_cast<Mask *>(array);
188  return &(hwloc_array[index]);
189  }
190  api_type get_api_type() const override { return HWLOC; }
191 };
192 #endif /* KMP_USE_HWLOC */
193 
194 #if KMP_OS_LINUX || KMP_OS_FREEBSD
195 #if KMP_OS_LINUX
196 /* On some of the older OSes that we build on, these constants aren't present
197  in <asm/unistd.h>, which is #included from <sys/syscall.h>. They must be the
198  same on all systems of the same arch where they are defined, and they cannot
199  change; they are set in stone forever. */
200 #include <sys/syscall.h>
201 #if KMP_ARCH_X86 || KMP_ARCH_ARM
202 #ifndef __NR_sched_setaffinity
203 #define __NR_sched_setaffinity 241
204 #elif __NR_sched_setaffinity != 241
205 #error Wrong code for setaffinity system call.
206 #endif /* __NR_sched_setaffinity */
207 #ifndef __NR_sched_getaffinity
208 #define __NR_sched_getaffinity 242
209 #elif __NR_sched_getaffinity != 242
210 #error Wrong code for getaffinity system call.
211 #endif /* __NR_sched_getaffinity */
212 #elif KMP_ARCH_AARCH64
213 #ifndef __NR_sched_setaffinity
214 #define __NR_sched_setaffinity 122
215 #elif __NR_sched_setaffinity != 122
216 #error Wrong code for setaffinity system call.
217 #endif /* __NR_sched_setaffinity */
218 #ifndef __NR_sched_getaffinity
219 #define __NR_sched_getaffinity 123
220 #elif __NR_sched_getaffinity != 123
221 #error Wrong code for getaffinity system call.
222 #endif /* __NR_sched_getaffinity */
234 #elif KMP_ARCH_X86_64
235 #ifndef __NR_sched_setaffinity
236 #define __NR_sched_setaffinity 203
237 #elif __NR_sched_setaffinity != 203
238 #error Wrong code for setaffinity system call.
239 #endif /* __NR_sched_setaffinity */
240 #ifndef __NR_sched_getaffinity
241 #define __NR_sched_getaffinity 204
242 #elif __NR_sched_getaffinity != 204
243 #error Wrong code for getaffinity system call.
244 #endif /* __NR_sched_getaffinity */
245 #elif KMP_ARCH_PPC64
246 #ifndef __NR_sched_setaffinity
247 #define __NR_sched_setaffinity 222
248 #elif __NR_sched_setaffinity != 222
249 #error Wrong code for setaffinity system call.
250 #endif /* __NR_sched_setaffinity */
251 #ifndef __NR_sched_getaffinity
252 #define __NR_sched_getaffinity 223
253 #elif __NR_sched_getaffinity != 223
254 #error Wrong code for getaffinity system call.
255 #endif /* __NR_sched_getaffinity */
256 #elif KMP_ARCH_MIPS
257 #ifndef __NR_sched_setaffinity
258 #define __NR_sched_setaffinity 4239
259 #elif __NR_sched_setaffinity != 4239
260 #error Wrong code for setaffinity system call.
261 #endif /* __NR_sched_setaffinity */
262 #ifndef __NR_sched_getaffinity
263 #define __NR_sched_getaffinity 4240
264 #elif __NR_sched_getaffinity != 4240
265 #error Wrong code for getaffinity system call.
266 #endif /* __NR_sched_getaffinity */
267 #elif KMP_ARCH_MIPS64
268 #ifndef __NR_sched_setaffinity
269 #define __NR_sched_setaffinity 5195
270 #elif __NR_sched_setaffinity != 5195
271 #error Wrong code for setaffinity system call.
272 #endif /* __NR_sched_setaffinity */
273 #ifndef __NR_sched_getaffinity
274 #define __NR_sched_getaffinity 5196
275 #elif __NR_sched_getaffinity != 5196
276 #error Wrong code for getaffinity system call.
277 #endif /* __NR_sched_getaffinity */
278 #elif KMP_ARCH_LOONGARCH64
279 #ifndef __NR_sched_setaffinity
280 #define __NR_sched_setaffinity 122
281 #elif __NR_sched_setaffinity != 122
282 #error Wrong code for setaffinity system call.
283 #endif /* __NR_sched_setaffinity */
284 #ifndef __NR_sched_getaffinity
285 #define __NR_sched_getaffinity 123
286 #elif __NR_sched_getaffinity != 123
287 #error Wrong code for getaffinity system call.
288 #endif /* __NR_sched_getaffinity */
289 #elif KMP_ARCH_RISCV64
290 #ifndef __NR_sched_setaffinity
291 #define __NR_sched_setaffinity 122
292 #elif __NR_sched_setaffinity != 122
293 #error Wrong code for setaffinity system call.
294 #endif /* __NR_sched_setaffinity */
295 #ifndef __NR_sched_getaffinity
296 #define __NR_sched_getaffinity 123
297 #elif __NR_sched_getaffinity != 123
298 #error Wrong code for getaffinity system call.
299 #endif /* __NR_sched_getaffinity */
300 #elif KMP_ARCH_VE
301 #ifndef __NR_sched_setaffinity
302 #define __NR_sched_setaffinity 203
303 #elif __NR_sched_setaffinity != 203
304 #error Wrong code for setaffinity system call.
305 #endif /* __NR_sched_setaffinity */
306 #ifndef __NR_sched_getaffinity
307 #define __NR_sched_getaffinity 204
308 #elif __NR_sched_getaffinity != 204
309 #error Wrong code for getaffinity system call.
310 #endif /* __NR_sched_getaffinity */
311 #else
312 #error Unknown or unsupported architecture
313 #endif /* KMP_ARCH_* */
314 #elif KMP_OS_FREEBSD
315 #include <pthread.h>
316 #include <pthread_np.h>
317 #endif
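// Native affinity implementation for Linux and FreeBSD: each Mask is a flat
// bit vector of __kmp_affin_mask_size bytes, applied via the
// sched_{set,get}affinity syscalls on Linux or pthread_{set,get}affinity_np()
// on FreeBSD.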
318 class KMPNativeAffinity : public KMPAffinity {
319  class Mask : public KMPAffinity::Mask {
320  typedef unsigned long mask_t;
321  typedef decltype(__kmp_affin_mask_size) mask_size_type;
322  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
323  static const mask_t ONE = 1;
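// Number of mask_t words backing the flat affinity bit vector.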
324  mask_size_type get_num_mask_types() const {
325  return __kmp_affin_mask_size / sizeof(mask_t);
326  }
327 
328  public:
329  mask_t *mask;
330  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
331  ~Mask() {
332  if (mask)
333  __kmp_free(mask);
334  }
335  void set(int i) override {
336  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
337  }
338  bool is_set(int i) const override {
339  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
340  }
341  void clear(int i) override {
342  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
343  }
344  void zero() override {
345  mask_size_type e = get_num_mask_types();
346  for (mask_size_type i = 0; i < e; ++i)
347  mask[i] = (mask_t)0;
348  }
349  bool empty() const override {
350  mask_size_type e = get_num_mask_types();
351  for (mask_size_type i = 0; i < e; ++i)
352  if (mask[i] != (mask_t)0)
353  return false;
354  return true;
355  }
356  void copy(const KMPAffinity::Mask *src) override {
357  const Mask *convert = static_cast<const Mask *>(src);
358  mask_size_type e = get_num_mask_types();
359  for (mask_size_type i = 0; i < e; ++i)
360  mask[i] = convert->mask[i];
361  }
362  void bitwise_and(const KMPAffinity::Mask *rhs) override {
363  const Mask *convert = static_cast<const Mask *>(rhs);
364  mask_size_type e = get_num_mask_types();
365  for (mask_size_type i = 0; i < e; ++i)
366  mask[i] &= convert->mask[i];
367  }
368  void bitwise_or(const KMPAffinity::Mask *rhs) override {
369  const Mask *convert = static_cast<const Mask *>(rhs);
370  mask_size_type e = get_num_mask_types();
371  for (mask_size_type i = 0; i < e; ++i)
372  mask[i] |= convert->mask[i];
373  }
374  void bitwise_not() override {
375  mask_size_type e = get_num_mask_types();
376  for (mask_size_type i = 0; i < e; ++i)
377  mask[i] = ~(mask[i]);
378  }
379  bool is_equal(const KMPAffinity::Mask *rhs) const override {
380  const Mask *convert = static_cast<const Mask *>(rhs);
381  mask_size_type e = get_num_mask_types();
382  for (mask_size_type i = 0; i < e; ++i)
383  if (mask[i] != convert->mask[i])
384  return false;
385  return true;
386  }
387  int begin() const override {
388  int retval = 0;
389  while (retval < end() && !is_set(retval))
390  ++retval;
391  return retval;
392  }
393  int end() const override {
394  int e;
395  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
396  return e;
397  }
398  int next(int previous) const override {
399  int retval = previous + 1;
400  while (retval < end() && !is_set(retval))
401  ++retval;
402  return retval;
403  }
404  int get_system_affinity(bool abort_on_error) override {
405  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
406  "Illegal get affinity operation when not capable");
407 #if KMP_OS_LINUX
408  long retval =
409  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
410 #elif KMP_OS_FREEBSD
411  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
412  reinterpret_cast<cpuset_t *>(mask));
413  int retval = (r == 0 ? 0 : -1);
414 #endif
415  if (retval >= 0) {
416  return 0;
417  }
418  int error = errno;
419  if (abort_on_error) {
420  __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
421  KMP_ERR(error), __kmp_msg_null);
422  }
423  return error;
424  }
425  int set_system_affinity(bool abort_on_error) const override {
426  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
427  "Illegal set affinity operation when not capable");
428 #if KMP_OS_LINUX
429  long retval =
430  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
431 #elif KMP_OS_FREEBSD
432  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
433  reinterpret_cast<cpuset_t *>(mask));
434  int retval = (r == 0 ? 0 : -1);
435 #endif
436  if (retval >= 0) {
437  return 0;
438  }
439  int error = errno;
440  if (abort_on_error) {
441  __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
442  KMP_ERR(error), __kmp_msg_null);
443  }
444  return error;
445  }
446  };
447  void determine_capable(const char *env_var) override {
448  __kmp_affinity_determine_capable(env_var);
449  }
450  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
451  KMPAffinity::Mask *allocate_mask() override {
452  KMPNativeAffinity::Mask *retval = new Mask();
453  return retval;
454  }
455  void deallocate_mask(KMPAffinity::Mask *m) override {
456  KMPNativeAffinity::Mask *native_mask =
457  static_cast<KMPNativeAffinity::Mask *>(m);
458  delete native_mask;
459  }
460  KMPAffinity::Mask *allocate_mask_array(int num) override {
461  return new Mask[num];
462  }
463  void deallocate_mask_array(KMPAffinity::Mask *array) override {
464  Mask *linux_array = static_cast<Mask *>(array);
465  delete[] linux_array;
466  }
467  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
468  int index) override {
469  Mask *linux_array = static_cast<Mask *>(array);
470  return &(linux_array[index]);
471  }
472  api_type get_api_type() const override { return NATIVE_OS; }
473 };
474 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
475 
476 #if KMP_OS_WINDOWS
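// Native affinity implementation for Windows: one ULONG_PTR word per
// processor group, applied with SetThreadAffinityMask() or, on machines with
// multiple processor groups, SetThreadGroupAffinity().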
477 class KMPNativeAffinity : public KMPAffinity {
478  class Mask : public KMPAffinity::Mask {
479  typedef ULONG_PTR mask_t;
480  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
481  mask_t *mask;
482 
483  public:
484  Mask() {
485  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
486  }
487  ~Mask() {
488  if (mask)
489  __kmp_free(mask);
490  }
491  void set(int i) override {
492  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
493  }
494  bool is_set(int i) const override {
495  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
496  }
497  void clear(int i) override {
498  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
499  }
500  void zero() override {
501  for (int i = 0; i < __kmp_num_proc_groups; ++i)
502  mask[i] = 0;
503  }
504  bool empty() const override {
505  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
506  if (mask[i])
507  return false;
508  return true;
509  }
510  void copy(const KMPAffinity::Mask *src) override {
511  const Mask *convert = static_cast<const Mask *>(src);
512  for (int i = 0; i < __kmp_num_proc_groups; ++i)
513  mask[i] = convert->mask[i];
514  }
515  void bitwise_and(const KMPAffinity::Mask *rhs) override {
516  const Mask *convert = static_cast<const Mask *>(rhs);
517  for (int i = 0; i < __kmp_num_proc_groups; ++i)
518  mask[i] &= convert->mask[i];
519  }
520  void bitwise_or(const KMPAffinity::Mask *rhs) override {
521  const Mask *convert = static_cast<const Mask *>(rhs);
522  for (int i = 0; i < __kmp_num_proc_groups; ++i)
523  mask[i] |= convert->mask[i];
524  }
525  void bitwise_not() override {
526  for (int i = 0; i < __kmp_num_proc_groups; ++i)
527  mask[i] = ~(mask[i]);
528  }
529  bool is_equal(const KMPAffinity::Mask *rhs) const override {
530  const Mask *convert = static_cast<const Mask *>(rhs);
531  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
532  if (mask[i] != convert->mask[i])
533  return false;
534  return true;
535  }
536  int begin() const override {
537  int retval = 0;
538  while (retval < end() && !is_set(retval))
539  ++retval;
540  return retval;
541  }
542  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
543  int next(int previous) const override {
544  int retval = previous + 1;
545  while (retval < end() && !is_set(retval))
546  ++retval;
547  return retval;
548  }
549  int set_process_affinity(bool abort_on_error) const override {
550  if (__kmp_num_proc_groups <= 1) {
551  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
552  DWORD error = GetLastError();
553  if (abort_on_error) {
554  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
555  __kmp_msg_null);
556  }
557  return error;
558  }
559  }
560  return 0;
561  }
562  int set_system_affinity(bool abort_on_error) const override {
563  if (__kmp_num_proc_groups > 1) {
564  // Check for a valid mask.
565  GROUP_AFFINITY ga;
566  int group = get_proc_group();
567  if (group < 0) {
568  if (abort_on_error) {
569  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
570  }
571  return -1;
572  }
573  // Transform the bit vector into a GROUP_AFFINITY struct
574  // and make the system call to set affinity.
575  ga.Group = group;
576  ga.Mask = mask[group];
577  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
578 
579  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
580  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
581  DWORD error = GetLastError();
582  if (abort_on_error) {
583  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
584  __kmp_msg_null);
585  }
586  return error;
587  }
588  } else {
589  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
590  DWORD error = GetLastError();
591  if (abort_on_error) {
592  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
593  __kmp_msg_null);
594  }
595  return error;
596  }
597  }
598  return 0;
599  }
600  int get_system_affinity(bool abort_on_error) override {
601  if (__kmp_num_proc_groups > 1) {
602  this->zero();
603  GROUP_AFFINITY ga;
604  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
605  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
606  DWORD error = GetLastError();
607  if (abort_on_error) {
608  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
609  KMP_ERR(error), __kmp_msg_null);
610  }
611  return error;
612  }
613  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
614  (ga.Mask == 0)) {
615  return -1;
616  }
617  mask[ga.Group] = ga.Mask;
618  } else {
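// Single processor group: Windows has no "get thread affinity" call, so set
// the thread mask to the process mask, capture the previous thread mask from
// the return value, then restore it.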
619  mask_t newMask, sysMask, retval;
620  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
621  DWORD error = GetLastError();
622  if (abort_on_error) {
623  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
624  KMP_ERR(error), __kmp_msg_null);
625  }
626  return error;
627  }
628  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
629  if (!retval) {
630  DWORD error = GetLastError();
631  if (abort_on_error) {
632  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
633  KMP_ERR(error), __kmp_msg_null);
634  }
635  return error;
636  }
637  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
638  if (!newMask) {
639  DWORD error = GetLastError();
640  if (abort_on_error) {
641  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
642  KMP_ERR(error), __kmp_msg_null);
643  }
644  }
645  *mask = retval;
646  }
647  return 0;
648  }
649  int get_proc_group() const override {
650  int group = -1;
651  if (__kmp_num_proc_groups == 1) {
652  return 1;
653  }
654  for (int i = 0; i < __kmp_num_proc_groups; i++) {
655  if (mask[i] == 0)
656  continue;
657  if (group >= 0)
658  return -1;
659  group = i;
660  }
661  return group;
662  }
663  };
664  void determine_capable(const char *env_var) override {
665  __kmp_affinity_determine_capable(env_var);
666  }
667  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
668  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
669  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
670  KMPAffinity::Mask *allocate_mask_array(int num) override {
671  return new Mask[num];
672  }
673  void deallocate_mask_array(KMPAffinity::Mask *array) override {
674  Mask *windows_array = static_cast<Mask *>(array);
675  delete[] windows_array;
676  }
677  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
678  int index) override {
679  Mask *windows_array = static_cast<Mask *>(array);
680  return &(windows_array[index]);
681  }
682  api_type get_api_type() const override { return NATIVE_OS; }
683 };
684 #endif /* KMP_OS_WINDOWS */
685 #endif /* KMP_AFFINITY_SUPPORTED */
686 
687 // Describe an attribute for a level in the machine topology
688 struct kmp_hw_attr_t {
689  int core_type : 8;
690  int core_eff : 8;
691  unsigned valid : 1;
692  unsigned reserved : 15;
693 
694  static const int UNKNOWN_CORE_EFF = -1;
695 
696  kmp_hw_attr_t()
697  : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
698  valid(0), reserved(0) {}
699  void set_core_type(kmp_hw_core_type_t type) {
700  valid = 1;
701  core_type = type;
702  }
703  void set_core_eff(int eff) {
704  valid = 1;
705  core_eff = eff;
706  }
707  kmp_hw_core_type_t get_core_type() const {
708  return (kmp_hw_core_type_t)core_type;
709  }
710  int get_core_eff() const { return core_eff; }
711  bool is_core_type_valid() const {
712  return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
713  }
714  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
715  operator bool() const { return valid; }
716  void clear() {
717  core_type = KMP_HW_CORE_TYPE_UNKNOWN;
718  core_eff = UNKNOWN_CORE_EFF;
719  valid = 0;
720  }
721  bool contains(const kmp_hw_attr_t &other) const {
722  if (!valid && !other.valid)
723  return true;
724  if (valid && other.valid) {
725  if (other.is_core_type_valid()) {
726  if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
727  return false;
728  }
729  if (other.is_core_eff_valid()) {
730  if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
731  return false;
732  }
733  return true;
734  }
735  return false;
736  }
737 #if KMP_AFFINITY_SUPPORTED
738  bool contains(const kmp_affinity_attrs_t &attr) const {
739  if (!valid && !attr.valid)
740  return true;
741  if (valid && attr.valid) {
742  if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
743  return (is_core_type_valid() &&
744  (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
745  if (attr.core_eff != UNKNOWN_CORE_EFF)
746  return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
747  return true;
748  }
749  return false;
750  }
751 #endif // KMP_AFFINITY_SUPPORTED
752  bool operator==(const kmp_hw_attr_t &rhs) const {
753  return (rhs.valid == valid && rhs.core_eff == core_eff &&
754  rhs.core_type == core_type);
755  }
756  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
757 };
758 
759 #if KMP_AFFINITY_SUPPORTED
760 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
761 #endif
762 
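// One hardware thread (OS proc): its OS id plus an id and sub-id at every
// topology level, and any core attributes (core type / efficiency).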
763 class kmp_hw_thread_t {
764 public:
765  static const int UNKNOWN_ID = -1;
766  static const int MULTIPLE_ID = -2;
767  static int compare_ids(const void *a, const void *b);
768  static int compare_compact(const void *a, const void *b);
769  int ids[KMP_HW_LAST];
770  int sub_ids[KMP_HW_LAST];
771  bool leader;
772  int os_id;
773  kmp_hw_attr_t attrs;
774 
775  void print() const;
776  void clear() {
777  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
778  ids[i] = UNKNOWN_ID;
779  leader = false;
780  attrs.clear();
781  }
782 };
783 
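// The machine topology: an ordered list of hardware layers (e.g., package,
// core, thread), per-layer ratios and counts, and the flat array of hardware
// threads discovered on the machine.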
784 class kmp_topology_t {
785 
786  struct flags_t {
787  int uniform : 1;
788  int reserved : 31;
789  };
790 
791  int depth;
792 
793  // The following arrays are all 'depth' long. They are allocated to hold up
794  // to KMP_HW_LAST objects so that layers can be added without
795  // reallocating any array.
796 
797  // Ordered array of the types in the topology
798  kmp_hw_t *types;
799 
800  // Keep quick topology ratios; for non-uniform topologies,
801  // this ratio holds the max number of itemAs per itemB,
802  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
803  int *ratio;
804 
805  // Storage containing the absolute number of each topology layer
806  int *count;
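// e.g., for the topology above, ratio would be { 4, 6, 2 } and count
// { 4, 24, 48 } (absolute numbers of packages, cores, and threads).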
807 
808  // The number of core efficiencies. This is only useful for hybrid
809  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
810  int num_core_efficiencies;
811  int num_core_types;
812  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
813 
814  // The hardware threads array
815  // hw_threads is num_hw_threads long
816  // Each hw_thread's ids and sub_ids are depth deep
817  int num_hw_threads;
818  kmp_hw_thread_t *hw_threads;
819 
820  // Equivalence hash where the key is the hardware topology item
821  // and the value is the equivalent hardware topology type in the
822  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
823  // known equivalence for the topology type
824  kmp_hw_t equivalent[KMP_HW_LAST];
825 
826  // Flags describing the topology
827  flags_t flags;
828 
829  // Compact value used during sort_compact()
830  int compact;
831 
832  // Insert a new topology layer after allocation
833  void _insert_layer(kmp_hw_t type, const int *ids);
834 
835 #if KMP_GROUP_AFFINITY
836  // Insert topology information about Windows Processor groups
837  void _insert_windows_proc_groups();
838 #endif
839 
840  // Count each item & get the num x's per y
841  // e.g., get the number of cores and the number of threads per core
842  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
843  void _gather_enumeration_information();
844 
845  // Remove layers that don't add information to the topology.
846  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
847  void _remove_radix1_layers();
848 
849  // Find out if the topology is uniform
850  void _discover_uniformity();
851 
852  // Set all the sub_ids for each hardware thread
853  void _set_sub_ids();
854 
855  // Set global affinity variables describing the number of threads per
856  // core, the number of packages, the number of cores per package, and
857  // the number of cores.
858  void _set_globals();
859 
860  // Set the last level cache equivalent type
861  void _set_last_level_cache();
862 
863  // Return the number of cores with a particular attribute, 'attr'.
864  // If 'find_all' is true, then find all cores on the machine, otherwise find
865  // all cores per the layer 'above'
866  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
867  bool find_all = false) const;
868 
869 public:
870  // Force use of allocate()/deallocate()
871  kmp_topology_t() = delete;
872  kmp_topology_t(const kmp_topology_t &t) = delete;
873  kmp_topology_t(kmp_topology_t &&t) = delete;
874  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
875  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
876 
877  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
878  static void deallocate(kmp_topology_t *);
879 
880  // Functions used in create_map() routines
881  kmp_hw_thread_t &at(int index) {
882  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
883  return hw_threads[index];
884  }
885  const kmp_hw_thread_t &at(int index) const {
886  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
887  return hw_threads[index];
888  }
889  int get_num_hw_threads() const { return num_hw_threads; }
890  void sort_ids() {
891  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
892  kmp_hw_thread_t::compare_ids);
893  }
894  // Check if the hardware ids are unique. Return true if they are,
895  // false otherwise.
896  bool check_ids() const;
897 
898  // Function to call after the create_map() routine
899  void canonicalize();
900  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
901 
902 // Functions used after canonicalize() is called
903 
904 #if KMP_AFFINITY_SUPPORTED
905  // Set the granularity for affinity settings
906  void set_granularity(kmp_affinity_t &stgs) const;
907  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
908  bool restrict_to_mask(const kmp_affin_mask_t *mask);
909  bool filter_hw_subset();
910 #endif
911  bool is_uniform() const { return flags.uniform; }
912  // Return the equivalent type for 'type' within the topology;
913  // returns KMP_HW_UNKNOWN when there is no equivalent type
914  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
915  if (type == KMP_HW_UNKNOWN)
916  return KMP_HW_UNKNOWN;
917  return equivalent[type];
918  }
919  // Set type1 = type2
920  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
921  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
922  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
923  kmp_hw_t real_type2 = equivalent[type2];
924  if (real_type2 == KMP_HW_UNKNOWN)
925  real_type2 = type2;
926  equivalent[type1] = real_type2;
927  // This loop is required since any of the types may have been set to
928  // be equivalent to type1. They all must be checked and reset to type2.
929  KMP_FOREACH_HW_TYPE(type) {
930  if (equivalent[type] == type1) {
931  equivalent[type] = real_type2;
932  }
933  }
934  }
935  // Calculate number of types corresponding to level1
936  // per types corresponding to level2 (e.g., number of threads per core)
937  int calculate_ratio(int level1, int level2) const {
938  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
939  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
940  int r = 1;
941  for (int level = level1; level > level2; --level)
942  r *= ratio[level];
943  return r;
944  }
945  int get_ratio(int level) const {
946  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
947  return ratio[level];
948  }
949  int get_depth() const { return depth; }
950  kmp_hw_t get_type(int level) const {
951  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
952  return types[level];
953  }
954  int get_level(kmp_hw_t type) const {
955  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
956  int eq_type = equivalent[type];
957  if (eq_type == KMP_HW_UNKNOWN)
958  return -1;
959  for (int i = 0; i < depth; ++i)
960  if (types[i] == eq_type)
961  return i;
962  return -1;
963  }
964  int get_count(int level) const {
965  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
966  return count[level];
967  }
968  // Return the total number of cores with attribute 'attr'
969  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
970  return _get_ncores_with_attr(attr, -1, true);
971  }
972  // Return the number of cores with attribute
973  // 'attr' per topology level 'above'
974  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
975  return _get_ncores_with_attr(attr, above, false);
976  }
977 
978 #if KMP_AFFINITY_SUPPORTED
979  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
980  void sort_compact(kmp_affinity_t &affinity) {
981  compact = affinity.compact;
982  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
983  kmp_hw_thread_t::compare_compact);
984  }
985 #endif
986  void print(const char *env_var = "KMP_AFFINITY") const;
987  void dump() const;
988 };
989 extern kmp_topology_t *__kmp_topology;
990 
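// Internal representation of the KMP_HW_SUBSET environment variable: an
// ordered list of (num, type, offset, attr) items, e.g. KMP_HW_SUBSET=2s,4c,2t
// yields one item each for socket, core, and thread.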
991 class kmp_hw_subset_t {
992  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
993 
994 public:
995  // Describe a machine topology item in KMP_HW_SUBSET
996  struct item_t {
997  kmp_hw_t type;
998  int num_attrs;
999  int num[MAX_ATTRS];
1000  int offset[MAX_ATTRS];
1001  kmp_hw_attr_t attr[MAX_ATTRS];
1002  };
1003  // Put parentheses around max to avoid accidental use of the Windows max macro.
1004  const static int USE_ALL = (std::numeric_limits<int>::max)();
1005 
1006 private:
1007  int depth;
1008  int capacity;
1009  item_t *items;
1010  kmp_uint64 set;
1011  bool absolute;
1012  // The set must be able to handle up to KMP_HW_LAST number of layers
1013  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1014  // Comparator for sorting the KMP_HW_SUBSET items into topology order.
1015  // All unknown topology types will be at the beginning of the subset.
1016  static int hw_subset_compare(const void *i1, const void *i2) {
1017  kmp_hw_t type1 = ((const item_t *)i1)->type;
1018  kmp_hw_t type2 = ((const item_t *)i2)->type;
1019  int level1 = __kmp_topology->get_level(type1);
1020  int level2 = __kmp_topology->get_level(type2);
1021  return level1 - level2;
1022  }
1023 
1024 public:
1025  // Force use of allocate()/deallocate()
1026  kmp_hw_subset_t() = delete;
1027  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1028  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1029  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1030  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1031 
1032  static kmp_hw_subset_t *allocate() {
1033  int initial_capacity = 5;
1034  kmp_hw_subset_t *retval =
1035  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1036  retval->depth = 0;
1037  retval->capacity = initial_capacity;
1038  retval->set = 0ull;
1039  retval->absolute = false;
1040  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1041  return retval;
1042  }
1043  static void deallocate(kmp_hw_subset_t *subset) {
1044  __kmp_free(subset->items);
1045  __kmp_free(subset);
1046  }
1047  void set_absolute() { absolute = true; }
1048  bool is_absolute() const { return absolute; }
1049  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1050  for (int i = 0; i < depth; ++i) {
1051  // Found an existing item for this layer type
1052  // Add the num, offset, and attr to this item
1053  if (items[i].type == type) {
1054  int idx = items[i].num_attrs++;
1055  if ((size_t)idx >= MAX_ATTRS)
1056  return;
1057  items[i].num[idx] = num;
1058  items[i].offset[idx] = offset;
1059  items[i].attr[idx] = attr;
1060  return;
1061  }
1062  }
1063  if (depth == capacity - 1) {
1064  capacity *= 2;
1065  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1066  for (int i = 0; i < depth; ++i)
1067  new_items[i] = items[i];
1068  __kmp_free(items);
1069  items = new_items;
1070  }
1071  items[depth].num_attrs = 1;
1072  items[depth].type = type;
1073  items[depth].num[0] = num;
1074  items[depth].offset[0] = offset;
1075  items[depth].attr[0] = attr;
1076  depth++;
1077  set |= (1ull << type);
1078  }
1079  int get_depth() const { return depth; }
1080  const item_t &at(int index) const {
1081  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1082  return items[index];
1083  }
1084  item_t &at(int index) {
1085  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1086  return items[index];
1087  }
1088  void remove(int index) {
1089  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1090  set &= ~(1ull << items[index].type);
1091  for (int j = index + 1; j < depth; ++j) {
1092  items[j - 1] = items[j];
1093  }
1094  depth--;
1095  }
1096  void sort() {
1097  KMP_DEBUG_ASSERT(__kmp_topology);
1098  qsort(items, depth, sizeof(item_t), hw_subset_compare);
1099  }
1100  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1101  void dump() const {
1102  printf("**********************\n");
1103  printf("*** kmp_hw_subset: ***\n");
1104  printf("* depth: %d\n", depth);
1105  printf("* items:\n");
1106  for (int i = 0; i < depth; ++i) {
1107  printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1108  for (int j = 0; j < items[i].num_attrs; ++j) {
1109  printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1110  items[i].offset[j]);
1111  if (!items[i].attr[j]) {
1112  printf(" (none)\n");
1113  } else {
1114  printf(
1115  " core_type = %s, core_eff = %d\n",
1116  __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1117  items[i].attr[j].get_core_eff());
1118  }
1119  }
1120  }
1121  printf("* set: 0x%llx\n", set);
1122  printf("* absolute: %d\n", absolute);
1123  printf("**********************\n");
1124  }
1125 };
1126 extern kmp_hw_subset_t *__kmp_hw_subset;
1127 
1128 /* A structure for holding machine-specific hierarchy info to be computed once
1129  at init. This structure represents a mapping of threads to the actual machine
1130  hierarchy, or to our best guess at what the hierarchy might be, for the
1131  purpose of performing an efficient barrier. In the worst case, when there is
1132  no machine hierarchy information, it produces a tree suitable for a barrier,
1133  similar to the tree used in the hyper barrier. */
1134 class hierarchy_info {
1135 public:
1136  /* Good default values for number of leaves and branching factor, given no
1137  affinity information. Behaves a bit like hyper barrier. */
1138  static const kmp_uint32 maxLeaves = 4;
1139  static const kmp_uint32 minBranch = 4;
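// maxLevels is the allocated capacity of the per-level arrays; levels beyond
// the machine hierarchy are used to absorb oversubscription, doubling the
// thread capacity of the tree with each extra level.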
1145  kmp_uint32 maxLevels;
1146 
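// Number of hierarchy levels currently in use (at most maxLevels) and the
// thread count the hierarchy was last built or resized for.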
1151  kmp_uint32 depth;
1152  kmp_uint32 base_num_threads;
1153  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1154  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1155  // 2=initialization in progress
1156  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1157 
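// numPerLevel[i] is the branching factor at level i (level 0 = leaves);
// skipPerLevel[i] is the number of leaves spanned by a subtree rooted at
// level i. Both live in one allocation of 2 * maxLevels words.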
1162  kmp_uint32 *numPerLevel;
1163  kmp_uint32 *skipPerLevel;
1164 
1165  void deriveLevels() {
1166  int hier_depth = __kmp_topology->get_depth();
1167  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1168  numPerLevel[level] = __kmp_topology->get_ratio(i);
1169  }
1170  }
1171 
1172  hierarchy_info()
1173  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1174 
1175  void fini() {
1176  if (!uninitialized && numPerLevel) {
1177  __kmp_free(numPerLevel);
1178  numPerLevel = NULL;
1179  uninitialized = not_initialized;
1180  }
1181  }
1182 
1183  void init(int num_addrs) {
1184  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1185  &uninitialized, not_initialized, initializing);
1186  if (bool_result == 0) { // Wait for initialization
1187  while (TCR_1(uninitialized) != initialized)
1188  KMP_CPU_PAUSE();
1189  return;
1190  }
1191  KMP_DEBUG_ASSERT(bool_result == 1);
1192 
1193  /* Explicitly initialize the data fields here to prevent use of dirty
1194  values observed when a static library is re-initialized multiple times
1195  (e.g. when a non-OpenMP thread repeatedly launches/joins a thread that
1196  uses OpenMP). */
1197  depth = 1;
1198  resizing = 0;
1199  maxLevels = 7;
1200  numPerLevel =
1201  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1202  skipPerLevel = &(numPerLevel[maxLevels]);
1203  for (kmp_uint32 i = 0; i < maxLevels;
1204  ++i) { // init numPerLevel[*] to 1 item per level
1205  numPerLevel[i] = 1;
1206  skipPerLevel[i] = 1;
1207  }
1208 
1209  // Derive levels from the detected machine topology if available;
1210  // otherwise fall back to a default guess based on num_addrs.
1210  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1211  deriveLevels();
1212  } else {
1213  numPerLevel[0] = maxLeaves;
1214  numPerLevel[1] = num_addrs / maxLeaves;
1215  if (num_addrs % maxLeaves)
1216  numPerLevel[1]++;
1217  }
1218 
1219  base_num_threads = num_addrs;
1220  for (int i = maxLevels - 1; i >= 0;
1221  --i) // count non-empty levels to get depth
1222  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1223  depth++;
1224 
1225  kmp_uint32 branch = minBranch;
1226  if (numPerLevel[0] == 1)
1227  branch = num_addrs / maxLeaves;
1228  if (branch < minBranch)
1229  branch = minBranch;
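// Rebalance the tree: while a level is wider than the branching factor
// (or level 0 is wider than maxLeaves), round it up to even, halve it, and
// double the level above it.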
1230  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1231  while (numPerLevel[d] > branch ||
1232  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1233  if (numPerLevel[d] & 1)
1234  numPerLevel[d]++;
1235  numPerLevel[d] = numPerLevel[d] >> 1;
1236  if (numPerLevel[d + 1] == 1)
1237  depth++;
1238  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1239  }
1240  if (numPerLevel[0] == 1) {
1241  branch = branch >> 1;
1242  if (branch < 4)
1243  branch = minBranch;
1244  }
1245  }
1246 
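// skipPerLevel[i] becomes the cumulative product of the level widths below
// level i, i.e. the number of leaves spanned by a subtree rooted at level i.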
1247  for (kmp_uint32 i = 1; i < depth; ++i)
1248  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1249  // Fill in hierarchy in the case of oversubscription
1250  for (kmp_uint32 i = depth; i < maxLevels; ++i)
1251  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1252 
1253  uninitialized = initialized; // One writer
1254  }
1255 
1256  // Resize the hierarchy if nproc changes to something larger than before
1257  void resize(kmp_uint32 nproc) {
1258  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1259  while (bool_result == 0) { // someone else is trying to resize
1260  KMP_CPU_PAUSE();
1261  if (nproc <= base_num_threads) // happy with other thread's resize
1262  return;
1263  else // try to resize
1264  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1265  }
1266  KMP_DEBUG_ASSERT(bool_result != 0);
1267  if (nproc <= base_num_threads)
1268  return; // happy with other thread's resize
1269 
1270  // Calculate new maxLevels
1271  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1272  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1273  // First see if old maxLevels is enough to contain new size
1274  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1275  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1276  numPerLevel[i - 1] *= 2;
1277  old_sz *= 2;
1278  depth++;
1279  }
1280  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1281  while (nproc > old_sz) {
1282  old_sz *= 2;
1283  incs++;
1284  depth++;
1285  }
1286  maxLevels += incs;
1287 
1288  // Resize arrays
1289  kmp_uint32 *old_numPerLevel = numPerLevel;
1290  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1291  numPerLevel = skipPerLevel = NULL;
1292  numPerLevel =
1293  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1294  skipPerLevel = &(numPerLevel[maxLevels]);
1295 
1296  // Copy old elements from old arrays
1297  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1298  // copy the per-level values from the old arrays
1299  numPerLevel[i] = old_numPerLevel[i];
1300  skipPerLevel[i] = old_skipPerLevel[i];
1301  }
1302 
1303  // Init new elements in arrays to 1
1304  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1305  // init numPerLevel[*] to 1 item per level
1306  numPerLevel[i] = 1;
1307  skipPerLevel[i] = 1;
1308  }
1309 
1310  // Free old arrays
1311  __kmp_free(old_numPerLevel);
1312  }
1313 
1314  // Fill in oversubscription levels of hierarchy
1315  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1316  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1317 
1318  base_num_threads = nproc;
1319  resizing = 0; // One writer
1320  }
1321 };
1322 #endif // KMP_AFFINITY_H