/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// Avoid inadvertently using a library-based abs:
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

//----------Loop canonicalization---------------------------------------------

// For a loop nest of any shape:
// convert != to < or >;
// switch from using < or > to <= or >=.
// The "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed):
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
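
// Worked example (illustrative only, not part of the runtime): a source loop
//   for (int i = 0; i != 10; i += 2)
// arrives here as { lb0 = 0, ub0 = 10, step = 2, comparison = comp_not_eq }.
// The positive step turns != into <, and the < branch then decrements the
// upper bound, leaving { ub0 = 9, comparison = comp_less_or_eq }, i.e. the
// canonical form i <= 9.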

// Canonicalize the loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate the trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV,
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use the span for this, it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway:
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway:
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
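
// Worked example (illustrative only): continuing the canonicalized loop above
// ({ lb0 = 0, ub0 = 9, step = 2, comp_less_or_eq }), this computes
//   trip_count = (9 - 0) / |2| + 1 = 5,
// matching the executed iterations i = 0, 2, 4, 6, 8.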

// Calculate the trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim the original iv according to its type.
// Returns a kmp_uint64 value which can be easily used in all internal
// calculations, and which can be statically cast back to the original type
// in user code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
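
// Example (illustrative only): for loop_type_int8, an iv value of -1 computed
// in 64-bit arithmetic may carry garbage in the upper bits. Casting down to
// kmp_int8 keeps only the low 8 bits, and casting back up sign-extends, so
// the returned kmp_uint64 is 0xFFFFFFFFFFFFFFFF, which static_casts back to
// the original kmp_int8 value -1 in user code.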

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into the upper bounds on this level,
// false otherwise.
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of the loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to the iteration on the level ind.
// Return true if it fits into the lower-upper bounds on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we should
    // start with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim the original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}
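
// Worked example (illustrative only): for an inner loop
//   for (j = i; j <= N; ++j)
// the lower bound is described by lb0 = 0 and lb1 = 1 (the coefficient on the
// outer iv), with step = 1. With outer_iv = 3 and iterations[ind] = 2, the
// computation above yields temp = 0 + 1 * 3 + 2 * 1 = 5, i.e. j = 5, which is
// then checked against ub0 + ub1 * outer_iv.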

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to the iteration on the level ind.
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim the original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize the loop nest and calculate the overall trip count.
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=; no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated).
// Returns the trip count of the overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}
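
// Usage sketch (an assumed caller pattern, not code from this library): for a
// rectangular collapse(2) nest such as
//   for (int i = 0; i < 4; ++i)
//     for (int j = 0; j < 5; ++j) { ... }
// the compiler would fill a bounds_info_t[2] array (outer loop first) and
// call this entry point once; canonicalization gives per-level trip counts of
// 4 and 5, and the returned total is 20.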

//----------Calculate old induction variables---------------------------------

// Calculate the old induction variables corresponding to the overall new_iv.
// Note: the original IV will be returned as if it had kmp_uint64 type,
// and will have to be converted to the original type in user code.
// Note: trip counts should already have been calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special case 2, 3 nested loops: either do a different
// interface without an array, or possibly template this over n.
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // Should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
  __kmp_free(iterations);
}
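
// Worked example (illustrative only): with trip counts {4, 5} as in the
// sketch above and new_iv = 13, the backwards loop gives
//   inner level: iteration = 13 % 5 = 3, carry = 13 / 5 = 2
//   outer level: iteration = 2 % 4 = 2,  carry = 2 / 4 = 0
// i.e. the standard mixed-radix decomposition (i, j) = (2, 3) of the
// collapsed iteration number.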

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate the span for the IV on this loop level for the "<=" case.
// Note: this is for <= on this loop nest level, so the lower bound should be
// the smallest value and the upper bound should be the biggest value. If the
// loop won't execute, 'smallest' may be bigger than 'biggest', but we'd
// better not switch them around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones, so it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with the loop variable type on
    // the 'previous' loop.

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to the step, because
      // the lower bound might be off after adjustments.

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate the span for the IV on this loop level for the ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones, so it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with the loop variable type on
    // the 'previous' loop.

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to the step, because
      // the lower bound might be off after adjustments.

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check if the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate the maximum possible span for the IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}
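
// Worked example (illustrative only): a parallelogram inner loop
//   for (j = i; j <= i + 5; ++j)
// under an outer iv spanning [0, 7] has lb0 = 0, lb1 = 1, ub0 = 5, ub1 = 1.
// For the "<=" case the smallest possible value is
// min(0 + 1 * 0, 0 + 1 * 7) = 0 and the biggest is
// max(5 + 1 * 0, 5 + 1 * 7) = 12, so this iv can only ever take values in
// [0, 12], irrespective of the outer iv.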

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped
// shape. We need to stay in the original range of values, so that there will
// be no overflow; for that we'll adjust both upper and lower bounds as
// needed.
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // Get the upper and lower bounds to be parallel,
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now we need to adjust lb0, ub0, otherwise in some cases the space will
    // shrink. The idea here is that for this IV we are now getting the same
    // span irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good on the outer_iv's biggest number,
        // so we can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // We still need to move the lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good on the outer_iv's biggest number,
        // so we can use it to find where to move the upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // We still need to move the upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}
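
// Worked example (illustrative only): a triangular inner loop
//   for (j = 0; j <= i; ++j)   // under: for (i = 0; i <= 7; ++i)
// arrives with lb1 = 0, ub1 = 1. The signs of lb1 and ub1 differ (0 vs. +),
// so both are zeroed, and since old_ub1 > ub1 the upper bound moves out by
// (1 - 0) * span_biggest(i) = 7, giving the rectangular cover j in [0, 7].
// The expanded nest has 8 * 8 = 64 points while the original triangle has
// only 36; the chunk-end logic further below is what pulls points back into
// the original space.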

// Do all processing for one canonicalized loop in the nest
// (assuming that the outer loops have already been processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process the loop nest to have a parallelepiped shape, calculate the
// biggest spans for the IVs on all levels, and calculate the overall trip
// count. "bounds_nest" has to be allocated per thread.
// Returns the overall trip count (for the adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative),
// not counting the initial iteration.
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations =
        (bounds->lb0 +
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
         static_cast<T>(original_ivs[ind])) /
        __kmp_abs(bounds->step);
  }

  return iterations;
}

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative).
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take the updated bounds and calculate which new_iv corresponds to this
// point. When we are getting the original IVs from new_iv, we have to adjust
// to fit into the original loop bounds. Getting the new_iv for the adjusted
// original IVs will help with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}
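
// Worked example (illustrative only): this is the inverse of the mixed-radix
// decomposition used in __kmpc_calc_original_ivs_rectang, generalized to the
// updated (parallelepiped) space. With trip counts {4, 5} and per-level
// iteration numbers (2, 3), the collapsed index comes back as
//   new_iv = (0 * 4 + 2) * 5 + 3 = 13.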

//----------Calculate original ivs for provided iterations--------------------

// Calculate the original IVs for the provided iterations, assuming the
// iterations are calculated in the original space.
// The loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // The space is empty:
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}
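
// Example trace (illustrative only): this walks the nest like an odometer
// with bounds checking. Suppose on level ind the computed iv overshoots its
// upper bound (possible in a non-rectangular nest). Then iterations[ind - 1]
// is bumped by one, every inner count is reset to 0, and the scan resumes
// from level ind - 1 with its lower bound re-applied; only when such a carry
// falls off level 0 is the whole iteration space known to be exhausted.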

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate the IVs for the beginning of the loop nest.
// Note: the lower bounds of all loops may not work, if on some iterations of
// the outer loops the inner loops are empty.
// The loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by the step:
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  __kmp_free(iterations);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From the current set of original IVs, calculate the next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by the step (so they can be
  // negative):
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next, add one step to the iterations on the innermost level, and see if
  // we need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  __kmp_free(iterations);
  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level, calculate the old induction variable corresponding to the
// overall new_iv for the chunk end.
// Return true if it fits into the upper bound on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is T good enough, or do we need ST or big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we may as
    // well use the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in the expanded space, but:
    // - we need to hit the original space lower bound, so we need to account
    //   for that;
    // - we have to go into the original space, even if that means adding more
    //   iterations than were planned;
    // - we have to go past (or equal to) the previous point (which is the
    //   chunk starting point).

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In the case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big): we didn't reach the original lower bound.
      // Use a heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels the start of the chunk is the same as the
      // end, so we need to be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // The end of the chunk can't be smaller (for >=, bigger) than its
        // start. Use a heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level, calculate the old induction variable corresponding to the
// overall new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate the old induction variables corresponding to the overall new_iv
// for the chunk end. If, due to the space extension, we are getting old IVs
// outside of the boundaries, bring them into the boundaries. We need to do
// this in the runtime, especially on the lower-bounds side. When getting the
// result we need to make sure that the new chunk starts at the next position
// after the old chunk, not overlapping with it (this is done elsewhere), and
// we need to make sure the end of the chunk is further than the beginning of
// the chunk. We don't need an exact ending point here, just something
// more-or-less close to the desired chunk length; bigger is fine (smaller
// would be fine too, but we risk going into an infinite loop, so we go
// smaller only at the very end of the space).
// Result: false if we could not find the ending point in the original loop
// space. In this case the caller can use the original upper bounds as the
// end of the chunk. The chunk won't be empty, because it will have at least
// the starting point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // Should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next, calculate the point, but in the original loop nest:
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end:
        __kmp_free(iterations);
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // so it can't be the same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  __kmp_free(iterations);
  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate the upper bounds for the last loop iteration. Just use the
// original upper bounds (adjusted when canonicalized to use <= / >=). There
// is no need to check that this point is in the original space (it likely
// isn't).
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=; no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated). Internally the
// API will expand the space to a parallelogram/parallelepiped, calculate the
// total, calculate the bounds for the chunks in terms of the new IV,
// re-calculate them in terms of the old IVs (especially important on the
// left side, to hit the lower bounds and not step over), and pick the
// correct chunk for this thread (so it will calculate chunks up to the
// needed one). It could be optimized to calculate just this chunk, but then
// the chunks would potentially be a bit less well distributed among threads.
// It is designed to make sure that threads will receive predictable chunks,
// deterministically (so that the next nest of loops with similar
// characteristics will get exactly the same chunks on the same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0;
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future.)
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  bounds_info_internal_t *updated_bounds_nest =
      (bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) *
                                               n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // The loop won't execute:
    __kmp_free(updated_bounds_nest);
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  kmp_point_t original_ivs_start =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
  kmp_point_t original_ivs_end =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
  kmp_point_t original_ivs_next_start =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // The loop won't execute:
    __kmp_free(updated_bounds_nest);
    __kmp_free(original_ivs_start);
    __kmp_free(original_ivs_end);
    __kmp_free(original_ivs_next_start);
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test;
  // (2) without it we keep the current contract that chunk_bounds_nest has
  //     only lb0 and ub0, and lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // One thread:
  //   // Copy all info from original_bounds_nest, it'll be good enough.
  //
  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }
  //
  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   __kmp_free(updated_bounds_nest);
  //   __kmp_free(original_ivs_start);
  //   __kmp_free(original_ivs_end);
  //   __kmp_free(original_ivs_next_start);
  //   return TRUE;
  // }

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this is to compensate that
    // the original space is not a parallelepiped and some threads can be
    // left without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end, just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point; use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done: this was the last chunk, but no chunk for the current
      // thread was found:
      __kmp_free(updated_bounds_nest);
      __kmp_free(original_ivs_start);
      __kmp_free(original_ivs_end);
      __kmp_free(original_ivs_next_start);
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread; now we need to check if it's the
      // last chunk or not:

      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n,
                                      original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // No more loop iterations left to process,
        // which means that the currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in the chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. (optional)
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      __kmp_free(updated_bounds_nest);
      __kmp_free(original_ivs_start);
      __kmp_free(original_ivs_end);
      __kmp_free(original_ivs_next_start);
      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // No more loop iterations to process;
      // the previous chunk was the last chunk:
      break;
    }

    // original_ivs_start is next to the previous chunk's original_ivs_end,
    // and we need to start the new chunk here, so that chunks come one after
    // another without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  __kmp_free(updated_bounds_nest);
  __kmp_free(original_ivs_start);
  __kmp_free(original_ivs_end);
  __kmp_free(original_ivs_next_start);
  return FALSE;
}
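
// Usage sketch (hypothetical compiler-generated code, for illustration only;
// the bounds_info_t field names are declared in kmp_collapse.h): for
// "#pragma omp for collapse(2)" over the triangle i in [0, 7], j in [0, i],
// each thread would execute roughly:
//
//   bounds_info_t nest[2];   // filled in: loop_type, lb0/lb1, ub0/ub1,
//                            // step, comparison, outer_iv for each level
//   bounds_info_t chunk[2];
//   kmp_int32 plastiter = 0;
//   if (__kmpc_for_collapsed_init(&loc, gtid, nest, chunk, 2, &plastiter)) {
//     // Run the original loop body over [chunk[k].lb0_u64, chunk[k].ub0_u64]
//     // for each level k; per the current contract, lb1/ub1 in chunk are 0.
//   }
//   // ... followed by the usual worksharing fini/barrier (not shown here).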