LLVM OpenMP* Runtime Library
kmp_stats.h
1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21  Accumulates number of samples and computes min, max, mean, standard deviation
22  on the fly.
23 
24  Online variance calculation algorithm from
25  http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37  than is useful for application characterisation and are intended for the
38  runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
/// Flags to describe the statistic (timer or counter).
/// The values are distinct bits so that several flags can be OR-ed together
/// in the flags argument of the KMP_FOREACH_* tables.
enum stats_flags_e {
  noTotal = 1 << 0,      ///< do not show a TOTAL_aggregation for this statistic
  onlyInMaster = 1 << 1, ///< statistic is valid only for primary thread
  noUnits = 1 << 2,      ///< statistic doesn't need units printed next to it
  notInMaster = 1 << 3,  ///< statistic is valid only for non-primary threads
  logEvent = 1 << 4      ///< statistic is eligible for event logging; tested by
                         ///< timeStat::logEvent() and cleared by
                         ///< timeStat::clearEventFlags()
};
57 
/// The states which a thread can be in.
/// Enumerators take consecutive values starting at 0 (IDLE).
enum stats_state_e {
  IDLE,
  SERIAL_REGION,
  FORK_JOIN_BARRIER,
  PLAIN_BARRIER,
  TASKWAIT,
  TASKYIELD,
  TASKGROUP,
  IMPLICIT_TASK,
  EXPLICIT_TASK,
  TEAMS_REGION
};
75 
// Table of all counters. Add new counters under this macro.
//
// Each row invokes `macro(name, flags, arg)`:
//   name  -- counter identifier (pasted onto a prefix by the users of the table)
//   flags -- 0 or an OR-combination of stats_flags_e values
//   arg   -- passed through unchanged from the caller of KMP_FOREACH_COUNTER
// clang-format off
#define KMP_FOREACH_COUNTER(macro, arg)                                        \
  macro(OMP_PARALLEL,                                                          \
        stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg)             \
  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
  macro(OMP_LOOP_STATIC, 0, arg)                                               \
  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
  macro(OMP_DISTRIBUTE, 0, arg)                                                \
  macro(OMP_BARRIER, 0, arg)                                                   \
  macro(OMP_CRITICAL, 0, arg)                                                  \
  macro(OMP_SINGLE, 0, arg)                                                    \
  macro(OMP_SECTIONS, 0, arg)                                                  \
  macro(OMP_MASTER, 0, arg)                                                    \
  macro(OMP_MASKED, 0, arg)                                                    \
  macro(OMP_TEAMS, 0, arg)                                                     \
  macro(OMP_set_lock, 0, arg)                                                  \
  macro(OMP_test_lock, 0, arg)                                                 \
  macro(REDUCE_wait, 0, arg)                                                   \
  macro(REDUCE_nowait, 0, arg)                                                 \
  macro(OMP_TASKYIELD, 0, arg)                                                 \
  macro(OMP_TASKLOOP, 0, arg)                                                  \
  macro(TASK_executed, 0, arg)                                                 \
  macro(TASK_cancelled, 0, arg)                                                \
  macro(TASK_stolen, 0, arg)
// clang-format on
119 
// Table of all timers. Each row invokes `macro(name, flags, arg)` where
// `flags` is 0 or an OR-combination of stats_flags_e values and `arg` is
// passed through unchanged. The rows flagged noUnits | noTotal are sampled
// values (argument counts, iteration counts, ...) rather than times.
// The developer-only timers are appended via KMP_FOREACH_DEVELOPER_TIMER.
// clang-format off
#define KMP_FOREACH_TIMER(macro, arg)                                          \
  macro(OMP_worker_thread_life, stats_flags_e::logEvent, arg)                  \
  macro(OMP_parallel, stats_flags_e::logEvent, arg)                            \
  macro(OMP_parallel_overhead, stats_flags_e::logEvent, arg)                   \
  macro(OMP_teams, stats_flags_e::logEvent, arg)                               \
  macro(OMP_teams_overhead, stats_flags_e::logEvent, arg)                      \
  macro(OMP_loop_static, 0, arg)                                               \
  macro(OMP_loop_static_scheduling, 0, arg)                                    \
  macro(OMP_loop_dynamic, 0, arg)                                              \
  macro(OMP_loop_dynamic_scheduling, 0, arg)                                   \
  macro(OMP_distribute, 0, arg)                                                \
  macro(OMP_distribute_scheduling, 0, arg)                                     \
  macro(OMP_critical, 0, arg)                                                  \
  macro(OMP_critical_wait, 0, arg)                                             \
  macro(OMP_single, 0, arg)                                                    \
  macro(OMP_sections, 0, arg)                                                  \
  macro(OMP_sections_overhead, 0, arg)                                         \
  macro(OMP_master, 0, arg)                                                    \
  macro(OMP_masked, 0, arg)                                                    \
  macro(OMP_task_immediate, 0, arg)                                            \
  macro(OMP_task_taskwait, 0, arg)                                             \
  macro(OMP_task_taskyield, 0, arg)                                            \
  macro(OMP_task_taskgroup, 0, arg)                                            \
  macro(OMP_task_join_bar, 0, arg)                                             \
  macro(OMP_task_plain_bar, 0, arg)                                            \
  macro(OMP_taskloop_scheduling, 0, arg)                                       \
  macro(OMP_plain_barrier, stats_flags_e::logEvent, arg)                       \
  macro(OMP_idle, stats_flags_e::logEvent, arg)                                \
  macro(OMP_fork_barrier, stats_flags_e::logEvent, arg)                        \
  macro(OMP_join_barrier, stats_flags_e::logEvent, arg)                        \
  macro(OMP_serial, stats_flags_e::logEvent, arg)                              \
  macro(OMP_set_numthreads,                                                    \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_PARALLEL_args,                                                     \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_static_iterations,                                            \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_static_total_iterations,                                      \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_dynamic_iterations,                                           \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_dynamic_total_iterations,                                     \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_distribute_iterations,                                             \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
// clang-format on
186 
187 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
188 // initializing OpenMP or being created by a primary
189 // thread) until the thread is destroyed
190 // OMP_parallel -- Time thread spends executing work directly
191 // within a #pragma omp parallel
192 // OMP_parallel_overhead -- Time thread spends setting up a parallel region
193 // OMP_loop_static -- Time thread spends executing loop iterations from
194 // a statically scheduled loop
195 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
196 // from a statically scheduled loop
197 // OMP_loop_dynamic -- Time thread spends executing loop iterations from
198 // a dynamically scheduled loop
199 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
200 // from a dynamically scheduled loop
201 // OMP_critical -- Time thread spends executing critical section
202 // OMP_critical_wait -- Time thread spends waiting to enter
203 // a critical section
204 // OMP_single -- Time spent executing a "single" region
205 // OMP_master -- Time spent executing a "master" region
206 // OMP_masked -- Time spent executing a "masked" region
207 // OMP_task_immediate -- Time spent executing non-deferred tasks
208 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
209 // construct
210 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
211 // construct
212 // OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup
213 // construct
214 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
215 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
216 // construct
217 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
218 // construct
219 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
220 // inside implicit barrier at end of worksharing
221 // construct
222 // OMP_idle -- Time worker threads spend waiting for next
223 // parallel region
224 // OMP_fork_barrier -- Time spent in the fork barrier surrounding a
225 // parallel region
226 // OMP_join_barrier -- Time spent in the join barrier surrounding a
227 // parallel region
228 // OMP_serial -- Time thread zero spends executing serial code
229 // OMP_set_numthreads -- Values passed to omp_set_num_threads
230 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
231 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
232 // statically scheduled loops
233 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
234 // dynamically scheduled loops
235 
#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
// These have to be explicitly enabled in addition to the other stats.
//
// KMP_end_split_barrier -- time in __kmp_end_split_barrier
// KMP_setup_icv_copy    -- time in __kmp_setup_icv_copy
// KMP_linear_gather     -- time in __kmp_linear_barrier_gather
// KMP_linear_release    -- time in __kmp_linear_barrier_release
// KMP_tree_gather       -- time in __kmp_tree_barrier_gather
// KMP_tree_release      -- time in __kmp_tree_barrier_release
// KMP_hyper_gather      -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release     -- time in __kmp_hyper_barrier_release
// KMP_dist_gather       -- time in __kmp_dist_barrier_gather
// KMP_dist_release      -- time in __kmp_dist_barrier_release
//
// NOTE(review): the remaining entries (KMP_fork_call, KMP_join_call,
// KMP_hier_*, USER_*, KMP_allocate_team, FOR_static_steal_*) carry no
// description here; document them once their measurement points are confirmed.
// clang-format off
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
  macro(KMP_fork_call, 0, arg)                                                 \
  macro(KMP_join_call, 0, arg)                                                 \
  macro(KMP_end_split_barrier, 0, arg)                                         \
  macro(KMP_hier_gather, 0, arg)                                               \
  macro(KMP_hier_release, 0, arg)                                              \
  macro(KMP_hyper_gather, 0, arg)                                              \
  macro(KMP_hyper_release, 0, arg)                                             \
  macro(KMP_dist_gather, 0, arg)                                               \
  macro(KMP_dist_release, 0, arg)                                              \
  macro(KMP_linear_gather, 0, arg)                                             \
  macro(KMP_linear_release, 0, arg)                                            \
  macro(KMP_tree_gather, 0, arg)                                               \
  macro(KMP_tree_release, 0, arg)                                              \
  macro(USER_resume, 0, arg)                                                   \
  macro(USER_suspend, 0, arg)                                                  \
  macro(USER_mwait, 0, arg)                                                    \
  macro(KMP_allocate_team, 0, arg)                                             \
  macro(KMP_setup_icv_copy, 0, arg)                                            \
  macro(USER_icv_copy, 0, arg)                                                 \
  macro(FOR_static_steal_stolen,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(FOR_static_steal_chunks,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
#else
// Developer statistics disabled: the table contributes no rows.
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
// clang-format on
283 
303 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
304 
305 #define ENUMERATE(name, ignore, prefix) prefix##name,
306 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
307 
308 enum explicit_timer_e {
309  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
310 };
311 
312 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
313 #undef ENUMERATE
314 
315 /*
316  * A logarithmic histogram. It accumulates the number of values in each power of
317  * ten bin. So 1<=x<10, 10<=x<100, ...
318  * Mostly useful where we have some big outliers and want to see information
319  * about them.
320  */
321 class logHistogram {
322  enum {
323  numBins = 31, /* Number of powers of 10. If this changes you need to change
324  * the initializer for binMax */
325 
326  /*
327  * If you want to use this to analyse values that may be less than 1, (for
328  * instance times in s), then the logOffset gives you negative powers.
329  * In our case here, we're just looking at times in ticks, or counts, so we
330  * can never see values with magnitude < 1 (other than zero), so we can set
331  * it to 0. As above change the initializer if you change this.
332  */
333  logOffset = 0
334  };
335  uint32_t KMP_ALIGN_CACHE zeroCount;
336  struct {
337  uint32_t count;
338  double total;
339  } bins[numBins];
340 
341  static double binMax[numBins];
342 
343 #ifdef KMP_DEBUG
344  uint64_t _total;
345 
346  void check() const {
347  uint64_t t = zeroCount;
348  for (int i = 0; i < numBins; i++)
349  t += bins[i].count;
350  KMP_DEBUG_ASSERT(t == _total);
351  }
352 #else
353  void check() const {}
354 #endif
355 
356 public:
357  logHistogram() { reset(); }
358 
359  logHistogram(logHistogram const &o) {
360  for (int i = 0; i < numBins; i++)
361  bins[i] = o.bins[i];
362 #ifdef KMP_DEBUG
363  _total = o._total;
364 #endif
365  }
366 
367  void reset() {
368  zeroCount = 0;
369  for (int i = 0; i < numBins; i++) {
370  bins[i].count = 0;
371  bins[i].total = 0;
372  }
373 
374 #ifdef KMP_DEBUG
375  _total = 0;
376 #endif
377  }
378  uint32_t count(int b) const { return bins[b + logOffset].count; }
379  double total(int b) const { return bins[b + logOffset].total; }
380  static uint32_t findBin(double sample);
381 
382  logHistogram &operator+=(logHistogram const &o) {
383  zeroCount += o.zeroCount;
384  for (int i = 0; i < numBins; i++) {
385  bins[i].count += o.bins[i].count;
386  bins[i].total += o.bins[i].total;
387  }
388 #ifdef KMP_DEBUG
389  _total += o._total;
390  check();
391 #endif
392 
393  return *this;
394  }
395 
396  void addSample(double sample);
397  int minBin() const;
398  int maxBin() const;
399 
400  std::string format(char) const;
401 };
402 
403 class statistic {
404  double KMP_ALIGN_CACHE minVal;
405  double maxVal;
406  double meanVal;
407  double m2;
408  uint64_t sampleCount;
409  double offset;
410  bool collectingHist;
411  logHistogram hist;
412 
413 public:
414  statistic(bool doHist = bool(KMP_STATS_HIST)) {
415  reset();
416  collectingHist = doHist;
417  }
418  statistic(statistic const &o)
419  : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
420  sampleCount(o.sampleCount), offset(o.offset),
421  collectingHist(o.collectingHist), hist(o.hist) {}
422  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
423  : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
424  sampleCount(sc), offset(0.0), collectingHist(false) {}
425  bool haveHist() const { return collectingHist; }
426  double getMin() const { return minVal; }
427  double getMean() const { return meanVal; }
428  double getMax() const { return maxVal; }
429  uint64_t getCount() const { return sampleCount; }
430  double getSD() const { return sqrt(m2 / sampleCount); }
431  double getTotal() const { return sampleCount * meanVal; }
432  logHistogram const *getHist() const { return &hist; }
433  void setOffset(double d) { offset = d; }
434 
435  void reset() {
436  minVal = (std::numeric_limits<double>::max)();
437  maxVal = -minVal;
438  meanVal = 0.0;
439  m2 = 0.0;
440  sampleCount = 0;
441  offset = 0.0;
442  hist.reset();
443  }
444  void addSample(double sample);
445  void scale(double factor);
446  void scaleDown(double f) { scale(1. / f); }
447  void forceCount(uint64_t count) { sampleCount = count; }
448  statistic &operator+=(statistic const &other);
449 
450  std::string format(char unit, bool total = false) const;
451  std::string formatHist(char unit) const { return hist.format(unit); }
452 };
453 
// Static description of one timer or counter.
struct statInfo {
  const char *name; // name of the statistic (returned by name() accessors)
  uint32_t flags;   // bit set tested against stats_flags_e values
};
458 
459 class timeStat : public statistic {
460  static statInfo timerInfo[];
461 
462 public:
463  timeStat() : statistic() {}
464  static const char *name(timer_e e) { return timerInfo[e].name; }
465  static bool noTotal(timer_e e) {
466  return timerInfo[e].flags & stats_flags_e::noTotal;
467  }
468  static bool masterOnly(timer_e e) {
469  return timerInfo[e].flags & stats_flags_e::onlyInMaster;
470  }
471  static bool workerOnly(timer_e e) {
472  return timerInfo[e].flags & stats_flags_e::notInMaster;
473  }
474  static bool noUnits(timer_e e) {
475  return timerInfo[e].flags & stats_flags_e::noUnits;
476  }
477  static bool logEvent(timer_e e) {
478  return timerInfo[e].flags & stats_flags_e::logEvent;
479  }
480  static void clearEventFlags() {
481  for (int i = 0; i < TIMER_LAST; i++) {
482  timerInfo[i].flags &= (~(stats_flags_e::logEvent));
483  }
484  }
485 };
486 
487 // Where we need to start and stop the timer explicitly, this version can be
488 // used. Because these timers normally aren't nicely scoped, they have no good
489 // place to live on the stack of the thread, so they're more work to use.
490 class explicitTimer {
491  timeStat *stat;
492  timer_e timerEnumValue;
493  tsc_tick_count startTime;
494  tsc_tick_count pauseStartTime;
495  tsc_tick_count::tsc_interval_t totalPauseTime;
496 
497 public:
498  explicitTimer(timeStat *s, timer_e te)
499  : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
500  totalPauseTime() {}
501 
502  // void setStat(timeStat *s) { stat = s; }
503  void start(tsc_tick_count tick);
504  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
505  void resume(tsc_tick_count tick) {
506  totalPauseTime += (tick - pauseStartTime);
507  }
508  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
509  void reset() {
510  startTime = 0;
511  pauseStartTime = 0;
512  totalPauseTime = 0;
513  }
514  timer_e get_type() const { return timerEnumValue; }
515 };
516 
517 // Where you need to partition a thread's clock ticks into separate states
518 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
519 // DOING_NOTHING would render these conditions:
520 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
521 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
522 // versa
523 class partitionedTimers {
524 private:
525  std::vector<explicitTimer> timer_stack;
526 
527 public:
528  partitionedTimers();
529  void init(explicitTimer timer);
530  void exchange(explicitTimer timer);
531  void push(explicitTimer timer);
532  void pop();
533  void windup();
534 };
535 
536 // Special wrapper around the partitioned timers to aid timing code blocks
537 // It avoids the need to have an explicit end, leaving the scope suffices.
538 class blockPartitionedTimer {
539  partitionedTimers *part_timers;
540 
541 public:
542  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
543  : part_timers(pt) {
544  part_timers->push(timer);
545  }
546  ~blockPartitionedTimer() { part_timers->pop(); }
547 };
548 
549 // Special wrapper around the thread state to aid in keeping state in code
550 // blocks It avoids the need to have an explicit end, leaving the scope
551 // suffices.
552 class blockThreadState {
553  stats_state_e *state_pointer;
554  stats_state_e old_state;
555 
556 public:
557  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
558  : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
559  *state_pointer = new_state;
560  }
561  ~blockThreadState() { *state_pointer = old_state; }
562 };
563 
564 // If all you want is a count, then you can use this...
565 // The individual per-thread counts will be aggregated into a statistic at
566 // program exit.
567 class counter {
568  uint64_t value;
569  static const statInfo counterInfo[];
570 
571 public:
572  counter() : value(0) {}
573  void increment() { value++; }
574  uint64_t getValue() const { return value; }
575  void reset() { value = 0; }
576  static const char *name(counter_e e) { return counterInfo[e].name; }
577  static bool masterOnly(counter_e e) {
578  return counterInfo[e].flags & stats_flags_e::onlyInMaster;
579  }
580 };
581 
582 /* ****************************************************************
583  Class to implement an event
584 
585  There are four components to an event: start time, stop time
586  nest_level, and timer_name.
587  The start and stop time should be obvious (recorded in clock ticks).
588  The nest_level relates to the bar width in the timeline graph.
589  The timer_name is used to determine which timer event triggered this event.
590 
591  the interface to this class is through four read-only operations:
592  1) getStart() -- returns the start time as 64 bit integer
593  2) getStop() -- returns the stop time as 64 bit integer
594  3) getNestLevel() -- returns the nest level of the event
595  4) getTimerName() -- returns the timer name that triggered event
596 
597  *MORE ON NEST_LEVEL*
598  The nest level is used in the bar graph that represents the timeline.
599  Its main purpose is to show how events are nested inside each other.
600  For example, say events, A, B, and C are recorded. If the timeline
601  looks like this:
602 
603 Begin -------------------------------------------------------------> Time
604  | | | | | |
605  A B C C B A
606  start start start end end end
607 
608  Then A, B, C will have a nest level of 1, 2, 3 respectively.
609  These values are then used to calculate the barwidth so you can
610  see that inside A, B has occurred, and inside B, C has occurred.
611  Currently, this is shown with A's bar width being larger than B's
612  bar width, and B's bar width being larger than C's bar width.
613 
614 **************************************************************** */
615 class kmp_stats_event {
616  uint64_t start;
617  uint64_t stop;
618  int nest_level;
619  timer_e timer_name;
620 
621 public:
622  kmp_stats_event()
623  : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
624  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
625  : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
626  inline uint64_t getStart() const { return start; }
627  inline uint64_t getStop() const { return stop; }
628  inline int getNestLevel() const { return nest_level; }
629  inline timer_e getTimerName() const { return timer_name; }
630 };
631 
632 /* ****************************************************************
633  Class to implement a dynamically expandable array of events
634 
635  ---------------------------------------------------------
636  | event 1 | event 2 | event 3 | event 4 | ... | event N |
637  ---------------------------------------------------------
638 
639  An event is pushed onto the back of this array at every
640  explicitTimer->stop() call. The event records the thread #,
641  start time, stop time, and nest level related to the bar width.
642 
643  The event vector starts at size INIT_SIZE and grows (doubles in size)
644  if needed. An implication of this behavior is that log(N)
645  reallocations are needed (where N is number of events). If you want
646  to avoid reallocations, then set INIT_SIZE to a large value.
647 
648  the interface to this class is through six operations:
649  1) reset() -- sets the internal_size back to 0 but does not deallocate any
650  memory
651  2) size() -- returns the number of valid elements in the vector
652  3) push_back(start, stop, nest, timer_name) -- pushes an event onto
653  the back of the array
654  4) deallocate() -- frees all memory associated with the vector
655  5) sort() -- sorts the vector by start time
656  6) operator[index] or at(index) -- returns event reference at that index
657 **************************************************************** */
658 class kmp_stats_event_vector {
659  kmp_stats_event *events;
660  int internal_size;
661  int allocated_size;
662  static const int INIT_SIZE = 1024;
663 
664 public:
665  kmp_stats_event_vector() {
666  events =
667  (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
668  internal_size = 0;
669  allocated_size = INIT_SIZE;
670  }
671  ~kmp_stats_event_vector() {}
672  inline void reset() { internal_size = 0; }
673  inline int size() const { return internal_size; }
674  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
675  timer_e name) {
676  int i;
677  if (internal_size == allocated_size) {
678  kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
679  sizeof(kmp_stats_event) * allocated_size * 2);
680  for (i = 0; i < internal_size; i++)
681  tmp[i] = events[i];
682  __kmp_free(events);
683  events = tmp;
684  allocated_size *= 2;
685  }
686  events[internal_size] =
687  kmp_stats_event(start_time, stop_time, nest_level, name);
688  internal_size++;
689  return;
690  }
691  void deallocate();
692  void sort();
693  const kmp_stats_event &operator[](int index) const { return events[index]; }
694  kmp_stats_event &operator[](int index) { return events[index]; }
695  const kmp_stats_event &at(int index) const { return events[index]; }
696  kmp_stats_event &at(int index) { return events[index]; }
697 };
698 
699 /* ****************************************************************
700  Class to implement a doubly-linked, circular, statistics list
701 
702  |---| ---> |---| ---> |---| ---> |---| ---> ... next
703  | | | | | | | |
704  |---| <--- |---| <--- |---| <--- |---| <--- ... prev
705  Sentinel first second third
706  Node node node node
707 
708  The Sentinel Node is the user handle on the list.
709  The first node corresponds to thread 0's statistics.
710  The second node corresponds to thread 1's statistics and so on...
711 
712  Each node has a _timers, _counters, and _explicitTimers array to hold that
713  thread's statistics. The _explicitTimers point to the correct _timer and
714  update its statistics at every stop() call. The explicitTimers' pointers are
715  set up in the constructor. Each node also has an event vector to hold that
716  thread's timing events. The event vector expands as necessary and records
717  the start-stop times for each timer.
718 
719  The nestLevel variable is for plotting events and is related
720  to the bar width in the timeline graph.
721 
722  Every thread will have a thread local pointer to its node in
723  the list. The sentinel node is used by the primary thread to
724  store "dummy" statistics before __kmp_create_worker() is called.
725 **************************************************************** */
726 class kmp_stats_list {
727  int gtid;
728  timeStat _timers[TIMER_LAST + 1];
729  counter _counters[COUNTER_LAST + 1];
730  explicitTimer thread_life_timer;
731  partitionedTimers _partitionedTimers;
732  int _nestLevel; // one per thread
733  kmp_stats_event_vector _event_vector;
734  kmp_stats_list *next;
735  kmp_stats_list *prev;
736  stats_state_e state;
737  int thread_is_idle_flag;
738 
739 public:
740  kmp_stats_list()
741  : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
742  TIMER_OMP_worker_thread_life),
743  _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
744  thread_is_idle_flag(0) {}
745  ~kmp_stats_list() {}
746  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
747  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
748  inline partitionedTimers *getPartitionedTimers() {
749  return &_partitionedTimers;
750  }
751  inline timeStat *getTimers() { return _timers; }
752  inline counter *getCounters() { return _counters; }
753  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
754  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
755  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
756  inline void resetEventVector() { _event_vector.reset(); }
757  inline void incrementNestValue() { _nestLevel++; }
758  inline int getNestValue() { return _nestLevel; }
759  inline void decrementNestValue() { _nestLevel--; }
760  inline int getGtid() const { return gtid; }
761  inline void setGtid(int newgtid) { gtid = newgtid; }
762  inline void setState(stats_state_e newstate) { state = newstate; }
763  inline stats_state_e getState() const { return state; }
764  inline stats_state_e *getStatePointer() { return &state; }
765  inline bool isIdle() { return thread_is_idle_flag == 1; }
766  inline void setIdleFlag() { thread_is_idle_flag = 1; }
767  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
768  kmp_stats_list *push_back(int gtid); // returns newly created list node
769  inline void push_event(uint64_t start_time, uint64_t stop_time,
770  int nest_level, timer_e name) {
771  _event_vector.push_back(start_time, stop_time, nest_level, name);
772  }
773  void deallocate();
774  class iterator;
775  kmp_stats_list::iterator begin();
776  kmp_stats_list::iterator end();
777  int size();
778  class iterator {
779  kmp_stats_list *ptr;
780  friend kmp_stats_list::iterator kmp_stats_list::begin();
781  friend kmp_stats_list::iterator kmp_stats_list::end();
782 
783  public:
784  iterator();
785  ~iterator();
786  iterator operator++();
787  iterator operator++(int dummy);
788  iterator operator--();
789  iterator operator--(int dummy);
790  bool operator!=(const iterator &rhs);
791  bool operator==(const iterator &rhs);
792  kmp_stats_list *operator*() const; // dereference operator
793  };
794 };
795 
796 /* ****************************************************************
797  Class to encapsulate all output functions and the environment variables
798 
799  This module holds filenames for various outputs (normal stats, events, plot
800  file), as well as coloring information for the plot file.
801 
802  The filenames and flags variables are read from environment variables.
803  These are read once by the constructor of the global variable
804  __kmp_stats_output which calls init().
805 
806  During this init() call, event flags for the timeStat::timerInfo[] global
807  array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
808 
809  The only interface function that is public is outputStats(heading). This
810  function should print out everything it needs to, either to files or stderr,
811  depending on the environment variables described below
812 
813  ENVIRONMENT VARIABLES:
814  KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
815  file, otherwise, print to stderr
816  KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
817  either KMP_STATS_FILE or stderr
818  KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
819  otherwise, the plot file is sent to "events.plt"
820  KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
821  events
822  KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
823  otherwise, output is sent to "events.dat"
824 **************************************************************** */
825 class kmp_stats_output_module {
826 
827 public:
828  struct rgb_color {
829  float r;
830  float g;
831  float b;
832  };
833 
834 private:
835  std::string outputFileName;
836  static const char *eventsFileName;
837  static const char *plotFileName;
838  static int printPerThreadFlag;
839  static int printPerThreadEventsFlag;
840  static const rgb_color globalColorArray[];
841  static rgb_color timerColorInfo[];
842 
843  void init();
844  static void setupEventColors();
845  static void printPloticusFile();
846  static void printHeaderInfo(FILE *statsOut);
847  static void printTimerStats(FILE *statsOut, statistic const *theStats,
848  statistic const *totalStats);
849  static void printCounterStats(FILE *statsOut, statistic const *theStats);
850  static void printCounters(FILE *statsOut, counter const *theCounters);
851  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
852  int gtid);
853  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
854  static void windupExplicitTimers();
855  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
856 
857 public:
858  kmp_stats_output_module() { init(); }
859  void outputStats(const char *heading);
860 };
861 
862 #ifdef __cplusplus
863 extern "C" {
864 #endif
865 void __kmp_stats_init();
866 void __kmp_stats_fini();
867 void __kmp_reset_stats();
868 void __kmp_output_stats(const char *);
869 void __kmp_accumulate_stats_at_exit(void);
870 // thread local pointer to stats node within list
871 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
872 // head to stats list.
873 extern kmp_stats_list *__kmp_stats_list;
874 // lock for __kmp_stats_list
875 extern kmp_tas_lock_t __kmp_stats_lock;
876 // reference start time
877 extern tsc_tick_count __kmp_stats_start_time;
878 // interface to output
879 extern kmp_stats_output_module __kmp_stats_output;
880 
881 #ifdef __cplusplus
882 }
883 #endif
884 
885 // Simple, standard interfaces that drop out completely if stats aren't enabled
886 
// Adds `value` (converted to double) as a sample to the current thread's
// TIMER_<name> statistic. The argument is parenthesised so that an expression
// such as `a / b` is evaluated first and then converted, instead of having
// the cast bind to `a` alone.
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)(value))

// Increments the current thread's COUNTER_<name> counter by one.
#define KMP_COUNT_BLOCK(name)                                                  \
  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()

// Forwards to __kmp_output_stats() with the given heading.
#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)

// Calls init() on the current thread's partitioned timers with an
// explicitTimer bound to TIMER_<name>.
#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

// Declares a blockPartitionedTimer local: TIMER_<name> is pushed here and
// popped when the enclosing scope ends.
#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
  blockPartitionedTimer __PBLOCKTIME__(                                        \
      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

// Explicit push / pop / exchange on the current thread's partitioned timers.
#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

#define KMP_POP_PARTITIONED_TIMER()                                            \
  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()

#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

// Set / read the current thread's stats_state_e state.
#define KMP_SET_THREAD_STATE(state_name)                                       \
  __kmp_stats_thread_ptr->setState(state_name)

#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()

// Declares a blockThreadState local: the state is set here and restored when
// the enclosing scope ends.
#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
                                    state_name)

// Forwards to __kmp_reset_stats().
#define KMP_RESET_STATS() __kmp_reset_stats()
978 
// Developer variants of the stats macros: real when KMP_DEVELOPER_STATS is
// on, no-ops otherwise.
#if (KMP_DEVELOPER_STATS)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
// KMP_POP_PARTITIONED_TIMER takes no parameters, so `n` must not be forwarded:
// passing it on is a preprocessor error for any non-empty argument.
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER()
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
  KMP_EXCHANGE_PARTITIONED_TIMER(n)
#else
// Null definitions
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#endif
996 
997 #else // KMP_STATS_ENABLED
998 
// Null definitions: with statistics disabled every stats macro expands to a
// harmless expression statement.
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)

#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
// Matches the stats-enabled interface, which also provides an exchange
// operation; without this a stats-disabled build using it would not compile.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1019 #endif // KMP_STATS_ENABLED
1020 
1021 #endif // KMP_STATS_H
statistic is valid only for primary thread
Definition: kmp_stats.h:51
statistic is valid only for non-primary threads
Definition: kmp_stats.h:53
do not show a TOTAL_aggregation for this statistic
Definition: kmp_stats.h:50
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition: kmp_stats.h:303
statistic doesn't need units printed next to it
Definition: kmp_stats.h:52
stats_flags_e
flags to describe the statistic (timer or counter)
Definition: kmp_stats.h:49
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition: kmp_stats.h:95
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63