www.mooseframework.org
PerfGraph.C
Go to the documentation of this file.
1 //* This file is part of the MOOSE framework
2 //* https://www.mooseframework.org
3 //*
4 //* All rights reserved, see COPYRIGHT for full restrictions
5 //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 //*
7 //* Licensed under LGPL 2.1, please see LICENSE for details
8 //* https://www.gnu.org/licenses/lgpl-2.1.html
9 
10 #include "PerfGraph.h"
11 
12 // MOOSE Includes
13 #include "PerfGuard.h"
14 #include "MooseError.h"
15 #include "PerfGraphLivePrint.h"
16 #include "MooseApp.h"
17 
18 // Note: do everything we can to make sure this only gets #included
19 // in the .C file... this is a heavily templated header that we
20 // don't want to expose to EVERY file in MOOSE...
21 #include "VariadicTable.h"
22 
23 // System Includes
24 #include <chrono>
25 #include <memory>
26 
27 PerfGraph::PerfGraph(const std::string & root_name,
28  MooseApp & app,
29  const bool live_all,
30  const bool perf_graph_live)
32  _moose_app(app),
33  _live_print_all(live_all),
34  _disable_live_print(!perf_graph_live),
35  _perf_graph_registry(moose::internal::getPerfGraphRegistry()),
36  _pid(app.comm().rank()),
37  _root_name(root_name),
38  _root_node_id(_perf_graph_registry.registerSection(root_name, 0)),
39  _root_node(std::make_unique<PerfNode>(_root_node_id)),
40  _current_position(-1),
41  _stack(),
42  _execution_list_begin(0),
43  _execution_list_end(0),
44  _active(true),
45  _live_print_active(true),
46  _destructing(false),
47  _live_print_time_limit(5.0),
48  _live_print_mem_limit(100),
49  _live_print(std::make_unique<PerfGraphLivePrint>(*this, app))
50 {
51  if (_pid == 0 && !_disable_live_print)
52  {
53  // Start the printing thread
54  _print_thread = std::thread([this] { this->_live_print->start(); });
55  }
56 
58 }
59 
61 
62 void
64 {
65  if (_pid == 0 && !_disable_live_print)
66  {
67  {
68  // Unlike using atomics for execution_thread_end
69  // here we actually lock to ensure that either the print thread
70  // immediately sees that we are destructing or is immediately
71  // notified with the below notification. Without doing this
72  // it would be possible (but unlikely) for the print thread to
73  // hang for 1 second at the end of execution (which would not be
74  // good anytime you are running lots of fast calculations back-to-back
75  // like during testing or stochastic sampling).
76  std::lock_guard<std::mutex> lock(_destructing_mutex);
77  _destructing = true;
78  }
79 
80  _finished_section.notify_one();
81 
82  _print_thread.join();
83 
84  _disable_live_print = true;
85  }
86 }
87 
88 Real
90  const std::string & section_name,
91  const bool must_exist /* = true */)
92 {
93  update();
94 
95  const auto section_it =
96  _cumulative_section_info.find(section_name == "Root" ? _root_name : section_name);
97 
98  if (section_it == _cumulative_section_info.end())
99  {
100  if (!must_exist || // isn't required to exist
101  _perf_graph_registry.sectionExists(section_name) // or, is required to exist and it does
102  )
103  return 0;
104 
105  mooseError("Unknown PerfGraph section name \"",
106  section_name,
107  "\" in PerfGraph::sectionData().\nIf you are attempting to retrieve the root use "
108  "\"Root\".");
109  }
110 
111  const CumulativeSectionInfo & section_info = section_it->second;
112 
113  if (type == CALLS)
114  return section_info._num_calls;
115 
116  const auto app_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
117 
118  switch (type)
119  {
120  case SELF:
121  return section_info._self;
122  case CHILDREN:
123  return section_info._children;
124  case TOTAL:
125  return section_info._total;
126  case SELF_AVG:
127  return section_info._self / static_cast<Real>(section_info._num_calls);
128  case CHILDREN_AVG:
129  return section_info._children / static_cast<Real>(section_info._num_calls);
130  case TOTAL_AVG:
131  return section_info._total / static_cast<Real>(section_info._num_calls);
132  case SELF_PERCENT:
133  return 100. * (section_info._self / app_time);
134  case CHILDREN_PERCENT:
135  return 100. * (section_info._children / app_time);
136  case TOTAL_PERCENT:
137  return 100. * (section_info._total / app_time);
138  case SELF_MEMORY:
139  return section_info._self_memory;
140  case CHILDREN_MEMORY:
141  return section_info._children_memory;
142  case TOTAL_MEMORY:
143  return section_info._total_memory;
144  default:
145  ::mooseError("Unknown DataType");
146  }
147 }
148 
149 void
151  const IncrementState state,
152  const std::chrono::time_point<std::chrono::steady_clock> time,
153  const long int memory)
154 {
155  auto & section_increment = _execution_list[_execution_list_end];
156 
157  section_increment._id = id;
158  section_increment._state = state;
159  section_increment._time = time;
160  section_increment._memory = memory;
161  section_increment._beginning_num_printed = _console.numPrinted();
162 
163  // A note about this next section of code:
164  // It is only EVER run on the main thread - and therefore there can be
165  // no race conditions. All that is important here is that the print
166  // thread always sees a consistent value for _execution_list_end
167  auto next_execution_list_end = _execution_list_end + 1;
168 
169  // Are we at the end of our circular buffer?
170  if (next_execution_list_end >= MAX_EXECUTION_LIST_SIZE)
171  next_execution_list_end = 0;
172 
173  // This "release" will synchronize the above memory changes with the
174  // "acquire" in the printing thread
175  // All of the above memory operations will be seen by the
176  // printing thread before the printing thread sees this new value
177  _execution_list_end.store(next_execution_list_end, std::memory_order_release);
178 }
179 
180 void
182 {
183  if (!_active && !_live_print_active)
184  return;
185 
186  PerfNode * new_node = nullptr;
187 
188  if (id == _root_node_id)
189  new_node = _root_node.get();
190  else
191  new_node = _stack[_current_position]->getChild(id);
192 
193  MemoryUtils::Stats stats;
194  auto memory_success = MemoryUtils::getMemoryStats(stats);
195 
196  long int start_memory = 0;
197 
198  if (memory_success)
199  start_memory =
201  else if (_current_position !=
202  -1) // If we weren't able to get the memory stats, let's just use the parent's
203  start_memory = _stack[_current_position]->startMemory();
204 
205  // Set the start time
206  auto current_time = std::chrono::steady_clock::now();
207 
208  new_node->setStartTimeAndMemory(current_time, start_memory);
209 
210  // Increment the number of calls
211  new_node->incrementNumCalls();
212 
214 
215  if (_current_position >= MAX_STACK_SIZE)
216  mooseError("PerfGraph is out of stack space!");
217 
218  _stack[_current_position] = new_node;
219 
220  // Add this to the execution list unless the message is empty - but pre-emted by live_print_all
223  addToExecutionList(id, IncrementState::STARTED, current_time, start_memory);
224 }
225 
226 void
228 {
229  if (!_active && !_live_print_active)
230  return;
231 
232  auto current_time = std::chrono::steady_clock::now();
233 
234  auto & current_node = _stack[_current_position];
235 
236  MemoryUtils::Stats stats;
237  auto memory_success = MemoryUtils::getMemoryStats(stats);
238 
239  long int current_memory = 0;
240 
241  if (memory_success)
242  current_memory =
244  else if (_current_position !=
245  -1) // If we weren't able to get the memory stats, let's just use the start memory
246  current_memory = _stack[_current_position]->startMemory();
247 
248  current_node->addTimeAndMemory(current_time, current_memory);
249 
251 
252  // Add this to the exection list
254  (!_perf_graph_registry.readSectionInfo(current_node->id())._live_message.empty() ||
256  {
257  addToExecutionList(current_node->id(), IncrementState::FINISHED, current_time, current_memory);
258 
259  // Tell the printing thread that a section has finished
260  //
261  // Note: no mutex is needed here because we're using an atomic
262  // in the predicate of the condition_variable in the thread
263  // This is technically correct - but there is a chance of missing a signal
264  // For us - that chance is low and doesn't matter (the timeout will just be hit
265  // instead). So - I would rather not have an extra lock here in the main thread.
266  _finished_section.notify_one();
267  }
268 }
269 
270 void
272 {
273  // First update all of the currently running nodes
274  auto now = std::chrono::steady_clock::now();
275 
276  MemoryUtils::Stats stats;
278  auto now_memory =
280 
281  for (int i = 0; i <= _current_position; i++)
282  {
283  auto node = _stack[i];
284  node->addTimeAndMemory(now, now_memory);
285  node->setStartTimeAndMemory(now, now_memory);
286  }
287 
288  // Zero out the entries
289  for (auto & section_time_it : _cumulative_section_info)
290  {
291  auto & section_time = section_time_it.second;
292 
293  section_time._num_calls = 0;
294  section_time._self = 0.;
295  section_time._children = 0.;
296  section_time._total = 0.;
297  section_time._self_memory = 0;
298  section_time._children_memory = 0;
299  section_time._total_memory = 0.;
300  }
301 
303 
304  // Update vector pointing to section times
305  // Note: we are doing this _after_ recursively filling
306  // because new entries may have been created
308 
309  for (auto & section_time_it : _cumulative_section_info)
310  {
311  auto id = _perf_graph_registry.sectionID(section_time_it.first);
312 
313  _cumulative_section_info_ptrs[id] = &section_time_it.second;
314  }
315 }
316 
317 void
319 {
320  const auto & section_name = _perf_graph_registry.readSectionInfo(current_node.id())._name;
321 
322  // RHS insertion on purpose
323  auto & section_time = _cumulative_section_info[section_name];
324 
325  section_time._self += current_node.selfTimeSec();
326  section_time._children += current_node.childrenTimeSec();
327  section_time._total += current_node.totalTimeSec();
328  section_time._num_calls += current_node.numCalls();
329 
330  section_time._self_memory += current_node.selfMemory();
331  section_time._children_memory += current_node.childrenMemory();
332  section_time._total_memory += current_node.totalMemory();
333 
334  for (auto & child_it : current_node.children())
335  recursivelyUpdate(*child_it.second);
336 }
337 
339 PerfGraph::treeTable(const unsigned int level, const bool heaviest /* = false */)
340 {
341  update();
342 
343  FullTable vtable({"Section",
344  "Calls",
345  "Self(s)",
346  "Avg(s)",
347  "%",
348  "Mem(MB)",
349  "Total(s)",
350  "Avg(s)",
351  "%",
352  "Mem(MB)"},
353  10);
354 
355  vtable.setColumnFormat({
356  VariadicTableColumnFormat::AUTO, // Section Name
366  });
367 
368  vtable.setColumnPrecision({
369  1, // Section Name
370  0, // Calls
371  3, // Self
372  3, // Avg.
373  2, // %
374  0, // Memory
375  3, // Total
376  3, // Avg.
377  2, // %
378  0, // Memory
379  });
380 
381  auto act = [this, &vtable](const PerfNode & node,
382  const moose::internal::PerfGraphSectionInfo & section_info,
383  const unsigned int depth)
384  {
385  vtable.addRow(std::string(depth * 2, ' ') + section_info._name, // Section Name
386  node.numCalls(), // Calls
387  node.selfTimeSec(), // Self
388  node.selfTimeAvg(), // Avg.
389  100. * node.selfTimeSec() / _root_node->totalTimeSec(), // %
390  node.selfMemory(), // Memory
391  node.totalTimeSec(), // Total
392  node.totalTimeAvg(), // Avg.
393  100. * node.totalTimeSec() / _root_node->totalTimeSec(), // %
394  node.totalMemory()); // Memory
395  };
396  treeRecurse(act, level, heaviest);
397 
398  return vtable;
399 }
400 
401 void
402 PerfGraph::print(const ConsoleStream & console, unsigned int level)
403 {
404  console << "\nPerformance Graph:\n";
405  treeTable(level).print(console);
406 }
407 
408 void
410 {
411  console << "\nHeaviest Branch:\n";
412  treeTable(MAX_STACK_SIZE, /* heaviest = */ true).print(console);
413 }
414 
415 void
416 PerfGraph::printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections)
417 {
418  update();
419 
420  console << "\nHeaviest Sections:\n";
421 
422  // Indirect Sort The Self Time
423  std::vector<size_t> sorted;
426  sorted,
428  {
429  if (lhs && rhs)
430  return lhs->_self > rhs->_self;
431 
432  // If the LHS exists - it's definitely bigger than a non-existant RHS
433  if (lhs)
434  return true;
435 
436  // Both don't exist - so it doesn't matter how we sort them
437  return false;
438  });
439 
440  HeaviestTable vtable({"Section", "Calls", "Self(s)", "Avg.", "%", "Mem(MB)"}, 10);
441 
442  vtable.setColumnFormat({VariadicTableColumnFormat::AUTO, // Section; doesn't matter
448  );
449 
450  vtable.setColumnPrecision({
451  1, // Section
452  1, // Calls
453  3, // Time
454  3, // Avg.
455  2, // Percent
456  1 // Memory
457  });
458 
459  mooseAssert(!_cumulative_section_info_ptrs.empty(),
460  "update() must be run before printHeaviestSections()!");
461 
462  // The total time of the root node
463  auto total_root_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
464 
465  // Now print out the largest ones
466  for (unsigned int i = 0; i < num_sections; i++)
467  {
468  auto id = sorted[i];
469 
471  continue;
472 
473  const auto & entry = *_cumulative_section_info_ptrs[id];
474  vtable.addRow(_perf_graph_registry.sectionInfo(id)._name, // Section
475  entry._num_calls, // Calls
476  entry._self, // Time
477  entry._self / static_cast<Real>(entry._num_calls), // Avg.
478  100. * entry._self / total_root_time, // Percent
479  entry._self_memory); // Memory
480  }
481 
482  vtable.print(console);
483 }
484 
485 void
486 dataStore(std::ostream & stream, PerfGraph & perf_graph, void *)
487 {
488  // We need to store the registry id -> section info map so that we can add
489  // registered sections that may not be added yet during recover
490  dataStore(stream, perf_graph._perf_graph_registry._id_to_item, nullptr);
491 
492  // Update before serializing the nodes so that the time/memory/calls are correct
493  perf_graph.update();
494 
495  // Recursively serialize all of the nodes
496  dataStore(stream, perf_graph._root_node, nullptr);
497 }
498 
499 void
500 dataLoad(std::istream & stream, PerfGraph & perf_graph, void *)
501 {
502  // Load in all of the recovered sections and register those that do not exist yet
503  std::vector<moose::internal::PerfGraphSectionInfo> recovered_section_info;
504  dataLoad(stream, recovered_section_info, nullptr);
505  for (const auto & info : recovered_section_info)
506  {
507  if (info._live_message.size())
509  info._name, info._level, info._live_message, info._print_dots);
510  else
511  perf_graph._perf_graph_registry.registerSection(info._name, info._level);
512  }
513 
514  // Update the current node time/memory/calls before loading the nodes as the load
515  // will append information to current nodes that exist
516  perf_graph.update();
517 
518  // Recursively load all of the nodes; this will append information to matching nodes
519  // and will create new nodes for section paths that do not exist
520  dataLoad(stream, perf_graph._root_node, &perf_graph);
521 }
long int totalMemory() const
Get the amount of memory added by this node.
Definition: PerfNode.h:161
void recursivelyUpdate(const PerfNode &current_node)
Updates the cumulative self/children/total time and memory for each section across all nodes that con...
Definition: PerfGraph.C:318
Real selfTimeSec() const
Get the time this node took in seconds.
Definition: PerfNode.h:115
PerfGraphRegistry & _perf_graph_registry
The PerfGraphRegistry.
Definition: PerfGraph.h:340
A helper class for re-directing output streams to Console output objects form MooseObjects.
Definition: ConsoleStream.h:30
long int _total_memory
Total memory gain for this section.
Definition: PerfGraph.h:235
long int _self_memory
Amount of memory gained within this section (without children)
Definition: PerfGraph.h:229
unsigned long int _num_calls
Number of times this section has been called.
Definition: PerfGraph.h:226
~PerfGraph()
Destructor.
Definition: PerfGraph.C:60
PerfGraph(const std::string &root_name, MooseApp &app, const bool live_all, const bool perf_graph_live)
Create a new PerfGraph.
Definition: PerfGraph.C:27
const processor_id_type _pid
This processor id.
Definition: PerfGraph.h:343
std::string _live_message
Message to print while the section is running.
bool sectionExists(const std::string &section_name) const
Whether or not a section with that name has been registered The name of the section.
void setStartTimeAndMemory(const std::chrono::time_point< std::chrono::steady_clock > time, const long int memory)
Set the current start time.
Definition: PerfNode.h:40
Real sectionData(const DataType type, const std::string &section_name, const bool must_exist=true)
Gets a PerfGraph result pertaining to a section.
Definition: PerfGraph.C:89
long int selfMemory() const
Get the amount of memory added by this node.
Definition: PerfNode.C:41
void print(StreamType &stream)
Pretty print the table of data.
Definition: VariadicTable.h:97
MPI_Info info
bool getMemoryStats(Stats &stats)
get all memory stats for the current process stats The Stats object to fill with the data ...
Definition: MemoryUtils.C:79
void treeRecurse(const Functor &act, const unsigned int level=MAX_STACK_SIZE, const bool heaviest=false) const
Definition: PerfGraph.h:479
std::thread _print_thread
The thread for printing sections as they execute.
Definition: PerfGraph.h:414
void mooseError(Args &&... args)
Emit an error message with the given stringified, concatenated args and terminate the application...
Definition: MooseError.h:299
Real selfTimeAvg() const
The average time this node took in seconds.
Definition: PerfNode.h:119
std::size_t _physical_memory
Definition: MemoryUtils.h:23
DataType
For retrieving values.
Definition: PerfGraph.h:51
A class for "pretty printing" a table of data.
void indirectSort(RandomAccessIterator beg, RandomAccessIterator end, std::vector< size_t > &b)
Definition: IndirectSort.h:68
const std::unique_ptr< PerfGraphLivePrint > _live_print
The object that is doing live printing.
Definition: PerfGraph.h:411
Real _total
Total amount of time used.
Definition: PerfGraph.h:223
Base class for MOOSE-based applications.
Definition: MooseApp.h:73
std::atomic< unsigned int > _execution_list_end
Where the print thread should stop reading the execution list.
Definition: PerfGraph.h:367
long int childrenMemory() const
Get the amount of memory added by this node.
Definition: PerfNode.C:47
IncrementState
The execution state of an increment.
Definition: PerfGraph.h:241
bool _live_print_all
Whether or not to put everything in the perf graph.
Definition: PerfGraph.h:334
Real totalTimeSec() const
The time this Node plus all of its children took in seconds.
Definition: PerfNode.h:128
Real _self
Amount of time used within this section (without children)
Definition: PerfGraph.h:217
Used to hold metadata about the registered sections Note: this is a class instead of a struct because...
std::condition_variable _finished_section
The condition_variable to wake the print thread.
Definition: PerfGraph.h:402
void update()
Updates the time section_time and time for all currently running nodes.
Definition: PerfGraph.C:271
unsigned int PerfID
Definition: MooseTypes.h:201
void incrementNumCalls()
Increments the number of calls.
Definition: PerfNode.h:75
void addToExecutionList(const PerfID id, const IncrementState state, const std::chrono::time_point< std::chrono::steady_clock > time, const long int memory)
Add the information to the execution list.
Definition: PerfGraph.C:150
std::vector< CumulativeSectionInfo * > _cumulative_section_info_ptrs
Pointers into _cumulative_section_info indexed on PerfID This is here for convenience and speed so we...
Definition: PerfGraph.h:384
unsigned long int numCalls() const
Get the number of times this node was called.
Definition: PerfNode.h:146
unsigned long long int numPrinted() const
The number of times something has been printed.
Definition: ConsoleStream.C:70
const std::unique_ptr< PerfNode > _root_node
The root node of the graph.
Definition: PerfGraph.h:352
std::array< PerfNode *, MAX_STACK_SIZE > _stack
The full callstack. Currently capped at a depth of 100.
Definition: PerfGraph.h:358
const std::map< PerfID, std::unique_ptr< PerfNode > > & children() const
Get the children.
Definition: PerfNode.h:106
Real childrenTimeSec() const
The time this node&#39;s children took in seconds.
Definition: PerfNode.h:141
PerfID id() const
Get the ID of this Node.
Definition: PerfNode.h:35
void dataLoad(std::istream &stream, PerfGraph &perf_graph, void *)
Definition: PerfGraph.C:500
std::mutex _destructing_mutex
The mutex to use with a condition_variable predicate to guard _destructing.
Definition: PerfGraph.h:399
std::unordered_map< std::string, CumulativeSectionInfo > _cumulative_section_info
The cumulative time and memory for each section.
Definition: PerfGraph.h:378
An inteface for the _console for outputting to the Console object.
bool _active
Whether or not timing is active.
Definition: PerfGraph.h:387
This is effectively a functor that runs on a separate thread and watches the state of the call stack ...
PerfID registerSection(const std::string &section_name, const unsigned int level)
Call to register a named section for timing.
void disableLivePrint()
Completely disables Live Print (cannot be restarted)
Definition: PerfGraph.C:63
int _current_position
The current node position in the stack.
Definition: PerfGraph.h:355
Real _children
Amount of time used by children.
Definition: PerfGraph.h:220
void pop()
Remove a Node from the end of the current scope.
Definition: PerfGraph.C:227
const PerfGraphSectionInfo & sectionInfo(const PerfID section_id) const
Given a PerfID return the PerfGraphSectionInfo The ID.
bool _disable_live_print
Whether or not live print is disabled (cannot be turned on again)
Definition: PerfGraph.h:337
void setColumnFormat(const std::vector< VariadicTableColumnFormat > &column_format)
Set how to format numbers for each column.
void print(const ConsoleStream &console, unsigned int level)
Print the tree out.
Definition: PerfGraph.C:402
void printHeaviestBranch(const ConsoleStream &console)
Print out the heaviest branch through the tree.
Definition: PerfGraph.C:409
DIE A HORRIBLE DEATH HERE typedef LIBMESH_DEFAULT_SCALAR_TYPE Real
Real totalTimeAvg() const
The average time this Node plus all of its children took in seconds.
Definition: PerfNode.h:132
const std::string _root_name
Name of the root node.
Definition: PerfGraph.h:346
PerfID sectionID(const std::string &section_name) const
Given a name return the PerfID The name of the section.
Use to hold the cumulative time and memory for each section, which comes from all of the PerfNodes th...
Definition: PerfGraph.h:214
long int _children_memory
Amount of memory gained by children.
Definition: PerfGraph.h:232
bool _destructing
Tell the print thread to teardown.
Definition: PerfGraph.h:396
void printHeaviestSections(const ConsoleStream &console, const unsigned int num_sections)
Print out the heaviest sections that were timed.
Definition: PerfGraph.C:416
std::vector< Item > _id_to_item
Vector of IDs to Items.
const ConsoleStream _console
An instance of helper class to write streams to the Console objects.
void push(const PerfID id)
Add a Node onto the end of the end of the current callstack.
Definition: PerfGraph.C:181
FullTable treeTable(const unsigned int level, const bool heaviest=false)
Helper for building a VariadicTable that represents the tree.
Definition: PerfGraph.C:339
The PerfGraph will hold the master list of all registered performance segments and the head PerfNode...
Definition: PerfGraph.h:43
std::size_t convertBytes(std::size_t bytes, MemUnits unit)
convert bytes to selected unit prefix
Definition: MemoryUtils.C:174
PerfGraphRegistry & getPerfGraphRegistry()
Get the global PerfGraphRegistry singleton.
const PerfGraphSectionInfo & readSectionInfo(PerfID section_id) const
Special accessor just for PerfGraph so that no locking is needed in PerfGraph.
std::array< SectionIncrement, MAX_EXECUTION_LIST_SIZE > _execution_list
A circular buffer for holding the execution list, this is read by the printing loop.
Definition: PerfGraph.h:361
std::atomic< bool > _live_print_active
Whether or not live printing is active.
Definition: PerfGraph.h:390
void dataStore(std::ostream &stream, PerfGraph &perf_graph, void *)
Definition: PerfGraph.C:486
const PerfID _root_node_id
The id for the root node.
Definition: PerfGraph.h:349
A node in the PerfGraph.
Definition: PerfNode.h:24