https://mooseframework.inl.gov
PerfGraph.C
Go to the documentation of this file.
1 //* This file is part of the MOOSE framework
2 //* https://mooseframework.inl.gov
3 //*
4 //* All rights reserved, see COPYRIGHT for full restrictions
5 //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 //*
7 //* Licensed under LGPL 2.1, please see LICENSE for details
8 //* https://www.gnu.org/licenses/lgpl-2.1.html
9 
10 #include "PerfGraph.h"
11 
12 // MOOSE Includes
13 #include "PerfGuard.h"
14 #include "MooseError.h"
15 #include "PerfGraphLivePrint.h"
16 #include "PerfGraphRegistry.h"
17 #include "MooseApp.h"
18 
19 // Note: do everything we can to make sure this only gets #included
20 // in the .C file... this is a heavily templated header that we
21 // don't want to expose to EVERY file in MOOSE...
22 #include "VariadicTable.h"
23 
24 // System Includes
25 #include <chrono>
26 #include <memory>
27 
28 PerfGraph::PerfGraph(const std::string & root_name,
29  MooseApp & app,
30  const bool live_all,
31  const bool perf_graph_live)
33  _moose_app(app),
34  _live_print_all(live_all),
35  _disable_live_print(!perf_graph_live),
36  _perf_graph_registry(moose::internal::getPerfGraphRegistry()),
37  _pid(app.comm().rank()),
38  _root_name(root_name),
39  _root_node_id(_perf_graph_registry.registerSection(root_name, 0)),
40  _root_node(std::make_unique<PerfNode>(_root_node_id)),
41  _current_position(-1),
42  _stack(),
43  _execution_list_begin(0),
44  _execution_list_end(0),
45  _active(true),
46  _destructing(false),
47  _live_print_time_limit(5.0),
48  _live_print_mem_limit(100),
49  _live_print(std::make_unique<PerfGraphLivePrint>(*this, app))
50 {
52 }
53 
55 
56 void
58 {
59  if (_pid == 0 && !_disable_live_print)
60  {
61  // Start the printing thread
62  _print_thread = std::thread([this] { this->_live_print->start(); });
63  }
64 }
65 
66 void
68 {
69  if (_pid == 0 && !_disable_live_print)
70  {
71  {
72  // Unlike using atomics for execution_thread_end
73  // here we actually lock to ensure that either the print thread
74  // immediately sees that we are destructing or is immediately
75  // notified with the below notification. Without doing this
76  // it would be possible (but unlikely) for the print thread to
77  // hang for 1 second at the end of execution (which would not be
78  // good anytime you are running lots of fast calculations back-to-back
79  // like during testing or stochastic sampling).
80  std::lock_guard<std::mutex> lock(_destructing_mutex);
81  _destructing = true;
82  }
83 
84  _finished_section.notify_one();
85 
86  _print_thread.join();
87 
88  _disable_live_print = true;
89  }
90 }
91 
92 Real
94  const std::string & section_name,
95  const bool must_exist /* = true */)
96 {
97  update();
98 
99  const auto section_it =
100  _cumulative_section_info.find(section_name == "Root" ? _root_name : section_name);
101 
102  if (section_it == _cumulative_section_info.end())
103  {
104  if (!must_exist || // isn't required to exist
105  _perf_graph_registry.sectionExists(section_name) // or, is required to exist and it does
106  )
107  return 0;
108 
109  mooseError("Unknown PerfGraph section name \"",
110  section_name,
111  "\" in PerfGraph::sectionData().\nIf you are attempting to retrieve the root use "
112  "\"Root\".");
113  }
114 
115  const CumulativeSectionInfo & section_info = section_it->second;
116 
117  if (type == CALLS)
118  return section_info._num_calls;
119 
120  const auto app_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
121 
122  switch (type)
123  {
124  case SELF:
125  return section_info._self;
126  case CHILDREN:
127  return section_info._children;
128  case TOTAL:
129  return section_info._total;
130  case SELF_AVG:
131  return section_info._self / static_cast<Real>(section_info._num_calls);
132  case CHILDREN_AVG:
133  return section_info._children / static_cast<Real>(section_info._num_calls);
134  case TOTAL_AVG:
135  return section_info._total / static_cast<Real>(section_info._num_calls);
136  case SELF_PERCENT:
137  return 100. * (section_info._self / app_time);
138  case CHILDREN_PERCENT:
139  return 100. * (section_info._children / app_time);
140  case TOTAL_PERCENT:
141  return 100. * (section_info._total / app_time);
142  case SELF_MEMORY:
143  return section_info._self_memory;
144  case CHILDREN_MEMORY:
145  return section_info._children_memory;
146  case TOTAL_MEMORY:
147  return section_info._total_memory;
148  default:
149  ::mooseError("Unknown DataType");
150  }
151 }
152 
153 void
155  const IncrementState state,
156  const std::chrono::time_point<std::chrono::steady_clock> time,
157  const long int memory)
158 {
159  auto & section_increment = _execution_list[_execution_list_end];
160 
161  section_increment._id = id;
162  section_increment._state = state;
163  section_increment._time = time;
164  section_increment._memory = memory;
165  section_increment._beginning_num_printed = _console.numPrinted();
166 
167  // A note about this next section of code:
168  // It is only EVER run on the main thread - and therefore there can be
169  // no race conditions. All that is important here is that the print
170  // thread always sees a consistent value for _execution_list_end
171  auto next_execution_list_end = _execution_list_end + 1;
172 
173  // Are we at the end of our circular buffer?
174  if (next_execution_list_end >= MAX_EXECUTION_LIST_SIZE)
175  next_execution_list_end = 0;
176 
177  // This "release" will synchronize the above memory changes with the
178  // "acquire" in the printing thread
179  // All of the above memory operations will be seen by the
180  // printing thread before the printing thread sees this new value
181  _execution_list_end.store(next_execution_list_end, std::memory_order_release);
182 }
183 
184 void
186 {
187  if (!_active)
188  return;
189 
190  PerfNode * new_node = nullptr;
191 
192  if (id == _root_node_id)
193  new_node = _root_node.get();
194  else
195  new_node = _stack[_current_position]->getChild(id);
196 
197  MemoryUtils::Stats stats;
198  auto memory_success = MemoryUtils::getMemoryStats(stats);
199 
200  long int start_memory = 0;
201 
202  if (memory_success)
203  start_memory =
205  else if (_current_position !=
206  -1) // If we weren't able to get the memory stats, let's just use the parent's
207  start_memory = _stack[_current_position]->startMemory();
208 
209  // Set the start time
210  auto current_time = std::chrono::steady_clock::now();
211 
212  new_node->setStartTimeAndMemory(current_time, start_memory);
213 
214  // Increment the number of calls
215  new_node->incrementNumCalls();
216 
218 
219  if (_current_position >= MOOSE_MAX_STACK_SIZE)
220  mooseError("PerfGraph is out of stack space!");
221 
222  _stack[_current_position] = new_node;
223 
224  // Add this to the execution list unless the message is empty - but pre-emted by live_print_all
225  if ((_pid == 0 && !_disable_live_print) &&
227  addToExecutionList(id, IncrementState::STARTED, current_time, start_memory);
228 }
229 
230 void
232 {
233  if (!_active)
234  return;
235 
236  auto current_time = std::chrono::steady_clock::now();
237 
238  auto & current_node = _stack[_current_position];
239 
240  MemoryUtils::Stats stats;
241  auto memory_success = MemoryUtils::getMemoryStats(stats);
242 
243  long int current_memory = 0;
244 
245  if (memory_success)
246  current_memory =
248  else if (_current_position !=
249  -1) // If we weren't able to get the memory stats, let's just use the start memory
250  current_memory = _stack[_current_position]->startMemory();
251 
252  current_node->addTimeAndMemory(current_time, current_memory);
253 
255 
256  // Add this to the exection list
257  if ((_pid == 0 && !_disable_live_print) &&
258  (!_perf_graph_registry.readSectionInfo(current_node->id())._live_message.empty() ||
260  {
261  addToExecutionList(current_node->id(), IncrementState::FINISHED, current_time, current_memory);
262 
263  // Tell the printing thread that a section has finished
264  //
265  // Note: no mutex is needed here because we're using an atomic
266  // in the predicate of the condition_variable in the thread
267  // This is technically correct - but there is a chance of missing a signal
268  // For us - that chance is low and doesn't matter (the timeout will just be hit
269  // instead). So - I would rather not have an extra lock here in the main thread.
270  _finished_section.notify_one();
271  }
272 }
273 
274 void
276 {
277  // First update all of the currently running nodes
278  auto now = std::chrono::steady_clock::now();
279 
280  MemoryUtils::Stats stats;
282  auto now_memory =
284 
285  for (int i = 0; i <= _current_position; i++)
286  {
287  auto node = _stack[i];
288  node->addTimeAndMemory(now, now_memory);
289  node->setStartTimeAndMemory(now, now_memory);
290  }
291 
292  // Zero out the entries
293  for (auto & section_time_it : _cumulative_section_info)
294  {
295  auto & section_time = section_time_it.second;
296 
297  section_time._num_calls = 0;
298  section_time._self = 0.;
299  section_time._children = 0.;
300  section_time._total = 0.;
301  section_time._self_memory = 0;
302  section_time._children_memory = 0;
303  section_time._total_memory = 0.;
304  }
305 
307 
308  // Update vector pointing to section times
309  // Note: we are doing this _after_ recursively filling
310  // because new entries may have been created
312 
313  for (auto & section_time_it : _cumulative_section_info)
314  {
315  auto id = _perf_graph_registry.sectionID(section_time_it.first);
316 
317  _cumulative_section_info_ptrs[id] = &section_time_it.second;
318  }
319 }
320 
321 void
323 {
324  const auto & section_name = _perf_graph_registry.readSectionInfo(current_node.id())._name;
325 
326  // RHS insertion on purpose
327  auto & section_time = _cumulative_section_info[section_name];
328 
329  section_time._self += current_node.selfTimeSec();
330  section_time._children += current_node.childrenTimeSec();
331  section_time._total += current_node.totalTimeSec();
332  section_time._num_calls += current_node.numCalls();
333 
334  section_time._self_memory += current_node.selfMemory();
335  section_time._children_memory += current_node.childrenMemory();
336  section_time._total_memory += current_node.totalMemory();
337 
338  for (auto & child_it : current_node.children())
339  recursivelyUpdate(*child_it.second);
340 }
341 
343 PerfGraph::treeTable(const unsigned int level, const bool heaviest /* = false */)
344 {
345  update();
346 
347  FullTable vtable({"Section",
348  "Calls",
349  "Self(s)",
350  "Avg(s)",
351  "%",
352  "Mem(MB)",
353  "Total(s)",
354  "Avg(s)",
355  "%",
356  "Mem(MB)"},
357  10);
358 
359  vtable.setColumnFormat({
360  VariadicTableColumnFormat::AUTO, // Section Name
370  });
371 
372  vtable.setColumnPrecision({
373  1, // Section Name
374  0, // Calls
375  3, // Self
376  3, // Avg.
377  2, // %
378  0, // Memory
379  3, // Total
380  3, // Avg.
381  2, // %
382  0, // Memory
383  });
384 
385  auto act = [this, &vtable](const PerfNode & node,
386  const moose::internal::PerfGraphSectionInfo & section_info,
387  const unsigned int depth)
388  {
389  vtable.addRow(std::string(depth * 2, ' ') + section_info._name, // Section Name
390  node.numCalls(), // Calls
391  node.selfTimeSec(), // Self
392  node.selfTimeAvg(), // Avg.
393  100. * node.selfTimeSec() / _root_node->totalTimeSec(), // %
394  node.selfMemory(), // Memory
395  node.totalTimeSec(), // Total
396  node.totalTimeAvg(), // Avg.
397  100. * node.totalTimeSec() / _root_node->totalTimeSec(), // %
398  node.totalMemory()); // Memory
399  };
400  treeRecurse(act, level, heaviest);
401 
402  return vtable;
403 }
404 
405 void
406 PerfGraph::print(const ConsoleStream & console, unsigned int level)
407 {
408  console << "\nPerformance Graph:\n";
409  treeTable(level).print(console);
410 }
411 
412 void
414 {
415  console << "\nHeaviest Branch:\n";
416  treeTable(MOOSE_MAX_STACK_SIZE, /* heaviest = */ true).print(console);
417 }
418 
419 void
420 PerfGraph::printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections)
421 {
422  update();
423 
424  console << "\nHeaviest Sections:\n";
425 
426  // Indirect Sort The Self Time
427  std::vector<size_t> sorted;
430  sorted,
432  {
433  if (lhs && rhs)
434  return lhs->_self > rhs->_self;
435 
436  // If the LHS exists - it's definitely bigger than a non-existant RHS
437  if (lhs)
438  return true;
439 
440  // Both don't exist - so it doesn't matter how we sort them
441  return false;
442  });
443 
444  HeaviestTable vtable({"Section", "Calls", "Self(s)", "Avg.", "%", "Mem(MB)"}, 10);
445 
446  vtable.setColumnFormat({VariadicTableColumnFormat::AUTO, // Section; doesn't matter
452  );
453 
454  vtable.setColumnPrecision({
455  1, // Section
456  1, // Calls
457  3, // Time
458  3, // Avg.
459  2, // Percent
460  1 // Memory
461  });
462 
463  mooseAssert(!_cumulative_section_info_ptrs.empty(),
464  "update() must be run before printHeaviestSections()!");
465 
466  // The total time of the root node
467  auto total_root_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
468 
469  // Now print out the largest ones
470  for (unsigned int i = 0; i < num_sections; i++)
471  {
472  auto id = sorted[i];
473 
475  continue;
476 
477  const auto & entry = *_cumulative_section_info_ptrs[id];
478  vtable.addRow(_perf_graph_registry.sectionInfo(id)._name, // Section
479  entry._num_calls, // Calls
480  entry._self, // Time
481  entry._self / static_cast<Real>(entry._num_calls), // Avg.
482  100. * entry._self / total_root_time, // Percent
483  entry._self_memory); // Memory
484  }
485 
486  vtable.print(console);
487 }
488 
489 void
490 dataStore(std::ostream & stream, PerfGraph & perf_graph, void *)
491 {
492  // We need to store the registry id -> section info map so that we can add
493  // registered sections that may not be added yet during recover
494  dataStore(stream, perf_graph._perf_graph_registry._id_to_item, nullptr);
495 
496  // Update before serializing the nodes so that the time/memory/calls are correct
497  perf_graph.update();
498 
499  // Recursively serialize all of the nodes
500  dataStore(stream, perf_graph._root_node, nullptr);
501 }
502 
503 void
504 dataLoad(std::istream & stream, PerfGraph & perf_graph, void *)
505 {
506  // Load in all of the recovered sections and register those that do not exist yet
507  std::vector<moose::internal::PerfGraphSectionInfo> recovered_section_info;
508  dataLoad(stream, recovered_section_info, nullptr);
509  for (const auto & info : recovered_section_info)
510  {
511  if (info._live_message.size())
513  info._name, info._level, info._live_message, info._print_dots);
514  else
515  perf_graph._perf_graph_registry.registerSection(info._name, info._level);
516  }
517 
518  // Update the current node time/memory/calls before loading the nodes as the load
519  // will append information to current nodes that exist
520  perf_graph.update();
521 
522  // Recursively load all of the nodes; this will append information to matching nodes
523  // and will create new nodes for section paths that do not exist
524  dataLoad(stream, perf_graph._root_node, &perf_graph);
525 }
long int totalMemory() const
Get the amount of memory added by this node.
Definition: PerfNode.h:161
void recursivelyUpdate(const PerfNode &current_node)
Updates the cumulative self/children/total time and memory for each section across all nodes that con...
Definition: PerfGraph.C:322
Real selfTimeSec() const
Get the time this node took in seconds.
Definition: PerfNode.h:115
PerfGraphRegistry & _perf_graph_registry
The PerfGraphRegistry.
Definition: PerfGraph.h:337
A helper class for re-directing output streams to Console output objects form MooseObjects.
Definition: ConsoleStream.h:30
long int _total_memory
Total memory gain for this section.
Definition: PerfGraph.h:235
long int _self_memory
Amount of memory gained within this section (without children)
Definition: PerfGraph.h:229
unsigned long int _num_calls
Number of times this section has been called.
Definition: PerfGraph.h:226
~PerfGraph()
Destructor.
Definition: PerfGraph.C:54
PerfGraph(const std::string &root_name, MooseApp &app, const bool live_all, const bool perf_graph_live)
Create a new PerfGraph.
Definition: PerfGraph.C:28
std::string _live_message
Message to print while the section is running.
bool sectionExists(const std::string &section_name) const
Whether or not a section with that name has been registered The name of the section.
void setStartTimeAndMemory(const std::chrono::time_point< std::chrono::steady_clock > time, const long int memory)
Set the current start time.
Definition: PerfNode.h:40
Real sectionData(const DataType type, const std::string &section_name, const bool must_exist=true)
Gets a PerfGraph result pertaining to a section.
Definition: PerfGraph.C:93
long int selfMemory() const
Get the amount of memory added by this node.
Definition: PerfNode.C:41
void print(StreamType &stream)
Pretty print the table of data.
Definition: VariadicTable.h:97
MPI_Info info
bool getMemoryStats(Stats &stats)
get all memory stats for the current process stats The Stats object to fill with the data ...
Definition: MemoryUtils.C:79
std::thread _print_thread
The thread for printing sections as they execute.
Definition: PerfGraph.h:408
void mooseError(Args &&... args)
Emit an error message with the given stringified, concatenated args and terminate the application...
Definition: MooseError.h:302
Real selfTimeAvg() const
The average time this node took in seconds.
Definition: PerfNode.h:119
std::size_t _physical_memory
Definition: MemoryUtils.h:23
DataType
For retrieving values.
Definition: PerfGraph.h:51
A class for "pretty printing" a table of data.
Definition: PerfGraph.h:34
void indirectSort(RandomAccessIterator beg, RandomAccessIterator end, std::vector< size_t > &b)
Definition: IndirectSort.h:68
const std::unique_ptr< PerfGraphLivePrint > _live_print
The object that is doing live printing.
Definition: PerfGraph.h:405
Real _total
Total amount of time used.
Definition: PerfGraph.h:223
Base class for MOOSE-based applications.
Definition: MooseApp.h:96
std::atomic< unsigned int > _execution_list_end
Where the print thread should stop reading the execution list.
Definition: PerfGraph.h:364
long int childrenMemory() const
Get the amount of memory added by this node.
Definition: PerfNode.C:47
IncrementState
The execution state of an increment.
Definition: PerfGraph.h:241
bool _live_print_all
Whether or not to put everything in the perf graph.
Definition: PerfGraph.h:331
Real totalTimeSec() const
The time this Node plus all of its children took in seconds.
Definition: PerfNode.h:128
Real _self
Amount of time used within this section (without children)
Definition: PerfGraph.h:217
Used to hold metadata about the registered sections Note: this is a class instead of a struct because...
std::condition_variable _finished_section
The condition_variable to wake the print thread.
Definition: PerfGraph.h:396
void update()
Updates the time section_time and time for all currently running nodes.
Definition: PerfGraph.C:275
unsigned int PerfID
Definition: MooseTypes.h:212
void incrementNumCalls()
Increments the number of calls.
Definition: PerfNode.h:75
void addToExecutionList(const PerfID id, const IncrementState state, const std::chrono::time_point< std::chrono::steady_clock > time, const long int memory)
Add the information to the execution list.
Definition: PerfGraph.C:154
std::vector< CumulativeSectionInfo * > _cumulative_section_info_ptrs
Pointers into _cumulative_section_info indexed on PerfID This is here for convenience and speed so we...
Definition: PerfGraph.h:381
unsigned long int numCalls() const
Get the number of times this node was called.
Definition: PerfNode.h:146
void treeRecurse(const Functor &act, const unsigned int level=MOOSE_MAX_STACK_SIZE, const bool heaviest=false) const
Definition: PerfGraph.h:473
unsigned long long int numPrinted() const
The number of times something has been printed.
Definition: ConsoleStream.C:68
const std::unique_ptr< PerfNode > _root_node
The root node of the graph.
Definition: PerfGraph.h:349
const std::map< PerfID, std::unique_ptr< PerfNode > > & children() const
Get the children.
Definition: PerfNode.h:106
Real childrenTimeSec() const
The time this node&#39;s children took in seconds.
Definition: PerfNode.h:141
PerfID id() const
Get the ID of this Node.
Definition: PerfNode.h:35
void dataLoad(std::istream &stream, PerfGraph &perf_graph, void *)
Definition: PerfGraph.C:504
std::mutex _destructing_mutex
The mutex to use with a condition_variable predicate to guard _destructing.
Definition: PerfGraph.h:393
std::unordered_map< std::string, CumulativeSectionInfo > _cumulative_section_info
The cumulative time and memory for each section.
Definition: PerfGraph.h:375
An inteface for the _console for outputting to the Console object.
std::deque< Item > _id_to_item
Vector of IDs to Items.
bool _active
Whether or not timing is active.
Definition: PerfGraph.h:384
This is effectively a functor that runs on a separate thread and watches the state of the call stack ...
PerfID registerSection(const std::string &section_name, const unsigned int level)
Call to register a named section for timing.
void disableLivePrint()
Completely disables Live Print (cannot be restarted)
Definition: PerfGraph.C:67
int _current_position
The current node position in the stack.
Definition: PerfGraph.h:352
Real _children
Amount of time used by children.
Definition: PerfGraph.h:220
void pop()
Remove a Node from the end of the current scope.
Definition: PerfGraph.C:231
const PerfGraphSectionInfo & sectionInfo(const PerfID section_id) const
Given a PerfID return the PerfGraphSectionInfo The ID.
bool _disable_live_print
Whether or not live print is disabled (cannot be turned on again)
Definition: PerfGraph.h:334
void setColumnFormat(const std::vector< VariadicTableColumnFormat > &column_format)
Set how to format numbers for each column.
void print(const ConsoleStream &console, unsigned int level)
Print the tree out.
Definition: PerfGraph.C:406
std::array< PerfNode *, MOOSE_MAX_STACK_SIZE > _stack
The full callstack. Currently capped at a depth of 100.
Definition: PerfGraph.h:355
void printHeaviestBranch(const ConsoleStream &console)
Print out the heaviest branch through the tree.
Definition: PerfGraph.C:413
DIE A HORRIBLE DEATH HERE typedef LIBMESH_DEFAULT_SCALAR_TYPE Real
Real totalTimeAvg() const
The average time this Node plus all of its children took in seconds.
Definition: PerfNode.h:132
const std::string _root_name
Name of the root node.
Definition: PerfGraph.h:343
PerfID sectionID(const std::string &section_name) const
Given a name return the PerfID The name of the section.
Use to hold the cumulative time and memory for each section, which comes from all of the PerfNodes th...
Definition: PerfGraph.h:214
const libMesh::processor_id_type _pid
This processor id.
Definition: PerfGraph.h:340
long int _children_memory
Amount of memory gained by children.
Definition: PerfGraph.h:232
bool _destructing
Tell the print thread to teardown.
Definition: PerfGraph.h:390
void printHeaviestSections(const ConsoleStream &console, const unsigned int num_sections)
Print out the heaviest sections that were timed.
Definition: PerfGraph.C:420
const ConsoleStream _console
An instance of helper class to write streams to the Console objects.
void push(const PerfID id)
Add a Node onto the end of the end of the current callstack.
Definition: PerfGraph.C:185
FullTable treeTable(const unsigned int level, const bool heaviest=false)
Helper for building a VariadicTable that represents the tree.
Definition: PerfGraph.C:343
The PerfGraph will hold the master list of all registered performance segments and the head PerfNode...
Definition: PerfGraph.h:43
std::size_t convertBytes(std::size_t bytes, MemUnits unit)
convert bytes to selected unit prefix
Definition: MemoryUtils.C:174
PerfGraphRegistry & getPerfGraphRegistry()
Get the global PerfGraphRegistry singleton.
const PerfGraphSectionInfo & readSectionInfo(PerfID section_id) const
Special accessor just for PerfGraph so that no locking is needed in PerfGraph.
std::array< SectionIncrement, MAX_EXECUTION_LIST_SIZE > _execution_list
A circular buffer for holding the execution list, this is read by the printing loop.
Definition: PerfGraph.h:358
void enableLivePrint()
Enables Live Print.
Definition: PerfGraph.C:57
void dataStore(std::ostream &stream, PerfGraph &perf_graph, void *)
Definition: PerfGraph.C:490
const PerfID _root_node_id
The id for the root node.
Definition: PerfGraph.h:346
A node in the PerfGraph.
Definition: PerfNode.h:24