Line data Source code
1 : //* This file is part of the MOOSE framework
2 : //* https://mooseframework.inl.gov
3 : //*
4 : //* All rights reserved, see COPYRIGHT for full restrictions
5 : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 : //*
7 : //* Licensed under LGPL 2.1, please see LICENSE for details
8 : //* https://www.gnu.org/licenses/lgpl-2.1.html
9 :
10 : #include "PerfGraph.h"
11 :
12 : // MOOSE Includes
13 : #include "PerfGuard.h"
14 : #include "MooseError.h"
15 : #include "PerfGraphLivePrint.h"
16 : #include "PerfGraphRegistry.h"
17 : #include "MooseApp.h"
18 :
19 : // Note: do everything we can to make sure this only gets #included
20 : // in the .C file... this is a heavily templated header that we
21 : // don't want to expose to EVERY file in MOOSE...
22 : #include "VariadicTable.h"
23 :
24 : // System Includes
25 : #include <chrono>
26 : #include <memory>
27 :
28 66995 : PerfGraph::PerfGraph(const std::string & root_name,
29 : MooseApp & app,
30 : const bool live_all,
31 66995 : const bool perf_graph_live)
32 : : ConsoleStreamInterface(app),
33 66995 : _moose_app(app),
34 66995 : _live_print_all(live_all),
35 66995 : _disable_live_print(!perf_graph_live),
36 66995 : _perf_graph_registry(moose::internal::getPerfGraphRegistry()),
37 66995 : _pid(app.comm().rank()),
38 66995 : _root_name(root_name),
39 66995 : _root_node_id(_perf_graph_registry.registerSection(root_name, 0)),
40 66995 : _root_node(std::make_unique<PerfNode>(_root_node_id)),
41 66995 : _current_position(-1),
42 66995 : _stack(),
43 66995 : _execution_list_begin(0),
44 66995 : _execution_list_end(0),
45 66995 : _max_memory(0),
46 66995 : _active(true),
47 66995 : _destructing(false),
48 66995 : _live_print_time_limit(5.0),
49 66995 : _live_print_mem_limit(100),
50 334975 : _live_print(std::make_unique<PerfGraphLivePrint>(*this, app))
51 : {
52 66995 : push(_root_node_id);
53 66995 : }
54 :
55 62876 : PerfGraph::~PerfGraph() { disableLivePrint(); }
56 :
57 : void
58 66993 : PerfGraph::enableLivePrint()
59 : {
60 66993 : if (_pid == 0 && !_disable_live_print)
61 : {
62 : // Start the printing thread
63 4636 : _print_thread = std::thread([this] { this->_live_print->start(); });
64 : }
65 66993 : }
66 :
67 : void
68 63127 : PerfGraph::disableLivePrint()
69 : {
70 63127 : if (_pid == 0 && !_disable_live_print)
71 : {
72 : {
73 : // Unlike using atomics for execution_thread_end
74 : // here we actually lock to ensure that either the print thread
75 : // immediately sees that we are destructing or is immediately
76 : // notified with the below notification. Without doing this
77 : // it would be possible (but unlikely) for the print thread to
78 : // hang for 1 second at the end of execution (which would not be
79 : // good anytime you are running lots of fast calculations back-to-back
80 : // like during testing or stochastic sampling).
81 2315 : std::lock_guard<std::mutex> lock(_destructing_mutex);
82 2315 : _destructing = true;
83 2315 : }
84 :
85 2315 : _finished_section.notify_one();
86 :
87 2315 : _print_thread.join();
88 :
89 2315 : _disable_live_print = true;
90 : }
91 63127 : }
92 :
93 : Real
94 564 : PerfGraph::sectionData(const DataType type,
95 : const std::string & section_name,
96 : const bool must_exist /* = true */)
97 : {
98 564 : update();
99 :
100 : const auto section_it =
101 564 : _cumulative_section_info.find(section_name == "Root" ? _root_name : section_name);
102 :
103 564 : if (section_it == _cumulative_section_info.end())
104 : {
105 17 : if (!must_exist || // isn't required to exist
106 3 : _perf_graph_registry.sectionExists(section_name) // or, is required to exist and it does
107 : )
108 11 : return 0;
109 :
110 3 : mooseError("Unknown PerfGraph section name \"",
111 : section_name,
112 : "\" in PerfGraph::sectionData().\nIf you are attempting to retrieve the root use "
113 : "\"Root\".");
114 : }
115 :
116 550 : const CumulativeSectionInfo & section_info = section_it->second;
117 :
118 550 : if (type == CALLS)
119 62 : return section_info._num_calls;
120 :
121 488 : const auto app_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
122 :
123 488 : switch (type)
124 : {
125 31 : case SELF:
126 31 : return section_info._self;
127 11 : case CHILDREN:
128 11 : return section_info._children;
129 267 : case TOTAL:
130 267 : return section_info._total;
131 11 : case SELF_AVG:
132 11 : return section_info._self / static_cast<Real>(section_info._num_calls);
133 11 : case CHILDREN_AVG:
134 11 : return section_info._children / static_cast<Real>(section_info._num_calls);
135 51 : case TOTAL_AVG:
136 51 : return section_info._total / static_cast<Real>(section_info._num_calls);
137 31 : case SELF_PERCENT:
138 31 : return 100. * (section_info._self / app_time);
139 11 : case CHILDREN_PERCENT:
140 11 : return 100. * (section_info._children / app_time);
141 31 : case TOTAL_PERCENT:
142 31 : return 100. * (section_info._total / app_time);
143 11 : case SELF_MEMORY:
144 11 : return section_info._self_memory;
145 11 : case CHILDREN_MEMORY:
146 11 : return section_info._children_memory;
147 11 : case TOTAL_MEMORY:
148 11 : return section_info._total_memory;
149 0 : default:
150 0 : ::mooseError("Unknown DataType");
151 : }
152 : }
153 :
154 : void
155 406452 : PerfGraph::addToExecutionList(const PerfID id,
156 : const IncrementState state,
157 : const std::chrono::time_point<std::chrono::steady_clock> time,
158 : const long int memory)
159 : {
160 406452 : auto & section_increment = _execution_list[_execution_list_end];
161 :
162 406452 : section_increment._id = id;
163 406452 : section_increment._state = state;
164 406452 : section_increment._time = time;
165 406452 : section_increment._memory = memory;
166 406452 : section_increment._beginning_num_printed = _console.numPrinted();
167 :
168 : // A note about this next section of code:
169 : // It is only EVER run on the main thread - and therefore there can be
170 : // no race conditions. All that is important here is that the print
171 : // thread always sees a consistent value for _execution_list_end
172 406452 : auto next_execution_list_end = _execution_list_end + 1;
173 :
174 : // Are we at the end of our circular buffer?
175 406452 : if (next_execution_list_end >= MAX_EXECUTION_LIST_SIZE)
176 0 : next_execution_list_end = 0;
177 :
178 : // This "release" will synchronize the above memory changes with the
179 : // "acquire" in the printing thread
180 : // All of the above memory operations will be seen by the
181 : // printing thread before the printing thread sees this new value
182 406452 : _execution_list_end.store(next_execution_list_end, std::memory_order_release);
183 406452 : }
184 :
185 : void
186 85909669 : PerfGraph::push(const PerfID id)
187 : {
188 85909669 : if (!_active)
189 3397 : return;
190 :
191 85906272 : PerfNode * new_node = nullptr;
192 :
193 85906272 : if (id == _root_node_id)
194 66995 : new_node = _root_node.get();
195 : else
196 85839277 : new_node = _stack[_current_position]->getChild(id);
197 :
198 : MemoryUtils::Stats stats;
199 85906272 : auto memory_success = MemoryUtils::getMemoryStats(stats);
200 :
201 85906272 : long int start_memory = 0;
202 :
203 85906272 : if (memory_success)
204 : {
205 : const auto memory_mb =
206 85906272 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
207 85906272 : start_memory = memory_mb;
208 85906272 : updateMaxMemory(memory_mb);
209 : }
210 : // If we weren't able to get the memory stats, let's just use the parent's
211 0 : else if (_current_position != -1)
212 0 : start_memory = _stack[_current_position]->startMemory();
213 :
214 : // Set the start time
215 85906272 : auto current_time = std::chrono::steady_clock::now();
216 :
217 85906272 : new_node->setStartTimeAndMemory(current_time, start_memory);
218 :
219 : // Increment the number of calls
220 85906272 : new_node->incrementNumCalls();
221 :
222 85906272 : _current_position++;
223 :
224 85906272 : if (_current_position >= MOOSE_MAX_STACK_SIZE)
225 0 : mooseError("PerfGraph is out of stack space!");
226 :
227 85906272 : _stack[_current_position] = new_node;
228 :
229 : // Add this to the execution list unless the message is empty - but pre-emted by live_print_all
230 86497324 : if ((_pid == 0 && !_disable_live_print) &&
231 591052 : (!_perf_graph_registry.readSectionInfo(id)._live_message.empty() || _live_print_all))
232 203423 : addToExecutionList(id, IncrementState::STARTED, current_time, start_memory);
233 : }
234 :
235 : void
236 85828289 : PerfGraph::pop()
237 : {
238 85828289 : if (!_active)
239 3433 : return;
240 :
241 85824856 : auto current_time = std::chrono::steady_clock::now();
242 :
243 85824856 : auto & current_node = _stack[_current_position];
244 :
245 : MemoryUtils::Stats stats;
246 85824856 : auto memory_success = MemoryUtils::getMemoryStats(stats);
247 :
248 85824856 : long int current_memory = 0;
249 :
250 85824856 : if (memory_success)
251 : {
252 : const auto memory_mb =
253 85824856 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
254 85824856 : current_memory = memory_mb;
255 85824856 : updateMaxMemory(memory_mb);
256 : }
257 : // If we weren't able to get the memory stats, let's just use the start memory
258 0 : else if (_current_position !=
259 : -1) // If we weren't able to get the memory stats, let's just use the start memory
260 0 : current_memory = _stack[_current_position]->startMemory();
261 :
262 85824856 : current_node->addTimeAndMemory(current_time, current_memory);
263 :
264 85824856 : _current_position--;
265 :
266 : // Add this to the exection list
267 86412996 : if ((_pid == 0 && !_disable_live_print) &&
268 588140 : (!_perf_graph_registry.readSectionInfo(current_node->id())._live_message.empty() ||
269 385111 : _live_print_all))
270 : {
271 203029 : addToExecutionList(current_node->id(), IncrementState::FINISHED, current_time, current_memory);
272 :
273 : // Tell the printing thread that a section has finished
274 : //
275 : // Note: no mutex is needed here because we're using an atomic
276 : // in the predicate of the condition_variable in the thread
277 : // This is technically correct - but there is a chance of missing a signal
278 : // For us - that chance is low and doesn't matter (the timeout will just be hit
279 : // instead). So - I would rather not have an extra lock here in the main thread.
280 203029 : _finished_section.notify_one();
281 : }
282 : }
283 :
284 : void
285 112801 : PerfGraph::update()
286 : {
287 : // First update all of the currently running nodes
288 112801 : auto now = std::chrono::steady_clock::now();
289 :
290 : MemoryUtils::Stats stats;
291 112801 : MemoryUtils::getMemoryStats(stats);
292 : auto now_memory =
293 112801 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
294 :
295 650014 : for (int i = 0; i <= _current_position; i++)
296 : {
297 537213 : auto node = _stack[i];
298 537213 : node->addTimeAndMemory(now, now_memory);
299 537213 : node->setStartTimeAndMemory(now, now_memory);
300 : }
301 :
302 : // Zero out the entries
303 7145900 : for (auto & section_time_it : _cumulative_section_info)
304 : {
305 7033099 : auto & section_time = section_time_it.second;
306 :
307 7033099 : section_time._num_calls = 0;
308 7033099 : section_time._self = 0.;
309 7033099 : section_time._children = 0.;
310 7033099 : section_time._total = 0.;
311 7033099 : section_time._self_memory = 0;
312 7033099 : section_time._children_memory = 0;
313 7033099 : section_time._total_memory = 0.;
314 : }
315 :
316 112801 : recursivelyUpdate(*_root_node);
317 :
318 : // Update vector pointing to section times
319 : // Note: we are doing this _after_ recursively filling
320 : // because new entries may have been created
321 112801 : _cumulative_section_info_ptrs.resize(_perf_graph_registry.numSections());
322 :
323 14813216 : for (auto & section_time_it : _cumulative_section_info)
324 : {
325 14700415 : auto id = _perf_graph_registry.sectionID(section_time_it.first);
326 :
327 14700415 : _cumulative_section_info_ptrs[id] = §ion_time_it.second;
328 : }
329 112801 : }
330 :
331 : void
332 17539997 : PerfGraph::recursivelyUpdate(const PerfNode & current_node)
333 : {
334 17539997 : const auto & section_name = _perf_graph_registry.readSectionInfo(current_node.id())._name;
335 :
336 : // RHS insertion on purpose
337 17539997 : auto & section_time = _cumulative_section_info[section_name];
338 :
339 17539997 : section_time._self += current_node.selfTimeSec();
340 17539997 : section_time._children += current_node.childrenTimeSec();
341 17539997 : section_time._total += current_node.totalTimeSec();
342 17539997 : section_time._num_calls += current_node.numCalls();
343 :
344 17539997 : section_time._self_memory += current_node.selfMemory();
345 17539997 : section_time._children_memory += current_node.childrenMemory();
346 17539997 : section_time._total_memory += current_node.totalMemory();
347 :
348 34967193 : for (auto & child_it : current_node.children())
349 17427196 : recursivelyUpdate(*child_it.second);
350 17539997 : }
351 :
352 : PerfGraph::FullTable
353 53346 : PerfGraph::treeTable(const unsigned int level, const bool heaviest /* = false */)
354 : {
355 53346 : update();
356 :
357 : FullTable vtable({"Section",
358 : "Calls",
359 : "Self(s)",
360 : "Avg(s)",
361 : "%",
362 : "Mem(MB)",
363 : "Total(s)",
364 : "Avg(s)",
365 : "%",
366 : "Mem(MB)"},
367 106692 : 10);
368 :
369 106692 : vtable.setColumnFormat({
370 : VariadicTableColumnFormat::AUTO, // Section Name
371 : VariadicTableColumnFormat::AUTO, // Calls
372 : VariadicTableColumnFormat::FIXED, // Self
373 : VariadicTableColumnFormat::FIXED, // Avg.
374 : VariadicTableColumnFormat::PERCENT, // %
375 : VariadicTableColumnFormat::AUTO, // Memory
376 : VariadicTableColumnFormat::FIXED, // Total
377 : VariadicTableColumnFormat::FIXED, // Avg.
378 : VariadicTableColumnFormat::PERCENT, // %
379 : VariadicTableColumnFormat::AUTO, // Memory
380 : });
381 :
382 106692 : vtable.setColumnPrecision({
383 : 1, // Section Name
384 : 0, // Calls
385 : 3, // Self
386 : 3, // Avg.
387 : 2, // %
388 : 0, // Memory
389 : 3, // Total
390 : 3, // Avg.
391 : 2, // %
392 : 0, // Memory
393 : });
394 :
395 666378 : auto act = [this, &vtable](const PerfNode & node,
396 : const moose::internal::PerfGraphSectionInfo & section_info,
397 : const unsigned int depth)
398 : {
399 2665512 : vtable.addRow(std::string(depth * 2, ' ') + section_info._name, // Section Name
400 : node.numCalls(), // Calls
401 : node.selfTimeSec(), // Self
402 : node.selfTimeAvg(), // Avg.
403 666378 : 100. * node.selfTimeSec() / _root_node->totalTimeSec(), // %
404 : node.selfMemory(), // Memory
405 : node.totalTimeSec(), // Total
406 : node.totalTimeAvg(), // Avg.
407 666378 : 100. * node.totalTimeSec() / _root_node->totalTimeSec(), // %
408 : node.totalMemory()); // Memory
409 666378 : };
410 53346 : treeRecurse(act, level, heaviest);
411 :
412 106692 : return vtable;
413 0 : }
414 :
415 : void
416 53326 : PerfGraph::print(const ConsoleStream & console, unsigned int level)
417 : {
418 53326 : console << "\nPerformance Graph:\n";
419 53326 : treeTable(level).print(console);
420 53326 : }
421 :
422 : void
423 20 : PerfGraph::printHeaviestBranch(const ConsoleStream & console)
424 : {
425 20 : console << "\nHeaviest Branch:\n";
426 20 : treeTable(MOOSE_MAX_STACK_SIZE, /* heaviest = */ true).print(console);
427 20 : }
428 :
429 : void
430 20 : PerfGraph::printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections)
431 : {
432 20 : update();
433 :
434 20 : console << "\nHeaviest Sections:\n";
435 :
436 : // Indirect Sort The Self Time
437 20 : std::vector<size_t> sorted;
438 20 : Moose::indirectSort(_cumulative_section_info_ptrs.begin(),
439 : _cumulative_section_info_ptrs.end(),
440 : sorted,
441 17933 : [](CumulativeSectionInfo * lhs, CumulativeSectionInfo * rhs)
442 : {
443 17933 : if (lhs && rhs)
444 17761 : return lhs->_self > rhs->_self;
445 :
446 : // If the LHS exists - it's definitely bigger than a non-existant RHS
447 172 : if (lhs)
448 64 : return true;
449 :
450 : // Both don't exist - so it doesn't matter how we sort them
451 108 : return false;
452 : });
453 :
454 40 : HeaviestTable vtable({"Section", "Calls", "Self(s)", "Avg.", "%", "Mem(MB)"}, 10);
455 :
456 40 : vtable.setColumnFormat({VariadicTableColumnFormat::AUTO, // Section; doesn't matter
457 : VariadicTableColumnFormat::AUTO, // Calls
458 : VariadicTableColumnFormat::FIXED, // Time
459 : VariadicTableColumnFormat::FIXED, // Avg.
460 : VariadicTableColumnFormat::PERCENT, // Percent
461 : VariadicTableColumnFormat::AUTO} // Memory
462 : );
463 :
464 40 : vtable.setColumnPrecision({
465 : 1, // Section
466 : 1, // Calls
467 : 3, // Time
468 : 3, // Avg.
469 : 2, // Percent
470 : 1 // Memory
471 : });
472 :
473 : mooseAssert(!_cumulative_section_info_ptrs.empty(),
474 : "update() must be run before printHeaviestSections()!");
475 :
476 : // The total time of the root node
477 20 : auto total_root_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
478 :
479 : // Now print out the largest ones
480 165 : for (unsigned int i = 0; i < num_sections; i++)
481 : {
482 145 : auto id = sorted[i];
483 :
484 145 : if (!_cumulative_section_info_ptrs[id])
485 0 : continue;
486 :
487 145 : const auto & entry = *_cumulative_section_info_ptrs[id];
488 145 : vtable.addRow(_perf_graph_registry.sectionInfo(id)._name, // Section
489 145 : entry._num_calls, // Calls
490 145 : entry._self, // Time
491 145 : entry._self / static_cast<Real>(entry._num_calls), // Avg.
492 145 : 100. * entry._self / total_root_time, // Percent
493 145 : entry._self_memory); // Memory
494 : }
495 :
496 20 : vtable.print(console);
497 20 : }
498 :
499 : void
500 171731128 : PerfGraph::updateMaxMemory(const std::size_t current_memory)
501 : {
502 : // We shouldn't need to lock _max_memory in-between getting
503 : // it and setting it as this should only ever be set outside
504 : // of threads. _max_memory is an atomic so that getting it
505 : // is thread safe.
506 171731128 : if (current_memory > getMaxMemory())
507 879843 : _max_memory = current_memory;
508 171731128 : }
509 :
510 : void
511 45637 : dataStore(std::ostream & stream, PerfGraph & perf_graph, void *)
512 : {
513 : // We need to store the registry id -> section info map so that we can add
514 : // registered sections that may not be added yet during recover
515 45637 : dataStore(stream, perf_graph._perf_graph_registry._id_to_item, nullptr);
516 :
517 : // Update before serializing the nodes so that the time/memory/calls are correct
518 45637 : perf_graph.update();
519 :
520 : // Recursively serialize all of the nodes
521 45637 : dataStore(stream, perf_graph._root_node, nullptr);
522 45637 : }
523 :
524 : void
525 13192 : dataLoad(std::istream & stream, PerfGraph & perf_graph, void *)
526 : {
527 : // Load in all of the recovered sections and register those that do not exist yet
528 13192 : std::vector<moose::internal::PerfGraphSectionInfo> recovered_section_info;
529 13192 : dataLoad(stream, recovered_section_info, nullptr);
530 2149055 : for (const auto & info : recovered_section_info)
531 : {
532 2135863 : if (info._live_message.size())
533 942117 : perf_graph._perf_graph_registry.registerSection(
534 942117 : info._name, info._level, info._live_message, info._print_dots);
535 : else
536 1193746 : perf_graph._perf_graph_registry.registerSection(info._name, info._level);
537 : }
538 :
539 : // Update the current node time/memory/calls before loading the nodes as the load
540 : // will append information to current nodes that exist
541 13192 : perf_graph.update();
542 :
543 : // Recursively load all of the nodes; this will append information to matching nodes
544 : // and will create new nodes for section paths that do not exist
545 13192 : dataLoad(stream, perf_graph._root_node, &perf_graph);
546 13192 : }
|