Line data Source code
1 : //* This file is part of the MOOSE framework
2 : //* https://mooseframework.inl.gov
3 : //*
4 : //* All rights reserved, see COPYRIGHT for full restrictions
5 : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 : //*
7 : //* Licensed under LGPL 2.1, please see LICENSE for details
8 : //* https://www.gnu.org/licenses/lgpl-2.1.html
9 :
10 : #include "PerfGraph.h"
11 :
12 : // MOOSE Includes
13 : #include "PerfGuard.h"
14 : #include "MooseError.h"
15 : #include "PerfGraphLivePrint.h"
16 : #include "PerfGraphRegistry.h"
17 : #include "MooseApp.h"
18 :
19 : // Note: do everything we can to make sure this only gets #included
20 : // in the .C file... this is a heavily templated header that we
21 : // don't want to expose to EVERY file in MOOSE...
22 : #include "VariadicTable.h"
23 :
24 : // System Includes
25 : #include <chrono>
26 : #include <memory>
27 :
28 62755 : PerfGraph::PerfGraph(const std::string & root_name,
29 : MooseApp & app,
30 : const bool live_all,
31 62755 : const bool perf_graph_live)
32 : : ConsoleStreamInterface(app),
33 62755 : _moose_app(app),
34 62755 : _live_print_all(live_all),
35 62755 : _disable_live_print(!perf_graph_live),
36 62755 : _perf_graph_registry(moose::internal::getPerfGraphRegistry()),
37 62755 : _pid(app.comm().rank()),
38 62755 : _root_name(root_name),
39 62755 : _root_node_id(_perf_graph_registry.registerSection(root_name, 0)),
40 62755 : _root_node(std::make_unique<PerfNode>(_root_node_id)),
41 62755 : _current_position(-1),
42 62755 : _stack(),
43 62755 : _execution_list_begin(0),
44 62755 : _execution_list_end(0),
45 62755 : _active(true),
46 62755 : _destructing(false),
47 62755 : _live_print_time_limit(5.0),
48 62755 : _live_print_mem_limit(100),
49 251020 : _live_print(std::make_unique<PerfGraphLivePrint>(*this, app))
50 : {
51 62755 : push(_root_node_id);
52 62755 : }
53 :
54 57254 : PerfGraph::~PerfGraph() { disableLivePrint(); }
55 :
56 : void
57 62754 : PerfGraph::enableLivePrint()
58 : {
59 62754 : if (_pid == 0 && !_disable_live_print)
60 : {
61 : // Start the printing thread
62 97486 : _print_thread = std::thread([this] { this->_live_print->start(); });
63 : }
64 62754 : }
65 :
66 : void
67 60373 : PerfGraph::disableLivePrint()
68 : {
69 60373 : if (_pid == 0 && !_disable_live_print)
70 : {
71 : {
72 : // Unlike using atomics for execution_thread_end
73 : // here we actually lock to ensure that either the print thread
74 : // immediately sees that we are destructing or is immediately
75 : // notified with the below notification. Without doing this
76 : // it would be possible (but unlikely) for the print thread to
77 : // hang for 1 second at the end of execution (which would not be
78 : // good anytime you are running lots of fast calculations back-to-back
79 : // like during testing or stochastic sampling).
80 43334 : std::lock_guard<std::mutex> lock(_destructing_mutex);
81 43334 : _destructing = true;
82 43334 : }
83 :
84 43334 : _finished_section.notify_one();
85 :
86 43334 : _print_thread.join();
87 :
88 43334 : _disable_live_print = true;
89 : }
90 60373 : }
91 :
92 : Real
93 592 : PerfGraph::sectionData(const DataType type,
94 : const std::string & section_name,
95 : const bool must_exist /* = true */)
96 : {
97 592 : update();
98 :
99 : const auto section_it =
100 592 : _cumulative_section_info.find(section_name == "Root" ? _root_name : section_name);
101 :
102 592 : if (section_it == _cumulative_section_info.end())
103 : {
104 19 : if (!must_exist || // isn't required to exist
105 4 : _perf_graph_registry.sectionExists(section_name) // or, is required to exist and it does
106 : )
107 11 : return 0;
108 :
109 4 : mooseError("Unknown PerfGraph section name \"",
110 : section_name,
111 : "\" in PerfGraph::sectionData().\nIf you are attempting to retrieve the root use "
112 : "\"Root\".");
113 : }
114 :
115 577 : const CumulativeSectionInfo & section_info = section_it->second;
116 :
117 577 : if (type == CALLS)
118 64 : return section_info._num_calls;
119 :
120 513 : const auto app_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
121 :
122 513 : switch (type)
123 : {
124 32 : case SELF:
125 32 : return section_info._self;
126 11 : case CHILDREN:
127 11 : return section_info._children;
128 287 : case TOTAL:
129 287 : return section_info._total;
130 11 : case SELF_AVG:
131 11 : return section_info._self / static_cast<Real>(section_info._num_calls);
132 11 : case CHILDREN_AVG:
133 11 : return section_info._children / static_cast<Real>(section_info._num_calls);
134 53 : case TOTAL_AVG:
135 53 : return section_info._total / static_cast<Real>(section_info._num_calls);
136 32 : case SELF_PERCENT:
137 32 : return 100. * (section_info._self / app_time);
138 11 : case CHILDREN_PERCENT:
139 11 : return 100. * (section_info._children / app_time);
140 32 : case TOTAL_PERCENT:
141 32 : return 100. * (section_info._total / app_time);
142 11 : case SELF_MEMORY:
143 11 : return section_info._self_memory;
144 11 : case CHILDREN_MEMORY:
145 11 : return section_info._children_memory;
146 11 : case TOTAL_MEMORY:
147 11 : return section_info._total_memory;
148 0 : default:
149 0 : ::mooseError("Unknown DataType");
150 : }
151 : }
152 :
153 : void
154 50760578 : PerfGraph::addToExecutionList(const PerfID id,
155 : const IncrementState state,
156 : const std::chrono::time_point<std::chrono::steady_clock> time,
157 : const long int memory)
158 : {
159 50760578 : auto & section_increment = _execution_list[_execution_list_end];
160 :
161 50760578 : section_increment._id = id;
162 50760578 : section_increment._state = state;
163 50760578 : section_increment._time = time;
164 50760578 : section_increment._memory = memory;
165 50760578 : section_increment._beginning_num_printed = _console.numPrinted();
166 :
167 : // A note about this next section of code:
168 : // It is only EVER run on the main thread - and therefore there can be
169 : // no race conditions. All that is important here is that the print
170 : // thread always sees a consistent value for _execution_list_end
171 50760578 : auto next_execution_list_end = _execution_list_end + 1;
172 :
173 : // Are we at the end of our circular buffer?
174 50760578 : if (next_execution_list_end >= MAX_EXECUTION_LIST_SIZE)
175 1652 : next_execution_list_end = 0;
176 :
177 : // This "release" will synchronize the above memory changes with the
178 : // "acquire" in the printing thread
179 : // All of the above memory operations will be seen by the
180 : // printing thread before the printing thread sees this new value
181 50760578 : _execution_list_end.store(next_execution_list_end, std::memory_order_release);
182 50760578 : }
183 :
184 : void
185 93319643 : PerfGraph::push(const PerfID id)
186 : {
187 93319643 : if (!_active)
188 4374 : return;
189 :
190 93315269 : PerfNode * new_node = nullptr;
191 :
192 93315269 : if (id == _root_node_id)
193 62755 : new_node = _root_node.get();
194 : else
195 93252514 : new_node = _stack[_current_position]->getChild(id);
196 :
197 : MemoryUtils::Stats stats;
198 93315269 : auto memory_success = MemoryUtils::getMemoryStats(stats);
199 :
200 93315269 : long int start_memory = 0;
201 :
202 93315269 : if (memory_success)
203 93315269 : start_memory =
204 93315269 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
205 0 : else if (_current_position !=
206 : -1) // If we weren't able to get the memory stats, let's just use the parent's
207 0 : start_memory = _stack[_current_position]->startMemory();
208 :
209 : // Set the start time
210 93315269 : auto current_time = std::chrono::steady_clock::now();
211 :
212 93315269 : new_node->setStartTimeAndMemory(current_time, start_memory);
213 :
214 : // Increment the number of calls
215 93315269 : new_node->incrementNumCalls();
216 :
217 93315269 : _current_position++;
218 :
219 93315269 : if (_current_position >= MOOSE_MAX_STACK_SIZE)
220 0 : mooseError("PerfGraph is out of stack space!");
221 :
222 93315269 : _stack[_current_position] = new_node;
223 :
224 : // Add this to the execution list unless the message is empty - but pre-emted by live_print_all
225 160637963 : if ((_pid == 0 && !_disable_live_print) &&
226 67322694 : (!_perf_graph_registry.readSectionInfo(id)._live_message.empty() || _live_print_all))
227 25385355 : addToExecutionList(id, IncrementState::STARTED, current_time, start_memory);
228 : }
229 :
230 : void
231 93239038 : PerfGraph::pop()
232 : {
233 93239038 : if (!_active)
234 4414 : return;
235 :
236 93234624 : auto current_time = std::chrono::steady_clock::now();
237 :
238 93234624 : auto & current_node = _stack[_current_position];
239 :
240 : MemoryUtils::Stats stats;
241 93234624 : auto memory_success = MemoryUtils::getMemoryStats(stats);
242 :
243 93234624 : long int current_memory = 0;
244 :
245 93234624 : if (memory_success)
246 93234624 : current_memory =
247 93234624 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
248 0 : else if (_current_position !=
249 : -1) // If we weren't able to get the memory stats, let's just use the start memory
250 0 : current_memory = _stack[_current_position]->startMemory();
251 :
252 93234624 : current_node->addTimeAndMemory(current_time, current_memory);
253 :
254 93234624 : _current_position--;
255 :
256 : // Add this to the exection list
257 160484864 : if ((_pid == 0 && !_disable_live_print) &&
258 67250240 : (!_perf_graph_registry.readSectionInfo(current_node->id())._live_message.empty() ||
259 41875017 : _live_print_all))
260 : {
261 25375223 : addToExecutionList(current_node->id(), IncrementState::FINISHED, current_time, current_memory);
262 :
263 : // Tell the printing thread that a section has finished
264 : //
265 : // Note: no mutex is needed here because we're using an atomic
266 : // in the predicate of the condition_variable in the thread
267 : // This is technically correct - but there is a chance of missing a signal
268 : // For us - that chance is low and doesn't matter (the timeout will just be hit
269 : // instead). So - I would rather not have an extra lock here in the main thread.
270 25375223 : _finished_section.notify_one();
271 : }
272 : }
273 :
274 : void
275 103063 : PerfGraph::update()
276 : {
277 : // First update all of the currently running nodes
278 103063 : auto now = std::chrono::steady_clock::now();
279 :
280 : MemoryUtils::Stats stats;
281 103063 : MemoryUtils::getMemoryStats(stats);
282 : auto now_memory =
283 103063 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
284 :
285 584022 : for (int i = 0; i <= _current_position; i++)
286 : {
287 480959 : auto node = _stack[i];
288 480959 : node->addTimeAndMemory(now, now_memory);
289 480959 : node->setStartTimeAndMemory(now, now_memory);
290 : }
291 :
292 : // Zero out the entries
293 6572721 : for (auto & section_time_it : _cumulative_section_info)
294 : {
295 6469658 : auto & section_time = section_time_it.second;
296 :
297 6469658 : section_time._num_calls = 0;
298 6469658 : section_time._self = 0.;
299 6469658 : section_time._children = 0.;
300 6469658 : section_time._total = 0.;
301 6469658 : section_time._self_memory = 0;
302 6469658 : section_time._children_memory = 0;
303 6469658 : section_time._total_memory = 0.;
304 : }
305 :
306 103063 : recursivelyUpdate(*_root_node);
307 :
308 : // Update vector pointing to section times
309 : // Note: we are doing this _after_ recursively filling
310 : // because new entries may have been created
311 103063 : _cumulative_section_info_ptrs.resize(_perf_graph_registry.numSections());
312 :
313 13361592 : for (auto & section_time_it : _cumulative_section_info)
314 : {
315 13258529 : auto id = _perf_graph_registry.sectionID(section_time_it.first);
316 :
317 13258529 : _cumulative_section_info_ptrs[id] = §ion_time_it.second;
318 : }
319 103063 : }
320 :
321 : void
322 16198262 : PerfGraph::recursivelyUpdate(const PerfNode & current_node)
323 : {
324 16198262 : const auto & section_name = _perf_graph_registry.readSectionInfo(current_node.id())._name;
325 :
326 : // RHS insertion on purpose
327 16198262 : auto & section_time = _cumulative_section_info[section_name];
328 :
329 16198262 : section_time._self += current_node.selfTimeSec();
330 16198262 : section_time._children += current_node.childrenTimeSec();
331 16198262 : section_time._total += current_node.totalTimeSec();
332 16198262 : section_time._num_calls += current_node.numCalls();
333 :
334 16198262 : section_time._self_memory += current_node.selfMemory();
335 16198262 : section_time._children_memory += current_node.childrenMemory();
336 16198262 : section_time._total_memory += current_node.totalMemory();
337 :
338 32293461 : for (auto & child_it : current_node.children())
339 16095199 : recursivelyUpdate(*child_it.second);
340 16198262 : }
341 :
342 : PerfGraph::FullTable
343 47826 : PerfGraph::treeTable(const unsigned int level, const bool heaviest /* = false */)
344 : {
345 47826 : update();
346 :
347 : FullTable vtable({"Section",
348 : "Calls",
349 : "Self(s)",
350 : "Avg(s)",
351 : "%",
352 : "Mem(MB)",
353 : "Total(s)",
354 : "Avg(s)",
355 : "%",
356 : "Mem(MB)"},
357 526086 : 10);
358 :
359 47826 : vtable.setColumnFormat({
360 : VariadicTableColumnFormat::AUTO, // Section Name
361 : VariadicTableColumnFormat::AUTO, // Calls
362 : VariadicTableColumnFormat::FIXED, // Self
363 : VariadicTableColumnFormat::FIXED, // Avg.
364 : VariadicTableColumnFormat::PERCENT, // %
365 : VariadicTableColumnFormat::AUTO, // Memory
366 : VariadicTableColumnFormat::FIXED, // Total
367 : VariadicTableColumnFormat::FIXED, // Avg.
368 : VariadicTableColumnFormat::PERCENT, // %
369 : VariadicTableColumnFormat::AUTO, // Memory
370 : });
371 :
372 47826 : vtable.setColumnPrecision({
373 : 1, // Section Name
374 : 0, // Calls
375 : 3, // Self
376 : 3, // Avg.
377 : 2, // %
378 : 0, // Memory
379 : 3, // Total
380 : 3, // Avg.
381 : 2, // %
382 : 0, // Memory
383 : });
384 :
385 809387 : auto act = [this, &vtable](const PerfNode & node,
386 : const moose::internal::PerfGraphSectionInfo & section_info,
387 2428161 : const unsigned int depth)
388 : {
389 2428161 : vtable.addRow(std::string(depth * 2, ' ') + section_info._name, // Section Name
390 : node.numCalls(), // Calls
391 : node.selfTimeSec(), // Self
392 : node.selfTimeAvg(), // Avg.
393 809387 : 100. * node.selfTimeSec() / _root_node->totalTimeSec(), // %
394 : node.selfMemory(), // Memory
395 : node.totalTimeSec(), // Total
396 : node.totalTimeAvg(), // Avg.
397 809387 : 100. * node.totalTimeSec() / _root_node->totalTimeSec(), // %
398 : node.totalMemory()); // Memory
399 809387 : };
400 47826 : treeRecurse(act, level, heaviest);
401 :
402 95652 : return vtable;
403 95652 : }
404 :
405 : void
406 47805 : PerfGraph::print(const ConsoleStream & console, unsigned int level)
407 : {
408 47805 : console << "\nPerformance Graph:\n";
409 47805 : treeTable(level).print(console);
410 47805 : }
411 :
412 : void
413 21 : PerfGraph::printHeaviestBranch(const ConsoleStream & console)
414 : {
415 21 : console << "\nHeaviest Branch:\n";
416 21 : treeTable(MOOSE_MAX_STACK_SIZE, /* heaviest = */ true).print(console);
417 21 : }
418 :
419 : void
420 21 : PerfGraph::printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections)
421 : {
422 21 : update();
423 :
424 21 : console << "\nHeaviest Sections:\n";
425 :
426 : // Indirect Sort The Self Time
427 21 : std::vector<size_t> sorted;
428 21 : Moose::indirectSort(_cumulative_section_info_ptrs.begin(),
429 : _cumulative_section_info_ptrs.end(),
430 : sorted,
431 23912 : [](CumulativeSectionInfo * lhs, CumulativeSectionInfo * rhs)
432 : {
433 23912 : if (lhs && rhs)
434 23715 : return lhs->_self > rhs->_self;
435 :
436 : // If the LHS exists - it's definitely bigger than a non-existant RHS
437 197 : if (lhs)
438 117 : return true;
439 :
440 : // Both don't exist - so it doesn't matter how we sort them
441 80 : return false;
442 : });
443 :
444 147 : HeaviestTable vtable({"Section", "Calls", "Self(s)", "Avg.", "%", "Mem(MB)"}, 10);
445 :
446 21 : vtable.setColumnFormat({VariadicTableColumnFormat::AUTO, // Section; doesn't matter
447 : VariadicTableColumnFormat::AUTO, // Calls
448 : VariadicTableColumnFormat::FIXED, // Time
449 : VariadicTableColumnFormat::FIXED, // Avg.
450 : VariadicTableColumnFormat::PERCENT, // Percent
451 : VariadicTableColumnFormat::AUTO} // Memory
452 : );
453 :
454 21 : vtable.setColumnPrecision({
455 : 1, // Section
456 : 1, // Calls
457 : 3, // Time
458 : 3, // Avg.
459 : 2, // Percent
460 : 1 // Memory
461 : });
462 :
463 : mooseAssert(!_cumulative_section_info_ptrs.empty(),
464 : "update() must be run before printHeaviestSections()!");
465 :
466 : // The total time of the root node
467 21 : auto total_root_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
468 :
469 : // Now print out the largest ones
470 176 : for (unsigned int i = 0; i < num_sections; i++)
471 : {
472 155 : auto id = sorted[i];
473 :
474 155 : if (!_cumulative_section_info_ptrs[id])
475 0 : continue;
476 :
477 155 : const auto & entry = *_cumulative_section_info_ptrs[id];
478 155 : vtable.addRow(_perf_graph_registry.sectionInfo(id)._name, // Section
479 155 : entry._num_calls, // Calls
480 155 : entry._self, // Time
481 155 : entry._self / static_cast<Real>(entry._num_calls), // Avg.
482 155 : 100. * entry._self / total_root_time, // Percent
483 155 : entry._self_memory); // Memory
484 : }
485 :
486 21 : vtable.print(console);
487 63 : }
488 :
489 : void
490 41839 : dataStore(std::ostream & stream, PerfGraph & perf_graph, void *)
491 : {
492 : // We need to store the registry id -> section info map so that we can add
493 : // registered sections that may not be added yet during recover
494 41839 : dataStore(stream, perf_graph._perf_graph_registry._id_to_item, nullptr);
495 :
496 : // Update before serializing the nodes so that the time/memory/calls are correct
497 41839 : perf_graph.update();
498 :
499 : // Recursively serialize all of the nodes
500 41839 : dataStore(stream, perf_graph._root_node, nullptr);
501 41839 : }
502 :
503 : void
504 12748 : dataLoad(std::istream & stream, PerfGraph & perf_graph, void *)
505 : {
506 : // Load in all of the recovered sections and register those that do not exist yet
507 12748 : std::vector<moose::internal::PerfGraphSectionInfo> recovered_section_info;
508 12748 : dataLoad(stream, recovered_section_info, nullptr);
509 2050459 : for (const auto & info : recovered_section_info)
510 : {
511 2037711 : if (info._live_message.size())
512 904203 : perf_graph._perf_graph_registry.registerSection(
513 904203 : info._name, info._level, info._live_message, info._print_dots);
514 : else
515 1133508 : perf_graph._perf_graph_registry.registerSection(info._name, info._level);
516 : }
517 :
518 : // Update the current node time/memory/calls before loading the nodes as the load
519 : // will append information to current nodes that exist
520 12748 : perf_graph.update();
521 :
522 : // Recursively load all of the nodes; this will append information to matching nodes
523 : // and will create new nodes for section paths that do not exist
524 12748 : dataLoad(stream, perf_graph._root_node, &perf_graph);
525 12748 : }
|