Line data Source code
1 : //* This file is part of the MOOSE framework
2 : //* https://mooseframework.inl.gov
3 : //*
4 : //* All rights reserved, see COPYRIGHT for full restrictions
5 : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 : //*
7 : //* Licensed under LGPL 2.1, please see LICENSE for details
8 : //* https://www.gnu.org/licenses/lgpl-2.1.html
9 :
10 : #include "PerfGraph.h"
11 :
12 : // MOOSE Includes
13 : #include "PerfGuard.h"
14 : #include "MooseError.h"
15 : #include "PerfGraphLivePrint.h"
16 : #include "PerfGraphRegistry.h"
17 : #include "MooseApp.h"
18 :
19 : // Note: do everything we can to make sure this only gets #included
20 : // in the .C file... this is a heavily templated header that we
21 : // don't want to expose to EVERY file in MOOSE...
22 : #include "VariadicTable.h"
23 :
24 : // System Includes
25 : #include <chrono>
26 : #include <memory>
27 :
28 67610 : PerfGraph::PerfGraph(const std::string & root_name,
29 : MooseApp & app,
30 : const bool live_all,
31 67610 : const bool perf_graph_live)
32 : : ConsoleStreamInterface(app),
33 67610 : _moose_app(app),
34 67610 : _live_print_all(live_all),
35 67610 : _disable_live_print(!perf_graph_live),
36 67610 : _perf_graph_registry(moose::internal::getPerfGraphRegistry()),
37 67610 : _pid(app.comm().rank()),
38 67610 : _root_name(root_name),
39 67610 : _root_node_id(_perf_graph_registry.registerSection(root_name, 0)),
40 67610 : _root_node(std::make_unique<PerfNode>(_root_node_id)),
41 67610 : _current_position(-1),
42 67610 : _stack(),
43 67610 : _execution_list_begin(0),
44 67610 : _execution_list_end(0),
45 67610 : _active(true),
46 67610 : _destructing(false),
47 67610 : _live_print_time_limit(5.0),
48 67610 : _live_print_mem_limit(100),
49 270440 : _live_print(std::make_unique<PerfGraphLivePrint>(*this, app))
50 : {
51 67610 : push(_root_node_id);
52 67610 : }
53 :
54 62074 : PerfGraph::~PerfGraph() { disableLivePrint(); }
55 :
56 : void
57 67609 : PerfGraph::enableLivePrint()
58 : {
59 67609 : if (_pid == 0 && !_disable_live_print)
60 : {
61 : // Start the printing thread
62 106488 : _print_thread = std::thread([this] { this->_live_print->start(); });
63 : }
64 67609 : }
65 :
66 : void
67 65440 : PerfGraph::disableLivePrint()
68 : {
69 65440 : if (_pid == 0 && !_disable_live_print)
70 : {
71 : {
72 : // Unlike using atomics for execution_thread_end
73 : // here we actually lock to ensure that either the print thread
74 : // immediately sees that we are destructing or is immediately
75 : // notified with the below notification. Without doing this
76 : // it would be possible (but unlikely) for the print thread to
77 : // hang for 1 second at the end of execution (which would not be
78 : // good anytime you are running lots of fast calculations back-to-back
79 : // like during testing or stochastic sampling).
80 47800 : std::lock_guard<std::mutex> lock(_destructing_mutex);
81 47800 : _destructing = true;
82 47800 : }
83 :
84 47800 : _finished_section.notify_one();
85 :
86 47800 : _print_thread.join();
87 :
88 47800 : _disable_live_print = true;
89 : }
90 65440 : }
91 :
92 : Real
93 621 : PerfGraph::sectionData(const DataType type,
94 : const std::string & section_name,
95 : const bool must_exist /* = true */)
96 : {
97 621 : update();
98 :
99 : const auto section_it =
100 621 : _cumulative_section_info.find(section_name == "Root" ? _root_name : section_name);
101 :
102 621 : if (section_it == _cumulative_section_info.end())
103 : {
104 20 : if (!must_exist || // isn't required to exist
105 4 : _perf_graph_registry.sectionExists(section_name) // or, is required to exist and it does
106 : )
107 12 : return 0;
108 :
109 4 : mooseError("Unknown PerfGraph section name \"",
110 : section_name,
111 : "\" in PerfGraph::sectionData().\nIf you are attempting to retrieve the root use "
112 : "\"Root\".");
113 : }
114 :
115 605 : const CumulativeSectionInfo & section_info = section_it->second;
116 :
117 605 : if (type == CALLS)
118 68 : return section_info._num_calls;
119 :
120 537 : const auto app_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
121 :
122 537 : switch (type)
123 : {
124 34 : case SELF:
125 34 : return section_info._self;
126 12 : case CHILDREN:
127 12 : return section_info._children;
128 295 : case TOTAL:
129 295 : return section_info._total;
130 12 : case SELF_AVG:
131 12 : return section_info._self / static_cast<Real>(section_info._num_calls);
132 12 : case CHILDREN_AVG:
133 12 : return section_info._children / static_cast<Real>(section_info._num_calls);
134 56 : case TOTAL_AVG:
135 56 : return section_info._total / static_cast<Real>(section_info._num_calls);
136 34 : case SELF_PERCENT:
137 34 : return 100. * (section_info._self / app_time);
138 12 : case CHILDREN_PERCENT:
139 12 : return 100. * (section_info._children / app_time);
140 34 : case TOTAL_PERCENT:
141 34 : return 100. * (section_info._total / app_time);
142 12 : case SELF_MEMORY:
143 12 : return section_info._self_memory;
144 12 : case CHILDREN_MEMORY:
145 12 : return section_info._children_memory;
146 12 : case TOTAL_MEMORY:
147 12 : return section_info._total_memory;
148 0 : default:
149 0 : ::mooseError("Unknown DataType");
150 : }
151 : }
152 :
153 : void
154 56306379 : PerfGraph::addToExecutionList(const PerfID id,
155 : const IncrementState state,
156 : const std::chrono::time_point<std::chrono::steady_clock> time,
157 : const long int memory)
158 : {
159 56306379 : auto & section_increment = _execution_list[_execution_list_end];
160 :
161 56306379 : section_increment._id = id;
162 56306379 : section_increment._state = state;
163 56306379 : section_increment._time = time;
164 56306379 : section_increment._memory = memory;
165 56306379 : section_increment._beginning_num_printed = _console.numPrinted();
166 :
167 : // A note about this next section of code:
168 : // It is only EVER run on the main thread - and therefore there can be
169 : // no race conditions. All that is important here is that the print
170 : // thread always sees a consistent value for _execution_list_end
171 56306379 : auto next_execution_list_end = _execution_list_end + 1;
172 :
173 : // Are we at the end of our circular buffer?
174 56306379 : if (next_execution_list_end >= MAX_EXECUTION_LIST_SIZE)
175 1829 : next_execution_list_end = 0;
176 :
177 : // This "release" will synchronize the above memory changes with the
178 : // "acquire" in the printing thread
179 : // All of the above memory operations will be seen by the
180 : // printing thread before the printing thread sees this new value
181 56306379 : _execution_list_end.store(next_execution_list_end, std::memory_order_release);
182 56306379 : }
183 :
184 : void
185 100919356 : PerfGraph::push(const PerfID id)
186 : {
187 100919356 : if (!_active)
188 4384 : return;
189 :
190 100914972 : PerfNode * new_node = nullptr;
191 :
192 100914972 : if (id == _root_node_id)
193 67610 : new_node = _root_node.get();
194 : else
195 100847362 : new_node = _stack[_current_position]->getChild(id);
196 :
197 : MemoryUtils::Stats stats;
198 100914972 : auto memory_success = MemoryUtils::getMemoryStats(stats);
199 :
200 100914972 : long int start_memory = 0;
201 :
202 100914972 : if (memory_success)
203 100914972 : start_memory =
204 100914972 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
205 0 : else if (_current_position !=
206 : -1) // If we weren't able to get the memory stats, let's just use the parent's
207 0 : start_memory = _stack[_current_position]->startMemory();
208 :
209 : // Set the start time
210 100914972 : auto current_time = std::chrono::steady_clock::now();
211 :
212 100914972 : new_node->setStartTimeAndMemory(current_time, start_memory);
213 :
214 : // Increment the number of calls
215 100914972 : new_node->incrementNumCalls();
216 :
217 100914972 : _current_position++;
218 :
219 100914972 : if (_current_position >= MOOSE_MAX_STACK_SIZE)
220 0 : mooseError("PerfGraph is out of stack space!");
221 :
222 100914972 : _stack[_current_position] = new_node;
223 :
224 : // Add this to the execution list unless the message is empty - but pre-emted by live_print_all
225 175541341 : if ((_pid == 0 && !_disable_live_print) &&
226 74626369 : (!_perf_graph_registry.readSectionInfo(id)._live_message.empty() || _live_print_all))
227 28158487 : addToExecutionList(id, IncrementState::STARTED, current_time, start_memory);
228 : }
229 :
230 : void
231 100833795 : PerfGraph::pop()
232 : {
233 100833795 : if (!_active)
234 4424 : return;
235 :
236 100829371 : auto current_time = std::chrono::steady_clock::now();
237 :
238 100829371 : auto & current_node = _stack[_current_position];
239 :
240 : MemoryUtils::Stats stats;
241 100829371 : auto memory_success = MemoryUtils::getMemoryStats(stats);
242 :
243 100829371 : long int current_memory = 0;
244 :
245 100829371 : if (memory_success)
246 100829371 : current_memory =
247 100829371 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
248 0 : else if (_current_position !=
249 : -1) // If we weren't able to get the memory stats, let's just use the start memory
250 0 : current_memory = _stack[_current_position]->startMemory();
251 :
252 100829371 : current_node->addTimeAndMemory(current_time, current_memory);
253 :
254 100829371 : _current_position--;
255 :
256 : // Add this to the exection list
257 175378045 : if ((_pid == 0 && !_disable_live_print) &&
258 74548674 : (!_perf_graph_registry.readSectionInfo(current_node->id())._live_message.empty() ||
259 46400782 : _live_print_all))
260 : {
261 28147892 : addToExecutionList(current_node->id(), IncrementState::FINISHED, current_time, current_memory);
262 :
263 : // Tell the printing thread that a section has finished
264 : //
265 : // Note: no mutex is needed here because we're using an atomic
266 : // in the predicate of the condition_variable in the thread
267 : // This is technically correct - but there is a chance of missing a signal
268 : // For us - that chance is low and doesn't matter (the timeout will just be hit
269 : // instead). So - I would rather not have an extra lock here in the main thread.
270 28147892 : _finished_section.notify_one();
271 : }
272 : }
273 :
274 : void
275 111911 : PerfGraph::update()
276 : {
277 : // First update all of the currently running nodes
278 111911 : auto now = std::chrono::steady_clock::now();
279 :
280 : MemoryUtils::Stats stats;
281 111911 : MemoryUtils::getMemoryStats(stats);
282 : auto now_memory =
283 111911 : MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
284 :
285 630342 : for (int i = 0; i <= _current_position; i++)
286 : {
287 518431 : auto node = _stack[i];
288 518431 : node->addTimeAndMemory(now, now_memory);
289 518431 : node->setStartTimeAndMemory(now, now_memory);
290 : }
291 :
292 : // Zero out the entries
293 7141501 : for (auto & section_time_it : _cumulative_section_info)
294 : {
295 7029590 : auto & section_time = section_time_it.second;
296 :
297 7029590 : section_time._num_calls = 0;
298 7029590 : section_time._self = 0.;
299 7029590 : section_time._children = 0.;
300 7029590 : section_time._total = 0.;
301 7029590 : section_time._self_memory = 0;
302 7029590 : section_time._children_memory = 0;
303 7029590 : section_time._total_memory = 0.;
304 : }
305 :
306 111911 : recursivelyUpdate(*_root_node);
307 :
308 : // Update vector pointing to section times
309 : // Note: we are doing this _after_ recursively filling
310 : // because new entries may have been created
311 111911 : _cumulative_section_info_ptrs.resize(_perf_graph_registry.numSections());
312 :
313 14515738 : for (auto & section_time_it : _cumulative_section_info)
314 : {
315 14403827 : auto id = _perf_graph_registry.sectionID(section_time_it.first);
316 :
317 14403827 : _cumulative_section_info_ptrs[id] = §ion_time_it.second;
318 : }
319 111911 : }
320 :
321 : void
322 17593030 : PerfGraph::recursivelyUpdate(const PerfNode & current_node)
323 : {
324 17593030 : const auto & section_name = _perf_graph_registry.readSectionInfo(current_node.id())._name;
325 :
326 : // RHS insertion on purpose
327 17593030 : auto & section_time = _cumulative_section_info[section_name];
328 :
329 17593030 : section_time._self += current_node.selfTimeSec();
330 17593030 : section_time._children += current_node.childrenTimeSec();
331 17593030 : section_time._total += current_node.totalTimeSec();
332 17593030 : section_time._num_calls += current_node.numCalls();
333 :
334 17593030 : section_time._self_memory += current_node.selfMemory();
335 17593030 : section_time._children_memory += current_node.childrenMemory();
336 17593030 : section_time._total_memory += current_node.totalMemory();
337 :
338 35074149 : for (auto & child_it : current_node.children())
339 17481119 : recursivelyUpdate(*child_it.second);
340 17593030 : }
341 :
342 : PerfGraph::FullTable
343 52274 : PerfGraph::treeTable(const unsigned int level, const bool heaviest /* = false */)
344 : {
345 52274 : update();
346 :
347 : FullTable vtable({"Section",
348 : "Calls",
349 : "Self(s)",
350 : "Avg(s)",
351 : "%",
352 : "Mem(MB)",
353 : "Total(s)",
354 : "Avg(s)",
355 : "%",
356 : "Mem(MB)"},
357 575014 : 10);
358 :
359 52274 : vtable.setColumnFormat({
360 : VariadicTableColumnFormat::AUTO, // Section Name
361 : VariadicTableColumnFormat::AUTO, // Calls
362 : VariadicTableColumnFormat::FIXED, // Self
363 : VariadicTableColumnFormat::FIXED, // Avg.
364 : VariadicTableColumnFormat::PERCENT, // %
365 : VariadicTableColumnFormat::AUTO, // Memory
366 : VariadicTableColumnFormat::FIXED, // Total
367 : VariadicTableColumnFormat::FIXED, // Avg.
368 : VariadicTableColumnFormat::PERCENT, // %
369 : VariadicTableColumnFormat::AUTO, // Memory
370 : });
371 :
372 52274 : vtable.setColumnPrecision({
373 : 1, // Section Name
374 : 0, // Calls
375 : 3, // Self
376 : 3, // Avg.
377 : 2, // %
378 : 0, // Memory
379 : 3, // Total
380 : 3, // Avg.
381 : 2, // %
382 : 0, // Memory
383 : });
384 :
385 883151 : auto act = [this, &vtable](const PerfNode & node,
386 : const moose::internal::PerfGraphSectionInfo & section_info,
387 2649453 : const unsigned int depth)
388 : {
389 2649453 : vtable.addRow(std::string(depth * 2, ' ') + section_info._name, // Section Name
390 : node.numCalls(), // Calls
391 : node.selfTimeSec(), // Self
392 : node.selfTimeAvg(), // Avg.
393 883151 : 100. * node.selfTimeSec() / _root_node->totalTimeSec(), // %
394 : node.selfMemory(), // Memory
395 : node.totalTimeSec(), // Total
396 : node.totalTimeAvg(), // Avg.
397 883151 : 100. * node.totalTimeSec() / _root_node->totalTimeSec(), // %
398 : node.totalMemory()); // Memory
399 883151 : };
400 52274 : treeRecurse(act, level, heaviest);
401 :
402 104548 : return vtable;
403 104548 : }
404 :
405 : void
406 52252 : PerfGraph::print(const ConsoleStream & console, unsigned int level)
407 : {
408 52252 : console << "\nPerformance Graph:\n";
409 52252 : treeTable(level).print(console);
410 52252 : }
411 :
412 : void
413 22 : PerfGraph::printHeaviestBranch(const ConsoleStream & console)
414 : {
415 22 : console << "\nHeaviest Branch:\n";
416 22 : treeTable(MOOSE_MAX_STACK_SIZE, /* heaviest = */ true).print(console);
417 22 : }
418 :
419 : void
420 22 : PerfGraph::printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections)
421 : {
422 22 : update();
423 :
424 22 : console << "\nHeaviest Sections:\n";
425 :
426 : // Indirect Sort The Self Time
427 22 : std::vector<size_t> sorted;
428 22 : Moose::indirectSort(_cumulative_section_info_ptrs.begin(),
429 : _cumulative_section_info_ptrs.end(),
430 : sorted,
431 25234 : [](CumulativeSectionInfo * lhs, CumulativeSectionInfo * rhs)
432 : {
433 25234 : if (lhs && rhs)
434 25022 : return lhs->_self > rhs->_self;
435 :
436 : // If the LHS exists - it's definitely bigger than a non-existant RHS
437 212 : if (lhs)
438 131 : return true;
439 :
440 : // Both don't exist - so it doesn't matter how we sort them
441 81 : return false;
442 : });
443 :
444 154 : HeaviestTable vtable({"Section", "Calls", "Self(s)", "Avg.", "%", "Mem(MB)"}, 10);
445 :
446 22 : vtable.setColumnFormat({VariadicTableColumnFormat::AUTO, // Section; doesn't matter
447 : VariadicTableColumnFormat::AUTO, // Calls
448 : VariadicTableColumnFormat::FIXED, // Time
449 : VariadicTableColumnFormat::FIXED, // Avg.
450 : VariadicTableColumnFormat::PERCENT, // Percent
451 : VariadicTableColumnFormat::AUTO} // Memory
452 : );
453 :
454 22 : vtable.setColumnPrecision({
455 : 1, // Section
456 : 1, // Calls
457 : 3, // Time
458 : 3, // Avg.
459 : 2, // Percent
460 : 1 // Memory
461 : });
462 :
463 : mooseAssert(!_cumulative_section_info_ptrs.empty(),
464 : "update() must be run before printHeaviestSections()!");
465 :
466 : // The total time of the root node
467 22 : auto total_root_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
468 :
469 : // Now print out the largest ones
470 182 : for (unsigned int i = 0; i < num_sections; i++)
471 : {
472 160 : auto id = sorted[i];
473 :
474 160 : if (!_cumulative_section_info_ptrs[id])
475 0 : continue;
476 :
477 160 : const auto & entry = *_cumulative_section_info_ptrs[id];
478 160 : vtable.addRow(_perf_graph_registry.sectionInfo(id)._name, // Section
479 160 : entry._num_calls, // Calls
480 160 : entry._self, // Time
481 160 : entry._self / static_cast<Real>(entry._num_calls), // Avg.
482 160 : 100. * entry._self / total_root_time, // Percent
483 160 : entry._self_memory); // Memory
484 : }
485 :
486 22 : vtable.print(console);
487 66 : }
488 :
489 : void
490 45214 : dataStore(std::ostream & stream, PerfGraph & perf_graph, void *)
491 : {
492 : // We need to store the registry id -> section info map so that we can add
493 : // registered sections that may not be added yet during recover
494 45214 : dataStore(stream, perf_graph._perf_graph_registry._id_to_item, nullptr);
495 :
496 : // Update before serializing the nodes so that the time/memory/calls are correct
497 45214 : perf_graph.update();
498 :
499 : // Recursively serialize all of the nodes
500 45214 : dataStore(stream, perf_graph._root_node, nullptr);
501 45214 : }
502 :
503 : void
504 13740 : dataLoad(std::istream & stream, PerfGraph & perf_graph, void *)
505 : {
506 : // Load in all of the recovered sections and register those that do not exist yet
507 13740 : std::vector<moose::internal::PerfGraphSectionInfo> recovered_section_info;
508 13740 : dataLoad(stream, recovered_section_info, nullptr);
509 2220748 : for (const auto & info : recovered_section_info)
510 : {
511 2207008 : if (info._live_message.size())
512 985744 : perf_graph._perf_graph_registry.registerSection(
513 985744 : info._name, info._level, info._live_message, info._print_dots);
514 : else
515 1221264 : perf_graph._perf_graph_registry.registerSection(info._name, info._level);
516 : }
517 :
518 : // Update the current node time/memory/calls before loading the nodes as the load
519 : // will append information to current nodes that exist
520 13740 : perf_graph.update();
521 :
522 : // Recursively load all of the nodes; this will append information to matching nodes
523 : // and will create new nodes for section paths that do not exist
524 13740 : dataLoad(stream, perf_graph._root_node, &perf_graph);
525 13740 : }
|