Line data Source code
1 : //* This file is part of the MOOSE framework
2 : //* https://mooseframework.inl.gov
3 : //*
4 : //* All rights reserved, see COPYRIGHT for full restrictions
5 : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 : //*
7 : //* Licensed under LGPL 2.1, please see LICENSE for details
8 : //* https://www.gnu.org/licenses/lgpl-2.1.html
9 :
10 : #pragma once
11 :
12 : // MOOSE Includes
13 : #include "MooseTypes.h"
14 : #include "PerfNode.h"
15 : #include "IndirectSort.h"
16 : #include "ConsoleStream.h"
17 : #include "ConsoleStreamInterface.h"
18 : #include "MooseError.h"
19 : #include "MemoryUtils.h"
20 : #include "PerfGraphRegistry.h"
21 :
22 : // System Includes
23 : #include <array>
24 : #include <atomic>
25 : #include <thread>
26 : #include <future>
27 : #include <mutex>
28 :
29 : // Forward Declarations
30 : class PerfGuard;
31 : class PerfGraphLivePrint;
32 :
33 : template <class... Ts>
34 : class VariadicTable;
35 :
36 : #define MOOSE_MAX_STACK_SIZE 100
37 : #define MAX_EXECUTION_LIST_SIZE 10000
38 :
39 : /**
40 : * The PerfGraph will hold the master list of all registered performance segments and
41 : * the head PerfNode
42 : */
43 : class PerfGraph : protected ConsoleStreamInterface
44 : {
45 : public:
46 : using PerfGraphRegistry = moose::internal::PerfGraphRegistry;
47 :
48 : /**
49 : * For retrieving values
50 : */
51 : enum DataType
52 : {
53 : SELF,
54 : CHILDREN,
55 : TOTAL,
56 : SELF_AVG,
57 : CHILDREN_AVG,
58 : TOTAL_AVG,
59 : SELF_PERCENT,
60 : CHILDREN_PERCENT,
61 : TOTAL_PERCENT,
62 : SELF_MEMORY,
63 : CHILDREN_MEMORY,
64 : TOTAL_MEMORY,
65 : CALLS
66 : };
67 :
68 : /**
69 : * DataType in a MooseEnum for use in InputParameters in objects that query
70 : * the PerfGraph with sectionData.
71 : */
72 3907 : static MooseEnum dataTypeEnum()
73 : {
74 : return MooseEnum(
75 : "SELF CHILDREN TOTAL SELF_AVG CHILDREN_AVG TOTAL_AVG SELF_PERCENT CHILDREN_PERCENT "
76 15628 : "TOTAL_PERCENT SELF_MEMORY CHILDREN_MEMORY TOTAL_MEMORY CALLS");
77 : }
78 :
79 : /**
80 : * Create a new PerfGraph
81 : *
82 : * @param root_name The name of the root node
83 : * @param app The MooseApp this PerfGraph is for
84 : * @param live_all Whether every message should be printed
85 : * @param perf_graph_live Enable/disable PerfGraphLive (permanently)
86 : */
87 : PerfGraph(const std::string & root_name,
88 : MooseApp & app,
89 : const bool live_all,
90 : const bool perf_graph_live);
91 :
92 : /**
93 : * Destructor
94 : */
95 : ~PerfGraph();
96 :
97 : /**
98 : * Print the tree out
99 : *
100 : * @param console The output stream to output to
101 : * @param level The log level, the higher the number the more output you get
102 : */
103 : void print(const ConsoleStream & console, unsigned int level);
104 :
105 : /**
106 : * Print out the heaviest branch through the tree
107 : *
108 : * @param console The output stream to output to
109 : */
110 : void printHeaviestBranch(const ConsoleStream & console);
111 :
112 : /**
113 : * Print out the heaviest sections that were timed
114 : *
115 : * @param console The output stream to output to
116 : */
117 : void printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections);
118 :
119 : /**
120 : * Whether or not timing is active
121 : *
122 : * When not active no timing information will be kept
123 : */
124 : bool active() const { return _active; }
125 :
126 : /**
127 : * Turn on or off timing
128 : */
129 9 : void setActive(bool active) { _active = active; }
130 :
131 : /**
132 : * Enables Live Print
133 : */
134 : void enableLivePrint();
135 :
136 : /**
137 : * Completely disables Live Print (cannot be restarted)
138 : */
139 : void disableLivePrint();
140 :
141 : /**
142 : * Forces all sections to be output live
143 : */
144 : void setLivePrintAll(bool active) { _live_print_all = active; }
145 :
146 : /**
147 : * Set the time limit before a message prints
148 : */
149 65924 : void setLiveTimeLimit(Real time_limit)
150 : {
151 65924 : _live_print_time_limit.store(time_limit, std::memory_order_relaxed);
152 65924 : }
153 :
154 : /**
155 : * Sert the memory limit before a message prints
156 : */
157 65924 : void setLiveMemoryLimit(unsigned int mem_limit)
158 : {
159 65924 : _live_print_mem_limit.store(mem_limit, std::memory_order_relaxed);
160 65924 : }
161 :
162 : /**
163 : * Gets a PerfGraph result pertaining to a section
164 : * @param type The result type to retrieve
165 : * @param section_name The name of the section
166 : * @param must_exist Whether not the section must exist; if false and the
167 : * section does not exist, returns 0, if true and the section does not exist,
168 : * exit with an error
169 : */
170 : Real
171 : sectionData(const DataType type, const std::string & section_name, const bool must_exist = true);
172 :
173 : /**
174 : * Updates the time section_time and time for all currently running nodes
175 : */
176 : void update();
177 :
178 : /**
179 : * Get the maximum memory allocation in MB.
180 : *
181 : * This is thread safe.
182 : */
183 171731170 : std::size_t getMaxMemory() const { return _max_memory; }
184 :
185 : /**
186 : * @returns The MooseApp
187 : */
188 3758759 : MooseApp & mooseApp() { return _moose_app; }
189 :
190 : /**
191 : * @returns A constant reference to the root node
192 : */
193 2068749 : const PerfNode & rootNode() const { return *_root_node; }
194 :
195 : template <typename Functor>
196 : void treeRecurse(const Functor & act,
197 : const unsigned int level = MOOSE_MAX_STACK_SIZE,
198 : const bool heaviest = false) const;
199 :
200 : protected:
201 : typedef VariadicTable<std::string,
202 : unsigned long int,
203 : Real,
204 : Real,
205 : Real,
206 : long int,
207 : Real,
208 : Real,
209 : Real,
210 : long int>
211 : FullTable;
212 :
213 : typedef VariadicTable<std::string, unsigned long int, Real, Real, Real, long int> HeaviestTable;
214 :
215 : /**
216 : * Use to hold the cumulative time and memory for each section, which comes
217 : * from all of the PerfNodes that contribute to said section
218 : *
219 : * These will be filled by update()
220 : */
221 : struct CumulativeSectionInfo
222 : {
223 : /// Amount of time used within this section (without children)
224 : Real _self = 0.;
225 :
226 : /// Amount of time used by children
227 : Real _children = 0.;
228 :
229 : /// Total amount of time used
230 : Real _total = 0.;
231 :
232 : /// Number of times this section has been called
233 : unsigned long int _num_calls = 0;
234 :
235 : /// Amount of memory gained within this section (without children)
236 : long int _self_memory = 0;
237 :
238 : /// Amount of memory gained by children
239 : long int _children_memory = 0;
240 :
241 : /// Total memory gain for this section
242 : long int _total_memory = 0;
243 : };
244 :
245 : /**
246 : * The execution state of an increment.
247 : */
248 : enum IncrementState
249 : {
250 : /// Section just started running
251 : STARTED,
252 :
253 : /// This section has already started printing
254 : PRINTED,
255 :
256 : /// The section is complete
257 : FINISHED
258 : };
259 :
260 : /**
261 : * Use to hold an increment of time and memory for a section
262 : * This is used in the LivePrint capability.
263 : */
264 : class SectionIncrement
265 : {
266 : public:
267 676649500 : SectionIncrement()
268 676649500 : : _state(IncrementState::FINISHED),
269 676649500 : _print_stack_level(0),
270 676649500 : _num_dots(0),
271 676649500 : _time(std::chrono::seconds(0)),
272 676649500 : _memory(0),
273 676649500 : _beginning_num_printed(0)
274 : {
275 676649500 : }
276 :
277 : PerfID _id;
278 :
279 : /// Whether or not this increment is the start of an increment or
280 : /// the finishing of an increment.
281 : IncrementState _state;
282 :
283 : /// How much to indent this section
284 : unsigned int _print_stack_level;
285 :
286 : /// How many dots have been printed for this section
287 : unsigned int _num_dots;
288 :
289 : /// Either the starting time or final time depending on _state
290 : std::chrono::time_point<std::chrono::steady_clock> _time;
291 :
292 : /// Either the starting memory or final memory depending on _state
293 : long int _memory;
294 :
295 : /// The _console numPrinted() at the time this section was created
296 : unsigned long long int _beginning_num_printed;
297 : };
298 :
299 : /**
300 : * Add the information to the execution list
301 : *
302 : * Should only be called by push() and pop()
303 : */
304 : inline void addToExecutionList(const PerfID id,
305 : const IncrementState state,
306 : const std::chrono::time_point<std::chrono::steady_clock> time,
307 : const long int memory);
308 :
309 : /**
310 : * Add a Node onto the end of the end of the current callstack
311 : *
312 : * Note: only accessible by using PerfGuard!
313 : */
314 : void push(const PerfID id);
315 :
316 : /**
317 : * Remove a Node from the end of the current scope
318 : *
319 : * Note: only accessible by using PerfGuard!
320 : */
321 : void pop();
322 :
323 : /**
324 : * Updates the cumulative self/children/total time and memory for each section
325 : * across all nodes that contribute to said section in _cumulative_section_info
326 : *
327 : * Note: requires that the contents in each CumulativeSectionInfo in
328 : * _cumulative_section_info be initially resized and zeroed
329 : *
330 : * @param current_node The current node to work on
331 : */
332 : void recursivelyUpdate(const PerfNode & current_node);
333 :
334 : /// The MooseApp
335 : MooseApp & _moose_app;
336 :
337 : /// Whether or not to put everything in the perf graph
338 : bool _live_print_all;
339 :
340 : /// Whether or not live print is disabled (cannot be turned on again)
341 : bool _disable_live_print;
342 :
343 : /// The PerfGraphRegistry
344 : PerfGraphRegistry & _perf_graph_registry;
345 :
346 : /// This processor id
347 : const libMesh::processor_id_type _pid;
348 :
349 : /// Name of the root node
350 : const std::string _root_name;
351 :
352 : /// The id for the root node
353 : const PerfID _root_node_id;
354 :
355 : /// The root node of the graph
356 : const std::unique_ptr<PerfNode> _root_node;
357 :
358 : /// The current node position in the stack
359 : int _current_position;
360 :
361 : /// The full callstack. Currently capped at a depth of 100
362 : std::array<PerfNode *, MOOSE_MAX_STACK_SIZE> _stack;
363 :
364 : /// A circular buffer for holding the execution list, this is read by the printing loop
365 : std::array<SectionIncrement, MAX_EXECUTION_LIST_SIZE> _execution_list;
366 :
367 : /// Where the print thread should start reading the execution list
368 : std::atomic<unsigned int> _execution_list_begin;
369 :
370 : /// Where the print thread should stop reading the execution list
371 : std::atomic<unsigned int> _execution_list_end;
372 :
373 : /// The cumulative time and memory for each section. This is updated on update()
374 : /// Note that this is _total_ cumulative time/memory across every place
375 : /// that section is in the graph
376 : ///
377 : /// I'm making this a map so that we can give out references to the values
378 : /// The three values are: self, children
379 : /// The map is on std::string because we might need to be able to retrieve
380 : /// timing values in a "late binding" situation _before_ the section
381 : /// has been registered.
382 : std::unordered_map<std::string, CumulativeSectionInfo> _cumulative_section_info;
383 :
384 : /// Pointers into _cumulative_section_info indexed on PerfID
385 : /// This is here for convenience and speed so we don't need
386 : /// to iterate over the above map much - and it makes it
387 : /// easier to sort
388 : std::vector<CumulativeSectionInfo *> _cumulative_section_info_ptrs;
389 :
390 : /// Maximum memory encountered during push and pop
391 : std::atomic<std::size_t> _max_memory;
392 :
393 : /// Whether or not timing is active
394 : bool _active;
395 :
396 : /// The promise to the print thread that will signal when to stop
397 : std::promise<bool> _done;
398 :
399 : /// Tell the print thread to teardown
400 : bool _destructing;
401 :
402 : /// The mutex to use with a condition_variable predicate to guard _destructing
403 : std::mutex _destructing_mutex;
404 :
405 : /// The condition_variable to wake the print thread
406 : std::condition_variable _finished_section;
407 :
408 : /// The time limit before a message is printed (in seconds)
409 : std::atomic<Real> _live_print_time_limit;
410 :
411 : /// The memory limit before a message is printed (in MB)
412 : std::atomic<unsigned int> _live_print_mem_limit;
413 :
414 : /// The object that is doing live printing
415 : const std::unique_ptr<PerfGraphLivePrint> _live_print;
416 :
417 : /// The thread for printing sections as they execute
418 : std::thread _print_thread;
419 :
420 : // Here so PerfGuard is the only thing that can call push/pop
421 : friend class PerfGuard;
422 : friend class PerfGraphLivePrint;
423 : friend void dataStore(std::ostream &, PerfGraph &, void *);
424 : friend void dataLoad(std::istream &, PerfGraph &, void *);
425 :
426 : private:
427 : /**
428 : * Helper for building a VariadicTable that represents the tree.
429 : *
430 : * @param level The level to print out below (<=)
431 : * @param heaviest Show only the heaviest branch
432 : */
433 : FullTable treeTable(const unsigned int level, const bool heaviest = false);
434 :
435 : template <typename Functor>
436 : void treeRecurseInternal(const PerfNode & node,
437 : const Functor & act,
438 : const unsigned int level,
439 : const bool heaviest,
440 : unsigned int current_depth) const;
441 :
442 : /**
443 : * Update _max_memory if current_memory > _max_memory.
444 : */
445 : void updateMaxMemory(const std::size_t current_memory);
446 : };
447 :
448 : template <typename Functor>
449 : void
450 8835609 : PerfGraph::treeRecurseInternal(const PerfNode & node,
451 : const Functor & act,
452 : const unsigned int level,
453 : const bool heaviest,
454 : unsigned int current_depth) const
455 : {
456 : mooseAssert(_perf_graph_registry.sectionExists(node.id()), "Unable to find section name!");
457 :
458 8835609 : const auto & current_section_info = _perf_graph_registry.readSectionInfo(node.id());
459 8835609 : if (current_section_info._level <= level)
460 : {
461 : mooseAssert(!_cumulative_section_info_ptrs.empty(), "update() must be run before treeRecurse!");
462 666378 : act(node, current_section_info, current_depth++);
463 : }
464 :
465 8835609 : if (heaviest)
466 : {
467 174 : const PerfNode * heaviest_child = nullptr;
468 1063 : for (const auto & child_it : node.children())
469 : {
470 889 : const auto & current_child = *child_it.second;
471 :
472 889 : if (!heaviest_child || (current_child.totalTime() > heaviest_child->totalTime()))
473 297 : heaviest_child = ¤t_child;
474 : }
475 :
476 174 : if (heaviest_child)
477 154 : treeRecurseInternal(*heaviest_child, act, level, true, current_depth);
478 : }
479 : else
480 : {
481 17617544 : for (const auto & child_it : node.children())
482 8782109 : treeRecurseInternal(*child_it.second, act, level, false, current_depth);
483 : }
484 8835609 : }
485 :
486 : template <typename Functor>
487 : void
488 53346 : PerfGraph::treeRecurse(const Functor & act,
489 : const unsigned int level /* = MOOSE_MAX_STACK_SIZE */,
490 : const bool heaviest /* = false */) const
491 : {
492 : mooseAssert(_root_node, "Root node does not exist; calling this too early");
493 53346 : treeRecurseInternal(*_root_node, act, level, heaviest, 0);
494 53346 : }
495 :
// Free-function declarations for dataStore/dataLoad taking a PerfGraph. Both
// are declared as friends of PerfGraph, so they may access its protected and
// private members.
// NOTE(review): presumably these write/read the PerfGraph state to/from the
// given stream (e.g. for checkpoint/restart) - confirm against the definitions.
496 : void dataStore(std::ostream & stream, PerfGraph & perf_graph, void * context);
497 : void dataLoad(std::istream & stream, PerfGraph & perf_graph, void * context);
|