LCOV - code coverage report
Current view: top level - src/utils - PerfGraph.C (source / functions) Hit Total Coverage
Test: idaholab/moose framework: 419b9d Lines: 223 231 96.5 %
Date: 2025-08-08 20:01:16 Functions: 19 19 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //* This file is part of the MOOSE framework
       2             : //* https://mooseframework.inl.gov
       3             : //*
       4             : //* All rights reserved, see COPYRIGHT for full restrictions
       5             : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
       6             : //*
       7             : //* Licensed under LGPL 2.1, please see LICENSE for details
       8             : //* https://www.gnu.org/licenses/lgpl-2.1.html
       9             : 
      10             : #include "PerfGraph.h"
      11             : 
      12             : // MOOSE Includes
      13             : #include "PerfGuard.h"
      14             : #include "MooseError.h"
      15             : #include "PerfGraphLivePrint.h"
      16             : #include "PerfGraphRegistry.h"
      17             : #include "MooseApp.h"
      18             : 
      19             : // Note: do everything we can to make sure this only gets #included
      20             : // in the .C file... this is a heavily templated header that we
      21             : // don't want to expose to EVERY file in MOOSE...
      22             : #include "VariadicTable.h"
      23             : 
      24             : // System Includes
      25             : #include <chrono>
      26             : #include <memory>
      27             : 
      28       67610 : PerfGraph::PerfGraph(const std::string & root_name,
      29             :                      MooseApp & app,
      30             :                      const bool live_all,
      31       67610 :                      const bool perf_graph_live)
      32             :   : ConsoleStreamInterface(app),
      33       67610 :     _moose_app(app),
      34       67610 :     _live_print_all(live_all),
      35       67610 :     _disable_live_print(!perf_graph_live),
      36       67610 :     _perf_graph_registry(moose::internal::getPerfGraphRegistry()),
      37       67610 :     _pid(app.comm().rank()),
      38       67610 :     _root_name(root_name),
      39       67610 :     _root_node_id(_perf_graph_registry.registerSection(root_name, 0)),
      40       67610 :     _root_node(std::make_unique<PerfNode>(_root_node_id)),
      41       67610 :     _current_position(-1),
      42       67610 :     _stack(),
      43       67610 :     _execution_list_begin(0),
      44       67610 :     _execution_list_end(0),
      45       67610 :     _active(true),
      46       67610 :     _destructing(false),
      47       67610 :     _live_print_time_limit(5.0),
      48       67610 :     _live_print_mem_limit(100),
      49      270440 :     _live_print(std::make_unique<PerfGraphLivePrint>(*this, app))
      50             : {
      51       67610 :   push(_root_node_id);
      52       67610 : }
      53             : 
      54       62074 : PerfGraph::~PerfGraph() { disableLivePrint(); }
      55             : 
      56             : void
      57       67609 : PerfGraph::enableLivePrint()
      58             : {
      59       67609 :   if (_pid == 0 && !_disable_live_print)
      60             :   {
      61             :     // Start the printing thread
      62      106488 :     _print_thread = std::thread([this] { this->_live_print->start(); });
      63             :   }
      64       67609 : }
      65             : 
      66             : void
      67       65440 : PerfGraph::disableLivePrint()
      68             : {
      69       65440 :   if (_pid == 0 && !_disable_live_print)
      70             :   {
      71             :     {
      72             :       // Unlike using atomics for execution_thread_end
      73             :       // here we actually lock to ensure that either the print thread
      74             :       // immediately sees that we are destructing or is immediately
      75             :       // notified with the below notification.  Without doing this
      76             :       // it would be possible (but unlikely) for the print thread to
      77             :       // hang for 1 second at the end of execution (which would not be
      78             :       // good anytime you are running lots of fast calculations back-to-back
      79             :       // like during testing or stochastic sampling).
      80       47800 :       std::lock_guard<std::mutex> lock(_destructing_mutex);
      81       47800 :       _destructing = true;
      82       47800 :     }
      83             : 
      84       47800 :     _finished_section.notify_one();
      85             : 
      86       47800 :     _print_thread.join();
      87             : 
      88       47800 :     _disable_live_print = true;
      89             :   }
      90       65440 : }
      91             : 
      92             : Real
      93         621 : PerfGraph::sectionData(const DataType type,
      94             :                        const std::string & section_name,
      95             :                        const bool must_exist /* = true */)
      96             : {
      97         621 :   update();
      98             : 
      99             :   const auto section_it =
     100         621 :       _cumulative_section_info.find(section_name == "Root" ? _root_name : section_name);
     101             : 
     102         621 :   if (section_it == _cumulative_section_info.end())
     103             :   {
     104          20 :     if (!must_exist ||                                   // isn't required to exist
     105           4 :         _perf_graph_registry.sectionExists(section_name) // or, is required to exist and it does
     106             :     )
     107          12 :       return 0;
     108             : 
     109           4 :     mooseError("Unknown PerfGraph section name \"",
     110             :                section_name,
     111             :                "\" in PerfGraph::sectionData().\nIf you are attempting to retrieve the root use "
     112             :                "\"Root\".");
     113             :   }
     114             : 
     115         605 :   const CumulativeSectionInfo & section_info = section_it->second;
     116             : 
     117         605 :   if (type == CALLS)
     118          68 :     return section_info._num_calls;
     119             : 
     120         537 :   const auto app_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
     121             : 
     122         537 :   switch (type)
     123             :   {
     124          34 :     case SELF:
     125          34 :       return section_info._self;
     126          12 :     case CHILDREN:
     127          12 :       return section_info._children;
     128         295 :     case TOTAL:
     129         295 :       return section_info._total;
     130          12 :     case SELF_AVG:
     131          12 :       return section_info._self / static_cast<Real>(section_info._num_calls);
     132          12 :     case CHILDREN_AVG:
     133          12 :       return section_info._children / static_cast<Real>(section_info._num_calls);
     134          56 :     case TOTAL_AVG:
     135          56 :       return section_info._total / static_cast<Real>(section_info._num_calls);
     136          34 :     case SELF_PERCENT:
     137          34 :       return 100. * (section_info._self / app_time);
     138          12 :     case CHILDREN_PERCENT:
     139          12 :       return 100. * (section_info._children / app_time);
     140          34 :     case TOTAL_PERCENT:
     141          34 :       return 100. * (section_info._total / app_time);
     142          12 :     case SELF_MEMORY:
     143          12 :       return section_info._self_memory;
     144          12 :     case CHILDREN_MEMORY:
     145          12 :       return section_info._children_memory;
     146          12 :     case TOTAL_MEMORY:
     147          12 :       return section_info._total_memory;
     148           0 :     default:
     149           0 :       ::mooseError("Unknown DataType");
     150             :   }
     151             : }
     152             : 
     153             : void
     154    56306379 : PerfGraph::addToExecutionList(const PerfID id,
     155             :                               const IncrementState state,
     156             :                               const std::chrono::time_point<std::chrono::steady_clock> time,
     157             :                               const long int memory)
     158             : {
     159    56306379 :   auto & section_increment = _execution_list[_execution_list_end];
     160             : 
     161    56306379 :   section_increment._id = id;
     162    56306379 :   section_increment._state = state;
     163    56306379 :   section_increment._time = time;
     164    56306379 :   section_increment._memory = memory;
     165    56306379 :   section_increment._beginning_num_printed = _console.numPrinted();
     166             : 
     167             :   // A note about this next section of code:
     168             :   // It is only EVER run on the main thread - and therefore there can be
     169             :   // no race conditions.  All that is important here is that the print
     170             :   // thread always sees a consistent value for _execution_list_end
     171    56306379 :   auto next_execution_list_end = _execution_list_end + 1;
     172             : 
     173             :   // Are we at the end of our circular buffer?
     174    56306379 :   if (next_execution_list_end >= MAX_EXECUTION_LIST_SIZE)
     175        1829 :     next_execution_list_end = 0;
     176             : 
     177             :   // This "release" will synchronize the above memory changes with the
     178             :   // "acquire" in the printing thread
     179             :   // All of the above memory operations will be seen by the
     180             :   // printing thread before the printing thread sees this new value
     181    56306379 :   _execution_list_end.store(next_execution_list_end, std::memory_order_release);
     182    56306379 : }
     183             : 
     184             : void
     185   100919356 : PerfGraph::push(const PerfID id)
     186             : {
     187   100919356 :   if (!_active)
     188        4384 :     return;
     189             : 
     190   100914972 :   PerfNode * new_node = nullptr;
     191             : 
     192   100914972 :   if (id == _root_node_id)
     193       67610 :     new_node = _root_node.get();
     194             :   else
     195   100847362 :     new_node = _stack[_current_position]->getChild(id);
     196             : 
     197             :   MemoryUtils::Stats stats;
     198   100914972 :   auto memory_success = MemoryUtils::getMemoryStats(stats);
     199             : 
     200   100914972 :   long int start_memory = 0;
     201             : 
     202   100914972 :   if (memory_success)
     203   100914972 :     start_memory =
     204   100914972 :         MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     205           0 :   else if (_current_position !=
     206             :            -1) // If we weren't able to get the memory stats, let's just use the parent's
     207           0 :     start_memory = _stack[_current_position]->startMemory();
     208             : 
     209             :   // Set the start time
     210   100914972 :   auto current_time = std::chrono::steady_clock::now();
     211             : 
     212   100914972 :   new_node->setStartTimeAndMemory(current_time, start_memory);
     213             : 
     214             :   // Increment the number of calls
     215   100914972 :   new_node->incrementNumCalls();
     216             : 
     217   100914972 :   _current_position++;
     218             : 
     219   100914972 :   if (_current_position >= MOOSE_MAX_STACK_SIZE)
     220           0 :     mooseError("PerfGraph is out of stack space!");
     221             : 
     222   100914972 :   _stack[_current_position] = new_node;
     223             : 
     224             :   // Add this to the execution list unless the message is empty - but pre-emted by live_print_all
     225   175541341 :   if ((_pid == 0 && !_disable_live_print) &&
     226    74626369 :       (!_perf_graph_registry.readSectionInfo(id)._live_message.empty() || _live_print_all))
     227    28158487 :     addToExecutionList(id, IncrementState::STARTED, current_time, start_memory);
     228             : }
     229             : 
     230             : void
     231   100833795 : PerfGraph::pop()
     232             : {
     233   100833795 :   if (!_active)
     234        4424 :     return;
     235             : 
     236   100829371 :   auto current_time = std::chrono::steady_clock::now();
     237             : 
     238   100829371 :   auto & current_node = _stack[_current_position];
     239             : 
     240             :   MemoryUtils::Stats stats;
     241   100829371 :   auto memory_success = MemoryUtils::getMemoryStats(stats);
     242             : 
     243   100829371 :   long int current_memory = 0;
     244             : 
     245   100829371 :   if (memory_success)
     246   100829371 :     current_memory =
     247   100829371 :         MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     248           0 :   else if (_current_position !=
     249             :            -1) // If we weren't able to get the memory stats, let's just use the start memory
     250           0 :     current_memory = _stack[_current_position]->startMemory();
     251             : 
     252   100829371 :   current_node->addTimeAndMemory(current_time, current_memory);
     253             : 
     254   100829371 :   _current_position--;
     255             : 
     256             :   // Add this to the exection list
     257   175378045 :   if ((_pid == 0 && !_disable_live_print) &&
     258    74548674 :       (!_perf_graph_registry.readSectionInfo(current_node->id())._live_message.empty() ||
     259    46400782 :        _live_print_all))
     260             :   {
     261    28147892 :     addToExecutionList(current_node->id(), IncrementState::FINISHED, current_time, current_memory);
     262             : 
     263             :     // Tell the printing thread that a section has finished
     264             :     //
     265             :     // Note: no mutex is needed here because we're using an atomic
     266             :     // in the predicate of the condition_variable in the thread
     267             :     // This is technically correct - but there is a chance of missing a signal
     268             :     // For us - that chance is low and doesn't matter (the timeout will just be hit
     269             :     // instead). So - I would rather not have an extra lock here in the main thread.
     270    28147892 :     _finished_section.notify_one();
     271             :   }
     272             : }
     273             : 
     274             : void
     275      111911 : PerfGraph::update()
     276             : {
     277             :   // First update all of the currently running nodes
     278      111911 :   auto now = std::chrono::steady_clock::now();
     279             : 
     280             :   MemoryUtils::Stats stats;
     281      111911 :   MemoryUtils::getMemoryStats(stats);
     282             :   auto now_memory =
     283      111911 :       MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     284             : 
     285      630342 :   for (int i = 0; i <= _current_position; i++)
     286             :   {
     287      518431 :     auto node = _stack[i];
     288      518431 :     node->addTimeAndMemory(now, now_memory);
     289      518431 :     node->setStartTimeAndMemory(now, now_memory);
     290             :   }
     291             : 
     292             :   // Zero out the entries
     293     7141501 :   for (auto & section_time_it : _cumulative_section_info)
     294             :   {
     295     7029590 :     auto & section_time = section_time_it.second;
     296             : 
     297     7029590 :     section_time._num_calls = 0;
     298     7029590 :     section_time._self = 0.;
     299     7029590 :     section_time._children = 0.;
     300     7029590 :     section_time._total = 0.;
     301     7029590 :     section_time._self_memory = 0;
     302     7029590 :     section_time._children_memory = 0;
     303     7029590 :     section_time._total_memory = 0.;
     304             :   }
     305             : 
     306      111911 :   recursivelyUpdate(*_root_node);
     307             : 
     308             :   // Update vector pointing to section times
     309             :   // Note: we are doing this _after_ recursively filling
     310             :   // because new entries may have been created
     311      111911 :   _cumulative_section_info_ptrs.resize(_perf_graph_registry.numSections());
     312             : 
     313    14515738 :   for (auto & section_time_it : _cumulative_section_info)
     314             :   {
     315    14403827 :     auto id = _perf_graph_registry.sectionID(section_time_it.first);
     316             : 
     317    14403827 :     _cumulative_section_info_ptrs[id] = &section_time_it.second;
     318             :   }
     319      111911 : }
     320             : 
     321             : void
     322    17593030 : PerfGraph::recursivelyUpdate(const PerfNode & current_node)
     323             : {
     324    17593030 :   const auto & section_name = _perf_graph_registry.readSectionInfo(current_node.id())._name;
     325             : 
     326             :   // RHS insertion on purpose
     327    17593030 :   auto & section_time = _cumulative_section_info[section_name];
     328             : 
     329    17593030 :   section_time._self += current_node.selfTimeSec();
     330    17593030 :   section_time._children += current_node.childrenTimeSec();
     331    17593030 :   section_time._total += current_node.totalTimeSec();
     332    17593030 :   section_time._num_calls += current_node.numCalls();
     333             : 
     334    17593030 :   section_time._self_memory += current_node.selfMemory();
     335    17593030 :   section_time._children_memory += current_node.childrenMemory();
     336    17593030 :   section_time._total_memory += current_node.totalMemory();
     337             : 
     338    35074149 :   for (auto & child_it : current_node.children())
     339    17481119 :     recursivelyUpdate(*child_it.second);
     340    17593030 : }
     341             : 
     342             : PerfGraph::FullTable
     343       52274 : PerfGraph::treeTable(const unsigned int level, const bool heaviest /* = false */)
     344             : {
     345       52274 :   update();
     346             : 
     347             :   FullTable vtable({"Section",
     348             :                     "Calls",
     349             :                     "Self(s)",
     350             :                     "Avg(s)",
     351             :                     "%",
     352             :                     "Mem(MB)",
     353             :                     "Total(s)",
     354             :                     "Avg(s)",
     355             :                     "%",
     356             :                     "Mem(MB)"},
     357      575014 :                    10);
     358             : 
     359       52274 :   vtable.setColumnFormat({
     360             :       VariadicTableColumnFormat::AUTO,    // Section Name
     361             :       VariadicTableColumnFormat::AUTO,    // Calls
     362             :       VariadicTableColumnFormat::FIXED,   // Self
     363             :       VariadicTableColumnFormat::FIXED,   // Avg.
     364             :       VariadicTableColumnFormat::PERCENT, // %
     365             :       VariadicTableColumnFormat::AUTO,    // Memory
     366             :       VariadicTableColumnFormat::FIXED,   // Total
     367             :       VariadicTableColumnFormat::FIXED,   // Avg.
     368             :       VariadicTableColumnFormat::PERCENT, // %
     369             :       VariadicTableColumnFormat::AUTO,    // Memory
     370             :   });
     371             : 
     372       52274 :   vtable.setColumnPrecision({
     373             :       1, // Section Name
     374             :       0, // Calls
     375             :       3, // Self
     376             :       3, // Avg.
     377             :       2, // %
     378             :       0, // Memory
     379             :       3, // Total
     380             :       3, // Avg.
     381             :       2, // %
     382             :       0, // Memory
     383             :   });
     384             : 
     385      883151 :   auto act = [this, &vtable](const PerfNode & node,
     386             :                              const moose::internal::PerfGraphSectionInfo & section_info,
     387     2649453 :                              const unsigned int depth)
     388             :   {
     389     2649453 :     vtable.addRow(std::string(depth * 2, ' ') + section_info._name,        // Section Name
     390             :                   node.numCalls(),                                         // Calls
     391             :                   node.selfTimeSec(),                                      // Self
     392             :                   node.selfTimeAvg(),                                      // Avg.
     393      883151 :                   100. * node.selfTimeSec() / _root_node->totalTimeSec(),  // %
     394             :                   node.selfMemory(),                                       // Memory
     395             :                   node.totalTimeSec(),                                     // Total
     396             :                   node.totalTimeAvg(),                                     // Avg.
     397      883151 :                   100. * node.totalTimeSec() / _root_node->totalTimeSec(), // %
     398             :                   node.totalMemory());                                     // Memory
     399      883151 :   };
     400       52274 :   treeRecurse(act, level, heaviest);
     401             : 
     402      104548 :   return vtable;
     403      104548 : }
     404             : 
     405             : void
     406       52252 : PerfGraph::print(const ConsoleStream & console, unsigned int level)
     407             : {
     408       52252 :   console << "\nPerformance Graph:\n";
     409       52252 :   treeTable(level).print(console);
     410       52252 : }
     411             : 
     412             : void
     413          22 : PerfGraph::printHeaviestBranch(const ConsoleStream & console)
     414             : {
     415          22 :   console << "\nHeaviest Branch:\n";
     416          22 :   treeTable(MOOSE_MAX_STACK_SIZE, /* heaviest = */ true).print(console);
     417          22 : }
     418             : 
     419             : void
     420          22 : PerfGraph::printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections)
     421             : {
     422          22 :   update();
     423             : 
     424          22 :   console << "\nHeaviest Sections:\n";
     425             : 
     426             :   // Indirect Sort The Self Time
     427          22 :   std::vector<size_t> sorted;
     428          22 :   Moose::indirectSort(_cumulative_section_info_ptrs.begin(),
     429             :                       _cumulative_section_info_ptrs.end(),
     430             :                       sorted,
     431       25234 :                       [](CumulativeSectionInfo * lhs, CumulativeSectionInfo * rhs)
     432             :                       {
     433       25234 :                         if (lhs && rhs)
     434       25022 :                           return lhs->_self > rhs->_self;
     435             : 
     436             :                         // If the LHS exists - it's definitely bigger than a non-existant RHS
     437         212 :                         if (lhs)
     438         131 :                           return true;
     439             : 
     440             :                         // Both don't exist - so it doesn't matter how we sort them
     441          81 :                         return false;
     442             :                       });
     443             : 
     444         154 :   HeaviestTable vtable({"Section", "Calls", "Self(s)", "Avg.", "%", "Mem(MB)"}, 10);
     445             : 
     446          22 :   vtable.setColumnFormat({VariadicTableColumnFormat::AUTO,    // Section; doesn't matter
     447             :                           VariadicTableColumnFormat::AUTO,    // Calls
     448             :                           VariadicTableColumnFormat::FIXED,   // Time
     449             :                           VariadicTableColumnFormat::FIXED,   // Avg.
     450             :                           VariadicTableColumnFormat::PERCENT, // Percent
     451             :                           VariadicTableColumnFormat::AUTO}    // Memory
     452             :   );
     453             : 
     454          22 :   vtable.setColumnPrecision({
     455             :       1, // Section
     456             :       1, // Calls
     457             :       3, // Time
     458             :       3, // Avg.
     459             :       2, // Percent
     460             :       1  // Memory
     461             :   });
     462             : 
     463             :   mooseAssert(!_cumulative_section_info_ptrs.empty(),
     464             :               "update() must be run before printHeaviestSections()!");
     465             : 
     466             :   // The total time of the root node
     467          22 :   auto total_root_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
     468             : 
     469             :   // Now print out the largest ones
     470         182 :   for (unsigned int i = 0; i < num_sections; i++)
     471             :   {
     472         160 :     auto id = sorted[i];
     473             : 
     474         160 :     if (!_cumulative_section_info_ptrs[id])
     475           0 :       continue;
     476             : 
     477         160 :     const auto & entry = *_cumulative_section_info_ptrs[id];
     478         160 :     vtable.addRow(_perf_graph_registry.sectionInfo(id)._name,        // Section
     479         160 :                   entry._num_calls,                                  // Calls
     480         160 :                   entry._self,                                       // Time
     481         160 :                   entry._self / static_cast<Real>(entry._num_calls), // Avg.
     482         160 :                   100. * entry._self / total_root_time,              // Percent
     483         160 :                   entry._self_memory);                               // Memory
     484             :   }
     485             : 
     486          22 :   vtable.print(console);
     487          66 : }
     488             : 
     489             : void
     490       45214 : dataStore(std::ostream & stream, PerfGraph & perf_graph, void *)
     491             : {
     492             :   // We need to store the registry id -> section info map so that we can add
     493             :   // registered sections that may not be added yet during recover
     494       45214 :   dataStore(stream, perf_graph._perf_graph_registry._id_to_item, nullptr);
     495             : 
     496             :   // Update before serializing the nodes so that the time/memory/calls are correct
     497       45214 :   perf_graph.update();
     498             : 
     499             :   // Recursively serialize all of the nodes
     500       45214 :   dataStore(stream, perf_graph._root_node, nullptr);
     501       45214 : }
     502             : 
     503             : void
     504       13740 : dataLoad(std::istream & stream, PerfGraph & perf_graph, void *)
     505             : {
     506             :   // Load in all of the recovered sections and register those that do not exist yet
     507       13740 :   std::vector<moose::internal::PerfGraphSectionInfo> recovered_section_info;
     508       13740 :   dataLoad(stream, recovered_section_info, nullptr);
     509     2220748 :   for (const auto & info : recovered_section_info)
     510             :   {
     511     2207008 :     if (info._live_message.size())
     512      985744 :       perf_graph._perf_graph_registry.registerSection(
     513      985744 :           info._name, info._level, info._live_message, info._print_dots);
     514             :     else
     515     1221264 :       perf_graph._perf_graph_registry.registerSection(info._name, info._level);
     516             :   }
     517             : 
     518             :   // Update the current node time/memory/calls before loading the nodes as the load
     519             :   // will append information to current nodes that exist
     520       13740 :   perf_graph.update();
     521             : 
     522             :   // Recursively load all of the nodes; this will append information to matching nodes
     523             :   // and will create new nodes for section paths that do not exist
     524       13740 :   dataLoad(stream, perf_graph._root_node, &perf_graph);
     525       13740 : }

Generated by: LCOV version 1.14