LCOV - code coverage report
Current view: top level - src/utils - PerfGraph.C (source / functions) Hit Total Coverage
Test: idaholab/moose framework: 2bf808 Lines: 223 231 96.5 %
Date: 2025-07-17 01:28:37 Functions: 19 19 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //* This file is part of the MOOSE framework
       2             : //* https://mooseframework.inl.gov
       3             : //*
       4             : //* All rights reserved, see COPYRIGHT for full restrictions
       5             : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
       6             : //*
       7             : //* Licensed under LGPL 2.1, please see LICENSE for details
       8             : //* https://www.gnu.org/licenses/lgpl-2.1.html
       9             : 
      10             : #include "PerfGraph.h"
      11             : 
      12             : // MOOSE Includes
      13             : #include "PerfGuard.h"
      14             : #include "MooseError.h"
      15             : #include "PerfGraphLivePrint.h"
      16             : #include "PerfGraphRegistry.h"
      17             : #include "MooseApp.h"
      18             : 
      19             : // Note: do everything we can to make sure this only gets #included
      20             : // in the .C file... this is a heavily templated header that we
      21             : // don't want to expose to EVERY file in MOOSE...
      22             : #include "VariadicTable.h"
      23             : 
      24             : // System Includes
      25             : #include <chrono>
      26             : #include <memory>
      27             : 
      28       62755 : PerfGraph::PerfGraph(const std::string & root_name,
      29             :                      MooseApp & app,
      30             :                      const bool live_all,
      31       62755 :                      const bool perf_graph_live)
      32             :   : ConsoleStreamInterface(app),
      33       62755 :     _moose_app(app),
      34       62755 :     _live_print_all(live_all),
      35       62755 :     _disable_live_print(!perf_graph_live),
      36       62755 :     _perf_graph_registry(moose::internal::getPerfGraphRegistry()),
      37       62755 :     _pid(app.comm().rank()),
      38       62755 :     _root_name(root_name),
      39       62755 :     _root_node_id(_perf_graph_registry.registerSection(root_name, 0)),
      40       62755 :     _root_node(std::make_unique<PerfNode>(_root_node_id)),
      41       62755 :     _current_position(-1),
      42       62755 :     _stack(),
      43       62755 :     _execution_list_begin(0),
      44       62755 :     _execution_list_end(0),
      45       62755 :     _active(true),
      46       62755 :     _destructing(false),
      47       62755 :     _live_print_time_limit(5.0),
      48       62755 :     _live_print_mem_limit(100),
      49      251020 :     _live_print(std::make_unique<PerfGraphLivePrint>(*this, app))
      50             : {
      51       62755 :   push(_root_node_id);
      52       62755 : }
      53             : 
      54       57254 : PerfGraph::~PerfGraph() { disableLivePrint(); }
      55             : 
      56             : void
      57       62754 : PerfGraph::enableLivePrint()
      58             : {
      59       62754 :   if (_pid == 0 && !_disable_live_print)
      60             :   {
      61             :     // Start the printing thread
      62       97486 :     _print_thread = std::thread([this] { this->_live_print->start(); });
      63             :   }
      64       62754 : }
      65             : 
      66             : void
      67       60373 : PerfGraph::disableLivePrint()
      68             : {
      69       60373 :   if (_pid == 0 && !_disable_live_print)
      70             :   {
      71             :     {
      72             :       // Unlike using atomics for execution_thread_end
      73             :       // here we actually lock to ensure that either the print thread
      74             :       // immediately sees that we are destructing or is immediately
      75             :       // notified with the below notification.  Without doing this
      76             :       // it would be possible (but unlikely) for the print thread to
      77             :       // hang for 1 second at the end of execution (which would not be
      78             :       // good anytime you are running lots of fast calculations back-to-back
      79             :       // like during testing or stochastic sampling).
      80       43334 :       std::lock_guard<std::mutex> lock(_destructing_mutex);
      81       43334 :       _destructing = true;
      82       43334 :     }
      83             : 
      84       43334 :     _finished_section.notify_one();
      85             : 
      86       43334 :     _print_thread.join();
      87             : 
      88       43334 :     _disable_live_print = true;
      89             :   }
      90       60373 : }
      91             : 
      92             : Real
      93         592 : PerfGraph::sectionData(const DataType type,
      94             :                        const std::string & section_name,
      95             :                        const bool must_exist /* = true */)
      96             : {
      97         592 :   update();
      98             : 
      99             :   const auto section_it =
     100         592 :       _cumulative_section_info.find(section_name == "Root" ? _root_name : section_name);
     101             : 
     102         592 :   if (section_it == _cumulative_section_info.end())
     103             :   {
     104          19 :     if (!must_exist ||                                   // isn't required to exist
     105           4 :         _perf_graph_registry.sectionExists(section_name) // or, is required to exist and it does
     106             :     )
     107          11 :       return 0;
     108             : 
     109           4 :     mooseError("Unknown PerfGraph section name \"",
     110             :                section_name,
     111             :                "\" in PerfGraph::sectionData().\nIf you are attempting to retrieve the root use "
     112             :                "\"Root\".");
     113             :   }
     114             : 
     115         577 :   const CumulativeSectionInfo & section_info = section_it->second;
     116             : 
     117         577 :   if (type == CALLS)
     118          64 :     return section_info._num_calls;
     119             : 
     120         513 :   const auto app_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
     121             : 
     122         513 :   switch (type)
     123             :   {
     124          32 :     case SELF:
     125          32 :       return section_info._self;
     126          11 :     case CHILDREN:
     127          11 :       return section_info._children;
     128         287 :     case TOTAL:
     129         287 :       return section_info._total;
     130          11 :     case SELF_AVG:
     131          11 :       return section_info._self / static_cast<Real>(section_info._num_calls);
     132          11 :     case CHILDREN_AVG:
     133          11 :       return section_info._children / static_cast<Real>(section_info._num_calls);
     134          53 :     case TOTAL_AVG:
     135          53 :       return section_info._total / static_cast<Real>(section_info._num_calls);
     136          32 :     case SELF_PERCENT:
     137          32 :       return 100. * (section_info._self / app_time);
     138          11 :     case CHILDREN_PERCENT:
     139          11 :       return 100. * (section_info._children / app_time);
     140          32 :     case TOTAL_PERCENT:
     141          32 :       return 100. * (section_info._total / app_time);
     142          11 :     case SELF_MEMORY:
     143          11 :       return section_info._self_memory;
     144          11 :     case CHILDREN_MEMORY:
     145          11 :       return section_info._children_memory;
     146          11 :     case TOTAL_MEMORY:
     147          11 :       return section_info._total_memory;
     148           0 :     default:
     149           0 :       ::mooseError("Unknown DataType");
     150             :   }
     151             : }
     152             : 
     153             : void
     154    50760578 : PerfGraph::addToExecutionList(const PerfID id,
     155             :                               const IncrementState state,
     156             :                               const std::chrono::time_point<std::chrono::steady_clock> time,
     157             :                               const long int memory)
     158             : {
     159    50760578 :   auto & section_increment = _execution_list[_execution_list_end];
     160             : 
     161    50760578 :   section_increment._id = id;
     162    50760578 :   section_increment._state = state;
     163    50760578 :   section_increment._time = time;
     164    50760578 :   section_increment._memory = memory;
     165    50760578 :   section_increment._beginning_num_printed = _console.numPrinted();
     166             : 
     167             :   // A note about this next section of code:
     168             :   // It is only EVER run on the main thread - and therefore there can be
     169             :   // no race conditions.  All that is important here is that the print
     170             :   // thread always sees a consistent value for _execution_list_end
     171    50760578 :   auto next_execution_list_end = _execution_list_end + 1;
     172             : 
     173             :   // Are we at the end of our circular buffer?
     174    50760578 :   if (next_execution_list_end >= MAX_EXECUTION_LIST_SIZE)
     175        1652 :     next_execution_list_end = 0;
     176             : 
     177             :   // This "release" will synchronize the above memory changes with the
     178             :   // "acquire" in the printing thread
     179             :   // All of the above memory operations will be seen by the
     180             :   // printing thread before the printing thread sees this new value
     181    50760578 :   _execution_list_end.store(next_execution_list_end, std::memory_order_release);
     182    50760578 : }
     183             : 
     184             : void
     185    93319643 : PerfGraph::push(const PerfID id)
     186             : {
     187    93319643 :   if (!_active)
     188        4374 :     return;
     189             : 
     190    93315269 :   PerfNode * new_node = nullptr;
     191             : 
     192    93315269 :   if (id == _root_node_id)
     193       62755 :     new_node = _root_node.get();
     194             :   else
     195    93252514 :     new_node = _stack[_current_position]->getChild(id);
     196             : 
     197             :   MemoryUtils::Stats stats;
     198    93315269 :   auto memory_success = MemoryUtils::getMemoryStats(stats);
     199             : 
     200    93315269 :   long int start_memory = 0;
     201             : 
     202    93315269 :   if (memory_success)
     203    93315269 :     start_memory =
     204    93315269 :         MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     205           0 :   else if (_current_position !=
     206             :            -1) // If we weren't able to get the memory stats, let's just use the parent's
     207           0 :     start_memory = _stack[_current_position]->startMemory();
     208             : 
     209             :   // Set the start time
     210    93315269 :   auto current_time = std::chrono::steady_clock::now();
     211             : 
     212    93315269 :   new_node->setStartTimeAndMemory(current_time, start_memory);
     213             : 
     214             :   // Increment the number of calls
     215    93315269 :   new_node->incrementNumCalls();
     216             : 
     217    93315269 :   _current_position++;
     218             : 
     219    93315269 :   if (_current_position >= MOOSE_MAX_STACK_SIZE)
     220           0 :     mooseError("PerfGraph is out of stack space!");
     221             : 
     222    93315269 :   _stack[_current_position] = new_node;
     223             : 
     224             :   // Add this to the execution list unless the message is empty - but pre-emted by live_print_all
     225   160637963 :   if ((_pid == 0 && !_disable_live_print) &&
     226    67322694 :       (!_perf_graph_registry.readSectionInfo(id)._live_message.empty() || _live_print_all))
     227    25385355 :     addToExecutionList(id, IncrementState::STARTED, current_time, start_memory);
     228             : }
     229             : 
     230             : void
     231    93239038 : PerfGraph::pop()
     232             : {
     233    93239038 :   if (!_active)
     234        4414 :     return;
     235             : 
     236    93234624 :   auto current_time = std::chrono::steady_clock::now();
     237             : 
     238    93234624 :   auto & current_node = _stack[_current_position];
     239             : 
     240             :   MemoryUtils::Stats stats;
     241    93234624 :   auto memory_success = MemoryUtils::getMemoryStats(stats);
     242             : 
     243    93234624 :   long int current_memory = 0;
     244             : 
     245    93234624 :   if (memory_success)
     246    93234624 :     current_memory =
     247    93234624 :         MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     248           0 :   else if (_current_position !=
     249             :            -1) // If we weren't able to get the memory stats, let's just use the start memory
     250           0 :     current_memory = _stack[_current_position]->startMemory();
     251             : 
     252    93234624 :   current_node->addTimeAndMemory(current_time, current_memory);
     253             : 
     254    93234624 :   _current_position--;
     255             : 
     256             :   // Add this to the exection list
     257   160484864 :   if ((_pid == 0 && !_disable_live_print) &&
     258    67250240 :       (!_perf_graph_registry.readSectionInfo(current_node->id())._live_message.empty() ||
     259    41875017 :        _live_print_all))
     260             :   {
     261    25375223 :     addToExecutionList(current_node->id(), IncrementState::FINISHED, current_time, current_memory);
     262             : 
     263             :     // Tell the printing thread that a section has finished
     264             :     //
     265             :     // Note: no mutex is needed here because we're using an atomic
     266             :     // in the predicate of the condition_variable in the thread
     267             :     // This is technically correct - but there is a chance of missing a signal
     268             :     // For us - that chance is low and doesn't matter (the timeout will just be hit
     269             :     // instead). So - I would rather not have an extra lock here in the main thread.
     270    25375223 :     _finished_section.notify_one();
     271             :   }
     272             : }
     273             : 
     274             : void
     275      103063 : PerfGraph::update()
     276             : {
     277             :   // First update all of the currently running nodes
     278      103063 :   auto now = std::chrono::steady_clock::now();
     279             : 
     280             :   MemoryUtils::Stats stats;
     281      103063 :   MemoryUtils::getMemoryStats(stats);
     282             :   auto now_memory =
     283      103063 :       MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     284             : 
     285      584022 :   for (int i = 0; i <= _current_position; i++)
     286             :   {
     287      480959 :     auto node = _stack[i];
     288      480959 :     node->addTimeAndMemory(now, now_memory);
     289      480959 :     node->setStartTimeAndMemory(now, now_memory);
     290             :   }
     291             : 
     292             :   // Zero out the entries
     293     6572721 :   for (auto & section_time_it : _cumulative_section_info)
     294             :   {
     295     6469658 :     auto & section_time = section_time_it.second;
     296             : 
     297     6469658 :     section_time._num_calls = 0;
     298     6469658 :     section_time._self = 0.;
     299     6469658 :     section_time._children = 0.;
     300     6469658 :     section_time._total = 0.;
     301     6469658 :     section_time._self_memory = 0;
     302     6469658 :     section_time._children_memory = 0;
     303     6469658 :     section_time._total_memory = 0.;
     304             :   }
     305             : 
     306      103063 :   recursivelyUpdate(*_root_node);
     307             : 
     308             :   // Update vector pointing to section times
     309             :   // Note: we are doing this _after_ recursively filling
     310             :   // because new entries may have been created
     311      103063 :   _cumulative_section_info_ptrs.resize(_perf_graph_registry.numSections());
     312             : 
     313    13361592 :   for (auto & section_time_it : _cumulative_section_info)
     314             :   {
     315    13258529 :     auto id = _perf_graph_registry.sectionID(section_time_it.first);
     316             : 
     317    13258529 :     _cumulative_section_info_ptrs[id] = &section_time_it.second;
     318             :   }
     319      103063 : }
     320             : 
     321             : void
     322    16198262 : PerfGraph::recursivelyUpdate(const PerfNode & current_node)
     323             : {
     324    16198262 :   const auto & section_name = _perf_graph_registry.readSectionInfo(current_node.id())._name;
     325             : 
     326             :   // RHS insertion on purpose
     327    16198262 :   auto & section_time = _cumulative_section_info[section_name];
     328             : 
     329    16198262 :   section_time._self += current_node.selfTimeSec();
     330    16198262 :   section_time._children += current_node.childrenTimeSec();
     331    16198262 :   section_time._total += current_node.totalTimeSec();
     332    16198262 :   section_time._num_calls += current_node.numCalls();
     333             : 
     334    16198262 :   section_time._self_memory += current_node.selfMemory();
     335    16198262 :   section_time._children_memory += current_node.childrenMemory();
     336    16198262 :   section_time._total_memory += current_node.totalMemory();
     337             : 
     338    32293461 :   for (auto & child_it : current_node.children())
     339    16095199 :     recursivelyUpdate(*child_it.second);
     340    16198262 : }
     341             : 
     342             : PerfGraph::FullTable
     343       47826 : PerfGraph::treeTable(const unsigned int level, const bool heaviest /* = false */)
     344             : {
     345       47826 :   update();
     346             : 
     347             :   FullTable vtable({"Section",
     348             :                     "Calls",
     349             :                     "Self(s)",
     350             :                     "Avg(s)",
     351             :                     "%",
     352             :                     "Mem(MB)",
     353             :                     "Total(s)",
     354             :                     "Avg(s)",
     355             :                     "%",
     356             :                     "Mem(MB)"},
     357      526086 :                    10);
     358             : 
     359       47826 :   vtable.setColumnFormat({
     360             :       VariadicTableColumnFormat::AUTO,    // Section Name
     361             :       VariadicTableColumnFormat::AUTO,    // Calls
     362             :       VariadicTableColumnFormat::FIXED,   // Self
     363             :       VariadicTableColumnFormat::FIXED,   // Avg.
     364             :       VariadicTableColumnFormat::PERCENT, // %
     365             :       VariadicTableColumnFormat::AUTO,    // Memory
     366             :       VariadicTableColumnFormat::FIXED,   // Total
     367             :       VariadicTableColumnFormat::FIXED,   // Avg.
     368             :       VariadicTableColumnFormat::PERCENT, // %
     369             :       VariadicTableColumnFormat::AUTO,    // Memory
     370             :   });
     371             : 
     372       47826 :   vtable.setColumnPrecision({
     373             :       1, // Section Name
     374             :       0, // Calls
     375             :       3, // Self
     376             :       3, // Avg.
     377             :       2, // %
     378             :       0, // Memory
     379             :       3, // Total
     380             :       3, // Avg.
     381             :       2, // %
     382             :       0, // Memory
     383             :   });
     384             : 
     385      809387 :   auto act = [this, &vtable](const PerfNode & node,
     386             :                              const moose::internal::PerfGraphSectionInfo & section_info,
     387     2428161 :                              const unsigned int depth)
     388             :   {
     389     2428161 :     vtable.addRow(std::string(depth * 2, ' ') + section_info._name,        // Section Name
     390             :                   node.numCalls(),                                         // Calls
     391             :                   node.selfTimeSec(),                                      // Self
     392             :                   node.selfTimeAvg(),                                      // Avg.
     393      809387 :                   100. * node.selfTimeSec() / _root_node->totalTimeSec(),  // %
     394             :                   node.selfMemory(),                                       // Memory
     395             :                   node.totalTimeSec(),                                     // Total
     396             :                   node.totalTimeAvg(),                                     // Avg.
     397      809387 :                   100. * node.totalTimeSec() / _root_node->totalTimeSec(), // %
     398             :                   node.totalMemory());                                     // Memory
     399      809387 :   };
     400       47826 :   treeRecurse(act, level, heaviest);
     401             : 
     402       95652 :   return vtable;
     403       95652 : }
     404             : 
     405             : void
     406       47805 : PerfGraph::print(const ConsoleStream & console, unsigned int level)
     407             : {
     408       47805 :   console << "\nPerformance Graph:\n";
     409       47805 :   treeTable(level).print(console);
     410       47805 : }
     411             : 
     412             : void
     413          21 : PerfGraph::printHeaviestBranch(const ConsoleStream & console)
     414             : {
     415          21 :   console << "\nHeaviest Branch:\n";
     416          21 :   treeTable(MOOSE_MAX_STACK_SIZE, /* heaviest = */ true).print(console);
     417          21 : }
     418             : 
     419             : void
     420          21 : PerfGraph::printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections)
     421             : {
     422          21 :   update();
     423             : 
     424          21 :   console << "\nHeaviest Sections:\n";
     425             : 
     426             :   // Indirect Sort The Self Time
     427          21 :   std::vector<size_t> sorted;
     428          21 :   Moose::indirectSort(_cumulative_section_info_ptrs.begin(),
     429             :                       _cumulative_section_info_ptrs.end(),
     430             :                       sorted,
     431       23912 :                       [](CumulativeSectionInfo * lhs, CumulativeSectionInfo * rhs)
     432             :                       {
     433       23912 :                         if (lhs && rhs)
     434       23715 :                           return lhs->_self > rhs->_self;
     435             : 
     436             :                         // If the LHS exists - it's definitely bigger than a non-existant RHS
     437         197 :                         if (lhs)
     438         117 :                           return true;
     439             : 
     440             :                         // Both don't exist - so it doesn't matter how we sort them
     441          80 :                         return false;
     442             :                       });
     443             : 
     444         147 :   HeaviestTable vtable({"Section", "Calls", "Self(s)", "Avg.", "%", "Mem(MB)"}, 10);
     445             : 
     446          21 :   vtable.setColumnFormat({VariadicTableColumnFormat::AUTO,    // Section; doesn't matter
     447             :                           VariadicTableColumnFormat::AUTO,    // Calls
     448             :                           VariadicTableColumnFormat::FIXED,   // Time
     449             :                           VariadicTableColumnFormat::FIXED,   // Avg.
     450             :                           VariadicTableColumnFormat::PERCENT, // Percent
     451             :                           VariadicTableColumnFormat::AUTO}    // Memory
     452             :   );
     453             : 
     454          21 :   vtable.setColumnPrecision({
     455             :       1, // Section
     456             :       1, // Calls
     457             :       3, // Time
     458             :       3, // Avg.
     459             :       2, // Percent
     460             :       1  // Memory
     461             :   });
     462             : 
     463             :   mooseAssert(!_cumulative_section_info_ptrs.empty(),
     464             :               "update() must be run before printHeaviestSections()!");
     465             : 
     466             :   // The total time of the root node
     467          21 :   auto total_root_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
     468             : 
     469             :   // Now print out the largest ones
     470         176 :   for (unsigned int i = 0; i < num_sections; i++)
     471             :   {
     472         155 :     auto id = sorted[i];
     473             : 
     474         155 :     if (!_cumulative_section_info_ptrs[id])
     475           0 :       continue;
     476             : 
     477         155 :     const auto & entry = *_cumulative_section_info_ptrs[id];
     478         155 :     vtable.addRow(_perf_graph_registry.sectionInfo(id)._name,        // Section
     479         155 :                   entry._num_calls,                                  // Calls
     480         155 :                   entry._self,                                       // Time
     481         155 :                   entry._self / static_cast<Real>(entry._num_calls), // Avg.
     482         155 :                   100. * entry._self / total_root_time,              // Percent
     483         155 :                   entry._self_memory);                               // Memory
     484             :   }
     485             : 
     486          21 :   vtable.print(console);
     487          63 : }
     488             : 
     489             : void
     490       41839 : dataStore(std::ostream & stream, PerfGraph & perf_graph, void *)
     491             : {
     492             :   // We need to store the registry id -> section info map so that we can add
     493             :   // registered sections that may not be added yet during recover
     494       41839 :   dataStore(stream, perf_graph._perf_graph_registry._id_to_item, nullptr);
     495             : 
     496             :   // Update before serializing the nodes so that the time/memory/calls are correct
     497       41839 :   perf_graph.update();
     498             : 
     499             :   // Recursively serialize all of the nodes
     500       41839 :   dataStore(stream, perf_graph._root_node, nullptr);
     501       41839 : }
     502             : 
     503             : void
     504       12748 : dataLoad(std::istream & stream, PerfGraph & perf_graph, void *)
     505             : {
     506             :   // Load in all of the recovered sections and register those that do not exist yet
     507       12748 :   std::vector<moose::internal::PerfGraphSectionInfo> recovered_section_info;
     508       12748 :   dataLoad(stream, recovered_section_info, nullptr);
     509     2050459 :   for (const auto & info : recovered_section_info)
     510             :   {
     511     2037711 :     if (info._live_message.size())
     512      904203 :       perf_graph._perf_graph_registry.registerSection(
     513      904203 :           info._name, info._level, info._live_message, info._print_dots);
     514             :     else
     515     1133508 :       perf_graph._perf_graph_registry.registerSection(info._name, info._level);
     516             :   }
     517             : 
     518             :   // Update the current node time/memory/calls before loading the nodes as the load
     519             :   // will append information to current nodes that exist
     520       12748 :   perf_graph.update();
     521             : 
     522             :   // Recursively load all of the nodes; this will append information to matching nodes
     523             :   // and will create new nodes for section paths that do not exist
     524       12748 :   dataLoad(stream, perf_graph._root_node, &perf_graph);
     525       12748 : }

Generated by: LCOV version 1.14