LCOV - code coverage report
Current view: top level - src/utils - PerfGraph.C (source / functions) Hit Total Coverage
Test: idaholab/moose framework: #32971 (54bef8) with base c6cf66 Lines: 227 237 95.8 %
Date: 2026-05-29 20:35:17 Functions: 20 20 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //* This file is part of the MOOSE framework
       2             : //* https://mooseframework.inl.gov
       3             : //*
       4             : //* All rights reserved, see COPYRIGHT for full restrictions
       5             : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
       6             : //*
       7             : //* Licensed under LGPL 2.1, please see LICENSE for details
       8             : //* https://www.gnu.org/licenses/lgpl-2.1.html
       9             : 
      10             : #include "PerfGraph.h"
      11             : 
      12             : // MOOSE Includes
      13             : #include "PerfGuard.h"
      14             : #include "MooseError.h"
      15             : #include "PerfGraphLivePrint.h"
      16             : #include "PerfGraphRegistry.h"
      17             : #include "MooseApp.h"
      18             : 
      19             : // Note: do everything we can to make sure this only gets #included
      20             : // in the .C file... this is a heavily templated header that we
      21             : // don't want to expose to EVERY file in MOOSE...
      22             : #include "VariadicTable.h"
      23             : 
      24             : // System Includes
      25             : #include <chrono>
      26             : #include <memory>
      27             : 
      28       66995 : PerfGraph::PerfGraph(const std::string & root_name,
      29             :                      MooseApp & app,
      30             :                      const bool live_all,
      31       66995 :                      const bool perf_graph_live)
      32             :   : ConsoleStreamInterface(app),
      33       66995 :     _moose_app(app),
      34       66995 :     _live_print_all(live_all),
      35       66995 :     _disable_live_print(!perf_graph_live),
      36       66995 :     _perf_graph_registry(moose::internal::getPerfGraphRegistry()),
      37       66995 :     _pid(app.comm().rank()),
      38       66995 :     _root_name(root_name),
      39       66995 :     _root_node_id(_perf_graph_registry.registerSection(root_name, 0)),
      40       66995 :     _root_node(std::make_unique<PerfNode>(_root_node_id)),
      41       66995 :     _current_position(-1),
      42       66995 :     _stack(),
      43       66995 :     _execution_list_begin(0),
      44       66995 :     _execution_list_end(0),
      45       66995 :     _max_memory(0),
      46       66995 :     _active(true),
      47       66995 :     _destructing(false),
      48       66995 :     _live_print_time_limit(5.0),
      49       66995 :     _live_print_mem_limit(100),
      50      334975 :     _live_print(std::make_unique<PerfGraphLivePrint>(*this, app))
      51             : {
      52       66995 :   push(_root_node_id);
      53       66995 : }
      54             : 
      55       62876 : PerfGraph::~PerfGraph() { disableLivePrint(); }
      56             : 
      57             : void
      58       66993 : PerfGraph::enableLivePrint()
      59             : {
      60       66993 :   if (_pid == 0 && !_disable_live_print)
      61             :   {
      62             :     // Start the printing thread
      63        4636 :     _print_thread = std::thread([this] { this->_live_print->start(); });
      64             :   }
      65       66993 : }
      66             : 
      67             : void
      68       63127 : PerfGraph::disableLivePrint()
      69             : {
      70       63127 :   if (_pid == 0 && !_disable_live_print)
      71             :   {
      72             :     {
      73             :       // Unlike using atomics for execution_thread_end
      74             :       // here we actually lock to ensure that either the print thread
      75             :       // immediately sees that we are destructing or is immediately
      76             :       // notified with the below notification.  Without doing this
      77             :       // it would be possible (but unlikely) for the print thread to
      78             :       // hang for 1 second at the end of execution (which would not be
      79             :       // good anytime you are running lots of fast calculations back-to-back
      80             :       // like during testing or stochastic sampling).
      81        2315 :       std::lock_guard<std::mutex> lock(_destructing_mutex);
      82        2315 :       _destructing = true;
      83        2315 :     }
      84             : 
      85        2315 :     _finished_section.notify_one();
      86             : 
      87        2315 :     _print_thread.join();
      88             : 
      89        2315 :     _disable_live_print = true;
      90             :   }
      91       63127 : }
      92             : 
      93             : Real
      94         564 : PerfGraph::sectionData(const DataType type,
      95             :                        const std::string & section_name,
      96             :                        const bool must_exist /* = true */)
      97             : {
      98         564 :   update();
      99             : 
     100             :   const auto section_it =
     101         564 :       _cumulative_section_info.find(section_name == "Root" ? _root_name : section_name);
     102             : 
     103         564 :   if (section_it == _cumulative_section_info.end())
     104             :   {
     105          17 :     if (!must_exist ||                                   // isn't required to exist
     106           3 :         _perf_graph_registry.sectionExists(section_name) // or, is required to exist and it does
     107             :     )
     108          11 :       return 0;
     109             : 
     110           3 :     mooseError("Unknown PerfGraph section name \"",
     111             :                section_name,
     112             :                "\" in PerfGraph::sectionData().\nIf you are attempting to retrieve the root use "
     113             :                "\"Root\".");
     114             :   }
     115             : 
     116         550 :   const CumulativeSectionInfo & section_info = section_it->second;
     117             : 
     118         550 :   if (type == CALLS)
     119          62 :     return section_info._num_calls;
     120             : 
     121         488 :   const auto app_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
     122             : 
     123         488 :   switch (type)
     124             :   {
     125          31 :     case SELF:
     126          31 :       return section_info._self;
     127          11 :     case CHILDREN:
     128          11 :       return section_info._children;
     129         267 :     case TOTAL:
     130         267 :       return section_info._total;
     131          11 :     case SELF_AVG:
     132          11 :       return section_info._self / static_cast<Real>(section_info._num_calls);
     133          11 :     case CHILDREN_AVG:
     134          11 :       return section_info._children / static_cast<Real>(section_info._num_calls);
     135          51 :     case TOTAL_AVG:
     136          51 :       return section_info._total / static_cast<Real>(section_info._num_calls);
     137          31 :     case SELF_PERCENT:
     138          31 :       return 100. * (section_info._self / app_time);
     139          11 :     case CHILDREN_PERCENT:
     140          11 :       return 100. * (section_info._children / app_time);
     141          31 :     case TOTAL_PERCENT:
     142          31 :       return 100. * (section_info._total / app_time);
     143          11 :     case SELF_MEMORY:
     144          11 :       return section_info._self_memory;
     145          11 :     case CHILDREN_MEMORY:
     146          11 :       return section_info._children_memory;
     147          11 :     case TOTAL_MEMORY:
     148          11 :       return section_info._total_memory;
     149           0 :     default:
     150           0 :       ::mooseError("Unknown DataType");
     151             :   }
     152             : }
     153             : 
     154             : void
     155      406452 : PerfGraph::addToExecutionList(const PerfID id,
     156             :                               const IncrementState state,
     157             :                               const std::chrono::time_point<std::chrono::steady_clock> time,
     158             :                               const long int memory)
     159             : {
     160      406452 :   auto & section_increment = _execution_list[_execution_list_end];
     161             : 
     162      406452 :   section_increment._id = id;
     163      406452 :   section_increment._state = state;
     164      406452 :   section_increment._time = time;
     165      406452 :   section_increment._memory = memory;
     166      406452 :   section_increment._beginning_num_printed = _console.numPrinted();
     167             : 
     168             :   // A note about this next section of code:
     169             :   // It is only EVER run on the main thread - and therefore there can be
     170             :   // no race conditions.  All that is important here is that the print
     171             :   // thread always sees a consistent value for _execution_list_end
     172      406452 :   auto next_execution_list_end = _execution_list_end + 1;
     173             : 
     174             :   // Are we at the end of our circular buffer?
     175      406452 :   if (next_execution_list_end >= MAX_EXECUTION_LIST_SIZE)
     176           0 :     next_execution_list_end = 0;
     177             : 
     178             :   // This "release" will synchronize the above memory changes with the
     179             :   // "acquire" in the printing thread
     180             :   // All of the above memory operations will be seen by the
     181             :   // printing thread before the printing thread sees this new value
     182      406452 :   _execution_list_end.store(next_execution_list_end, std::memory_order_release);
     183      406452 : }
     184             : 
     185             : void
     186    85909669 : PerfGraph::push(const PerfID id)
     187             : {
     188    85909669 :   if (!_active)
     189        3397 :     return;
     190             : 
     191    85906272 :   PerfNode * new_node = nullptr;
     192             : 
     193    85906272 :   if (id == _root_node_id)
     194       66995 :     new_node = _root_node.get();
     195             :   else
     196    85839277 :     new_node = _stack[_current_position]->getChild(id);
     197             : 
     198             :   MemoryUtils::Stats stats;
     199    85906272 :   auto memory_success = MemoryUtils::getMemoryStats(stats);
     200             : 
     201    85906272 :   long int start_memory = 0;
     202             : 
     203    85906272 :   if (memory_success)
     204             :   {
     205             :     const auto memory_mb =
     206    85906272 :         MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     207    85906272 :     start_memory = memory_mb;
     208    85906272 :     updateMaxMemory(memory_mb);
     209             :   }
     210             :   // If we weren't able to get the memory stats, let's just use the parent's
     211           0 :   else if (_current_position != -1)
     212           0 :     start_memory = _stack[_current_position]->startMemory();
     213             : 
     214             :   // Set the start time
     215    85906272 :   auto current_time = std::chrono::steady_clock::now();
     216             : 
     217    85906272 :   new_node->setStartTimeAndMemory(current_time, start_memory);
     218             : 
     219             :   // Increment the number of calls
     220    85906272 :   new_node->incrementNumCalls();
     221             : 
     222    85906272 :   _current_position++;
     223             : 
     224    85906272 :   if (_current_position >= MOOSE_MAX_STACK_SIZE)
     225           0 :     mooseError("PerfGraph is out of stack space!");
     226             : 
     227    85906272 :   _stack[_current_position] = new_node;
     228             : 
     229             :   // Add this to the execution list unless the message is empty - but pre-emted by live_print_all
     230    86497324 :   if ((_pid == 0 && !_disable_live_print) &&
     231      591052 :       (!_perf_graph_registry.readSectionInfo(id)._live_message.empty() || _live_print_all))
     232      203423 :     addToExecutionList(id, IncrementState::STARTED, current_time, start_memory);
     233             : }
     234             : 
     235             : void
     236    85828289 : PerfGraph::pop()
     237             : {
     238    85828289 :   if (!_active)
     239        3433 :     return;
     240             : 
     241    85824856 :   auto current_time = std::chrono::steady_clock::now();
     242             : 
     243    85824856 :   auto & current_node = _stack[_current_position];
     244             : 
     245             :   MemoryUtils::Stats stats;
     246    85824856 :   auto memory_success = MemoryUtils::getMemoryStats(stats);
     247             : 
     248    85824856 :   long int current_memory = 0;
     249             : 
     250    85824856 :   if (memory_success)
     251             :   {
     252             :     const auto memory_mb =
     253    85824856 :         MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     254    85824856 :     current_memory = memory_mb;
     255    85824856 :     updateMaxMemory(memory_mb);
     256             :   }
     257             :   // If we weren't able to get the memory stats, let's just use the start memory
     258           0 :   else if (_current_position !=
     259             :            -1) // If we weren't able to get the memory stats, let's just use the start memory
     260           0 :     current_memory = _stack[_current_position]->startMemory();
     261             : 
     262    85824856 :   current_node->addTimeAndMemory(current_time, current_memory);
     263             : 
     264    85824856 :   _current_position--;
     265             : 
     266             :   // Add this to the exection list
     267    86412996 :   if ((_pid == 0 && !_disable_live_print) &&
     268      588140 :       (!_perf_graph_registry.readSectionInfo(current_node->id())._live_message.empty() ||
     269      385111 :        _live_print_all))
     270             :   {
     271      203029 :     addToExecutionList(current_node->id(), IncrementState::FINISHED, current_time, current_memory);
     272             : 
     273             :     // Tell the printing thread that a section has finished
     274             :     //
     275             :     // Note: no mutex is needed here because we're using an atomic
     276             :     // in the predicate of the condition_variable in the thread
     277             :     // This is technically correct - but there is a chance of missing a signal
     278             :     // For us - that chance is low and doesn't matter (the timeout will just be hit
     279             :     // instead). So - I would rather not have an extra lock here in the main thread.
     280      203029 :     _finished_section.notify_one();
     281             :   }
     282             : }
     283             : 
     284             : void
     285      112801 : PerfGraph::update()
     286             : {
     287             :   // First update all of the currently running nodes
     288      112801 :   auto now = std::chrono::steady_clock::now();
     289             : 
     290             :   MemoryUtils::Stats stats;
     291      112801 :   MemoryUtils::getMemoryStats(stats);
     292             :   auto now_memory =
     293      112801 :       MemoryUtils::convertBytes(stats._physical_memory, MemoryUtils::MemUnits::Megabytes);
     294             : 
     295      650014 :   for (int i = 0; i <= _current_position; i++)
     296             :   {
     297      537213 :     auto node = _stack[i];
     298      537213 :     node->addTimeAndMemory(now, now_memory);
     299      537213 :     node->setStartTimeAndMemory(now, now_memory);
     300             :   }
     301             : 
     302             :   // Zero out the entries
     303     7145900 :   for (auto & section_time_it : _cumulative_section_info)
     304             :   {
     305     7033099 :     auto & section_time = section_time_it.second;
     306             : 
     307     7033099 :     section_time._num_calls = 0;
     308     7033099 :     section_time._self = 0.;
     309     7033099 :     section_time._children = 0.;
     310     7033099 :     section_time._total = 0.;
     311     7033099 :     section_time._self_memory = 0;
     312     7033099 :     section_time._children_memory = 0;
     313     7033099 :     section_time._total_memory = 0.;
     314             :   }
     315             : 
     316      112801 :   recursivelyUpdate(*_root_node);
     317             : 
     318             :   // Update vector pointing to section times
     319             :   // Note: we are doing this _after_ recursively filling
     320             :   // because new entries may have been created
     321      112801 :   _cumulative_section_info_ptrs.resize(_perf_graph_registry.numSections());
     322             : 
     323    14813216 :   for (auto & section_time_it : _cumulative_section_info)
     324             :   {
     325    14700415 :     auto id = _perf_graph_registry.sectionID(section_time_it.first);
     326             : 
     327    14700415 :     _cumulative_section_info_ptrs[id] = &section_time_it.second;
     328             :   }
     329      112801 : }
     330             : 
     331             : void
     332    17539997 : PerfGraph::recursivelyUpdate(const PerfNode & current_node)
     333             : {
     334    17539997 :   const auto & section_name = _perf_graph_registry.readSectionInfo(current_node.id())._name;
     335             : 
     336             :   // RHS insertion on purpose
     337    17539997 :   auto & section_time = _cumulative_section_info[section_name];
     338             : 
     339    17539997 :   section_time._self += current_node.selfTimeSec();
     340    17539997 :   section_time._children += current_node.childrenTimeSec();
     341    17539997 :   section_time._total += current_node.totalTimeSec();
     342    17539997 :   section_time._num_calls += current_node.numCalls();
     343             : 
     344    17539997 :   section_time._self_memory += current_node.selfMemory();
     345    17539997 :   section_time._children_memory += current_node.childrenMemory();
     346    17539997 :   section_time._total_memory += current_node.totalMemory();
     347             : 
     348    34967193 :   for (auto & child_it : current_node.children())
     349    17427196 :     recursivelyUpdate(*child_it.second);
     350    17539997 : }
     351             : 
     352             : PerfGraph::FullTable
     353       53346 : PerfGraph::treeTable(const unsigned int level, const bool heaviest /* = false */)
     354             : {
     355       53346 :   update();
     356             : 
     357             :   FullTable vtable({"Section",
     358             :                     "Calls",
     359             :                     "Self(s)",
     360             :                     "Avg(s)",
     361             :                     "%",
     362             :                     "Mem(MB)",
     363             :                     "Total(s)",
     364             :                     "Avg(s)",
     365             :                     "%",
     366             :                     "Mem(MB)"},
     367      106692 :                    10);
     368             : 
     369      106692 :   vtable.setColumnFormat({
     370             :       VariadicTableColumnFormat::AUTO,    // Section Name
     371             :       VariadicTableColumnFormat::AUTO,    // Calls
     372             :       VariadicTableColumnFormat::FIXED,   // Self
     373             :       VariadicTableColumnFormat::FIXED,   // Avg.
     374             :       VariadicTableColumnFormat::PERCENT, // %
     375             :       VariadicTableColumnFormat::AUTO,    // Memory
     376             :       VariadicTableColumnFormat::FIXED,   // Total
     377             :       VariadicTableColumnFormat::FIXED,   // Avg.
     378             :       VariadicTableColumnFormat::PERCENT, // %
     379             :       VariadicTableColumnFormat::AUTO,    // Memory
     380             :   });
     381             : 
     382      106692 :   vtable.setColumnPrecision({
     383             :       1, // Section Name
     384             :       0, // Calls
     385             :       3, // Self
     386             :       3, // Avg.
     387             :       2, // %
     388             :       0, // Memory
     389             :       3, // Total
     390             :       3, // Avg.
     391             :       2, // %
     392             :       0, // Memory
     393             :   });
     394             : 
     395      666378 :   auto act = [this, &vtable](const PerfNode & node,
     396             :                              const moose::internal::PerfGraphSectionInfo & section_info,
     397             :                              const unsigned int depth)
     398             :   {
     399     2665512 :     vtable.addRow(std::string(depth * 2, ' ') + section_info._name,        // Section Name
     400             :                   node.numCalls(),                                         // Calls
     401             :                   node.selfTimeSec(),                                      // Self
     402             :                   node.selfTimeAvg(),                                      // Avg.
     403      666378 :                   100. * node.selfTimeSec() / _root_node->totalTimeSec(),  // %
     404             :                   node.selfMemory(),                                       // Memory
     405             :                   node.totalTimeSec(),                                     // Total
     406             :                   node.totalTimeAvg(),                                     // Avg.
     407      666378 :                   100. * node.totalTimeSec() / _root_node->totalTimeSec(), // %
     408             :                   node.totalMemory());                                     // Memory
     409      666378 :   };
     410       53346 :   treeRecurse(act, level, heaviest);
     411             : 
     412      106692 :   return vtable;
     413           0 : }
     414             : 
     415             : void
     416       53326 : PerfGraph::print(const ConsoleStream & console, unsigned int level)
     417             : {
     418       53326 :   console << "\nPerformance Graph:\n";
     419       53326 :   treeTable(level).print(console);
     420       53326 : }
     421             : 
     422             : void
     423          20 : PerfGraph::printHeaviestBranch(const ConsoleStream & console)
     424             : {
     425          20 :   console << "\nHeaviest Branch:\n";
     426          20 :   treeTable(MOOSE_MAX_STACK_SIZE, /* heaviest = */ true).print(console);
     427          20 : }
     428             : 
     429             : void
     430          20 : PerfGraph::printHeaviestSections(const ConsoleStream & console, const unsigned int num_sections)
     431             : {
     432          20 :   update();
     433             : 
     434          20 :   console << "\nHeaviest Sections:\n";
     435             : 
     436             :   // Indirect Sort The Self Time
     437          20 :   std::vector<size_t> sorted;
     438          20 :   Moose::indirectSort(_cumulative_section_info_ptrs.begin(),
     439             :                       _cumulative_section_info_ptrs.end(),
     440             :                       sorted,
     441       17933 :                       [](CumulativeSectionInfo * lhs, CumulativeSectionInfo * rhs)
     442             :                       {
     443       17933 :                         if (lhs && rhs)
     444       17761 :                           return lhs->_self > rhs->_self;
     445             : 
     446             :                         // If the LHS exists - it's definitely bigger than a non-existant RHS
     447         172 :                         if (lhs)
     448          64 :                           return true;
     449             : 
     450             :                         // Both don't exist - so it doesn't matter how we sort them
     451         108 :                         return false;
     452             :                       });
     453             : 
     454          40 :   HeaviestTable vtable({"Section", "Calls", "Self(s)", "Avg.", "%", "Mem(MB)"}, 10);
     455             : 
     456          40 :   vtable.setColumnFormat({VariadicTableColumnFormat::AUTO,    // Section; doesn't matter
     457             :                           VariadicTableColumnFormat::AUTO,    // Calls
     458             :                           VariadicTableColumnFormat::FIXED,   // Time
     459             :                           VariadicTableColumnFormat::FIXED,   // Avg.
     460             :                           VariadicTableColumnFormat::PERCENT, // Percent
     461             :                           VariadicTableColumnFormat::AUTO}    // Memory
     462             :   );
     463             : 
     464          40 :   vtable.setColumnPrecision({
     465             :       1, // Section
     466             :       1, // Calls
     467             :       3, // Time
     468             :       3, // Avg.
     469             :       2, // Percent
     470             :       1  // Memory
     471             :   });
     472             : 
     473             :   mooseAssert(!_cumulative_section_info_ptrs.empty(),
     474             :               "update() must be run before printHeaviestSections()!");
     475             : 
     476             :   // The total time of the root node
     477          20 :   auto total_root_time = _cumulative_section_info_ptrs[_root_node_id]->_total;
     478             : 
     479             :   // Now print out the largest ones
     480         165 :   for (unsigned int i = 0; i < num_sections; i++)
     481             :   {
     482         145 :     auto id = sorted[i];
     483             : 
     484         145 :     if (!_cumulative_section_info_ptrs[id])
     485           0 :       continue;
     486             : 
     487         145 :     const auto & entry = *_cumulative_section_info_ptrs[id];
     488         145 :     vtable.addRow(_perf_graph_registry.sectionInfo(id)._name,        // Section
     489         145 :                   entry._num_calls,                                  // Calls
     490         145 :                   entry._self,                                       // Time
     491         145 :                   entry._self / static_cast<Real>(entry._num_calls), // Avg.
     492         145 :                   100. * entry._self / total_root_time,              // Percent
     493         145 :                   entry._self_memory);                               // Memory
     494             :   }
     495             : 
     496          20 :   vtable.print(console);
     497          20 : }
     498             : 
     499             : void
     500   171731128 : PerfGraph::updateMaxMemory(const std::size_t current_memory)
     501             : {
     502             :   // We shouldn't need to lock _max_memory in-between getting
     503             :   // it and setting it as this should only ever be set outside
     504             :   // of threads. _max_memory is an atomic so that getting it
     505             :   // is thread safe.
     506   171731128 :   if (current_memory > getMaxMemory())
     507      879843 :     _max_memory = current_memory;
     508   171731128 : }
     509             : 
     510             : void
     511       45637 : dataStore(std::ostream & stream, PerfGraph & perf_graph, void *)
     512             : {
     513             :   // We need to store the registry id -> section info map so that we can add
     514             :   // registered sections that may not be added yet during recover
     515       45637 :   dataStore(stream, perf_graph._perf_graph_registry._id_to_item, nullptr);
     516             : 
     517             :   // Update before serializing the nodes so that the time/memory/calls are correct
     518       45637 :   perf_graph.update();
     519             : 
     520             :   // Recursively serialize all of the nodes
     521       45637 :   dataStore(stream, perf_graph._root_node, nullptr);
     522       45637 : }
     523             : 
     524             : void
     525       13192 : dataLoad(std::istream & stream, PerfGraph & perf_graph, void *)
     526             : {
     527             :   // Load in all of the recovered sections and register those that do not exist yet
     528       13192 :   std::vector<moose::internal::PerfGraphSectionInfo> recovered_section_info;
     529       13192 :   dataLoad(stream, recovered_section_info, nullptr);
     530     2149055 :   for (const auto & info : recovered_section_info)
     531             :   {
     532     2135863 :     if (info._live_message.size())
     533      942117 :       perf_graph._perf_graph_registry.registerSection(
     534      942117 :           info._name, info._level, info._live_message, info._print_dots);
     535             :     else
     536     1193746 :       perf_graph._perf_graph_registry.registerSection(info._name, info._level);
     537             :   }
     538             : 
     539             :   // Update the current node time/memory/calls before loading the nodes as the load
     540             :   // will append information to current nodes that exist
     541       13192 :   perf_graph.update();
     542             : 
     543             :   // Recursively load all of the nodes; this will append information to matching nodes
     544             :   // and will create new nodes for section paths that do not exist
     545       13192 :   dataLoad(stream, perf_graph._root_node, &perf_graph);
     546       13192 : }

Generated by: LCOV version 1.14