LCOV - code coverage report
Current view: top level - src/outputs - Checkpoint.C (source / functions) Hit Total Coverage
Test: idaholab/moose framework: 2bf808 Lines: 121 127 95.3 %
Date: 2025-07-17 01:28:37 Functions: 10 10 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //* This file is part of the MOOSE framework
       2             : //* https://mooseframework.inl.gov
       3             : //*
       4             : //* All rights reserved, see COPYRIGHT for full restrictions
       5             : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
       6             : //*
       7             : //* Licensed under LGPL 2.1, please see LICENSE for details
       8             : //* https://www.gnu.org/licenses/lgpl-2.1.html
       9             : 
      10             : // C++ standard library and POSIX includes
      11             : #include <sstream>
      12             : #include <sys/stat.h>
      13             : 
      14             : #include <system_error>
      15             : 
      16             : // Moose includes
      17             : #include "Checkpoint.h"
      18             : #include "FEProblem.h"
      19             : #include "MooseApp.h"
      20             : #include "MaterialPropertyStorage.h"
      21             : #include "MooseMesh.h"
      22             : #include "MeshMetaDataInterface.h"
      23             : #include "RestartableDataWriter.h"
      24             : 
      25             : #include "libmesh/checkpoint_io.h"
      26             : #include "libmesh/enum_xdr_mode.h"
      27             : #include "libmesh/utility.h"
      28             : 
      29             : using namespace libMesh;
      30             : 
      31             : registerMooseObject("MooseApp", Checkpoint);
      32             : 
      33             : InputParameters
      34      116432 : Checkpoint::validParams()
      35             : {
      36             :   // Get the parameters from the base classes
      37      116432 :   InputParameters params = FileOutput::validParams();
      38             : 
      39             :   // Controls whether the checkpoint will actually run. Should only ever be changed by the
      40             :   // auto-checkpoint created by AutoCheckpointAction, which does not write unless a signal
      41             :   // is received.
      42      116432 :   params.addPrivateParam<CheckpointType>("checkpoint_type", CheckpointType::USER_CREATED);
      43             : 
      44      116432 :   params.addClassDescription("Output for MOOSE recovery checkpoint files.");
      45             : 
      46             :   // Typical checkpoint options
      47      116432 :   params.addParam<unsigned int>("num_files", 2, "Number of the restart files to save");
      48      116432 :   params.addParam<std::string>(
      49             :       "suffix",
      50             :       "cp",
      51             :       "This will be appended to the file_base to create the directory name for checkpoint files.");
      52             :   // For checkpoints, set the wall time output interval to a default of 1 hour (3600 s)
      53      349296 :   params.addParam<Real>(
      54      232864 :       "wall_time_interval", 3600, "The target wall time interval (in seconds) at which to output");
      55             : 
      56             :   // Parameter to turn off wall time checkpoints
      57      349296 :   params.addParam<bool>(
      58      232864 :       "wall_time_checkpoint", true, "Whether to enable checkpoints based on elapsed wall time");
      59             : 
      60             :   // Since it makes the most sense to write checkpoints at the end of time steps,
      61             :   // change the default value of execute_on to TIMESTEP_END
      62      116432 :   ExecFlagEnum & exec_enum = params.set<ExecFlagEnum>("execute_on", true);
      63      232864 :   exec_enum = {EXEC_TIMESTEP_END};
      64             : 
      65      116432 :   return params;
      66      116432 : }
      67             : 
      68       45430 : Checkpoint::Checkpoint(const InputParameters & parameters)
      69             :   : FileOutput(parameters),
      70       45430 :     _checkpoint_type(getParam<CheckpointType>("checkpoint_type")),
      71       45430 :     _num_files(getParam<unsigned int>("num_files")),
      72       90860 :     _suffix(getParam<std::string>("suffix"))
      73             : {
      74             :   // Prevent the checkpoint from executing at any time other than INITIAL,
      75             :   // TIMESTEP_END, and FINAL
      76       45430 :   validateExecuteOn();
      77             : 
      78             :   // The following updates the value of _wall_time_interval if the
      79             :   // '--output-wall-time-interval' command line parameter is used.
      80             :   // If it is not used, _wall_time_interval keeps its current value.
      81             :   // The '--output-wall-time-interval' parameter is necessary for testing
      82             :   // and should only be used in the test suite.
      83       45426 :   Output::setWallTimeIntervalFromCommandLineParam();
      84             : 
      85             :   // We want to do this here so it overrides --output-wall-time-interval
      86       45426 :   if (!getParam<bool>("wall_time_checkpoint"))
      87          20 :     _wall_time_interval = std::numeric_limits<Real>::max();
      88       45426 : }
      89             : 
      90             : std::string
      91       55120 : Checkpoint::filename()
      92             : {
      93             :   // Get the time step with correct zero padding
      94       55120 :   std::ostringstream output;
      95      110240 :   output << directory() << "/" << std::setw(_padding) << std::setprecision(0) << std::setfill('0')
      96       55120 :          << std::right << timeStep();
      97             : 
      98      110240 :   return output.str();
      99       55120 : }
     100             : 
     101             : std::string
     102       65727 : Checkpoint::directory() const
     103             : {
     104      131454 :   return _file_base + "_" + _suffix;
     105             : }
     106             : 
     107             : bool
     108     3222516 : Checkpoint::shouldOutput()
     109             : {
     110             :   // should_output_parent ensures that we output only when _execute_on contains
     111             :   // _current_execute_flag (see Output::shouldOutput), ensuring that we wait
     112             :   // until the end of the timestep to write, preventing the output of an
     113             :   // unconverged solution.
     114     3222516 :   const bool should_output_parent = FileOutput::shouldOutput();
     115     3222516 :   if (!should_output_parent)
     116     3047609 :     return false; // No point in continuing
     117             : 
     118             :   // Check for signal
     119             :   // Reading checkpoint on time step 0 is not supported
     120      174907 :   const bool should_output_signal = (Moose::interrupt_signal_number != 0) && (timeStep() > 0);
     121      174907 :   if (should_output_signal)
     122             :   {
     123          28 :     _console << "Unix signal SIGUSR1 detected. Outputting checkpoint file.\n";
     124             :     // Reset signal number since we output
     125          28 :     Moose::interrupt_signal_number = 0;
     126          28 :     return true;
     127             :   }
     128             : 
     129             :   // Check if enough wall time has elapsed to output
     130      174879 :   const bool should_output_wall_time = _wall_time_since_last_output >= _wall_time_interval;
     131      174879 :   if (should_output_wall_time)
     132         374 :     return true;
     133             : 
     134             :   // At this point, we have checked all automatic checkpoint options. If none
     135             :   // of those triggered, then the only way a checkpoint will still be written
     136             :   // is if the user defined it. If the checkpoint is purely system-created,
     137             :   // go ahead and return false (circumvents default time_step_interval = 1 for
     138             :   // auto checkpoints).
     139      174505 :   if (_checkpoint_type == CheckpointType::SYSTEM_CREATED)
     140      164300 :     return false;
     141             : 
     142             :   // Check if the checkpoint should "normally" output, i.e. if it was created
     143             :   // through the input file
     144       10205 :   const bool should_output = (onInterval() || _current_execute_flag == EXEC_FINAL);
     145             : 
     146       10205 :   return should_output;
     147             : }
     148             : 
     149             : void
     150       10607 : Checkpoint::output()
     151             : {
     152             :   // Create the output directory
     153       10607 :   const auto cp_dir = directory();
     154       10607 :   Utility::mkdir(cp_dir.c_str());
     155             : 
     156             :   // Create the output filename
     157       10607 :   const auto current_file = filename();
     158             : 
     159             :   // Create the libMesh CheckpointIO object
     160       10607 :   MeshBase & mesh = _es_ptr->get_mesh();
     161       10607 :   CheckpointIO io(mesh, false);
     162             : 
     163             :   // Create checkpoint file structure
     164       10607 :   CheckpointFileNames curr_file_struct;
     165             : 
     166       10607 :   curr_file_struct.checkpoint = current_file + _app.checkpointSuffix();
     167             : 
     168             :   // Write the checkpoint file
     169       10607 :   io.write(curr_file_struct.checkpoint);
     170             : 
     171             :   // Write out meta data if there is any (only on processor zero)
     172       10607 :   if (processor_id() == 0)
     173             :   {
     174        9508 :     const auto paths = _app.writeRestartableMetaData(curr_file_struct.checkpoint);
     175        9508 :     curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
     176        9508 :   }
     177             : 
     178             :   // Write out the backup
     179       10607 :   const auto paths = _app.backup(_app.restartFolderBase(current_file));
     180       10603 :   curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
     181             : 
     182             :   // Remove old checkpoint files
     183       10603 :   updateCheckpointFiles(curr_file_struct);
     184       10603 : }
     185             : 
     186             : void
     187       10603 : Checkpoint::updateCheckpointFiles(CheckpointFileNames file_struct)
     188             : {
     189             :   // It is possible to have already written a checkpoint with the same file
     190             :   // names contained in file_struct. If this is the case, file_struct will
     191             :   // already be stored in _file_names. When this happens, the current state of
     192             :   // the simulation is likely different than the state when the duplicately
     193             :   // named checkpoint was last written. Because of this, we want to go ahead and
     194             :   // rewrite the duplicately named checkpoint, overwriting the files
     195             :   // representing the old state. For accurate bookkeeping, we will delete the
     196             :   // existing instance of file_struct from _file_names and re-append it to the
     197             :   // end of _file_names (to keep the order in which checkpoints are written
     198             :   // accurate).
     199             : 
     200       10603 :   const auto it = std::find(_file_names.begin(), _file_names.end(), file_struct);
     201             :   // file_struct was found in _file_names.
     202             :   // Delete it so it can be re-added as the last element.
     203       10603 :   if (it != _file_names.end())
     204         120 :     _file_names.erase(it);
     205             : 
     206       10603 :   _file_names.push_back(file_struct);
     207             : 
     208             :   // Remove the file and the corresponding directory if it's empty
     209       21904 :   const auto remove_file = [this](const std::filesystem::path & path)
     210             :   {
     211       21904 :     std::error_code err;
     212             : 
     213       21904 :     if (!std::filesystem::remove(path, err))
     214           0 :       mooseWarning("Error during the deletion of checkpoint file\n",
     215           0 :                    std::filesystem::absolute(path),
     216             :                    "\n\n",
     217           0 :                    err.message());
     218             : 
     219       21904 :     const auto dir = path.parent_path();
     220       21904 :     if (std::filesystem::is_empty(dir))
     221       10952 :       if (!std::filesystem::remove(dir, err))
     222           0 :         mooseError("Error during the deletion of checkpoint directory\n",
     223           0 :                    std::filesystem::absolute(dir),
     224             :                    "\n\n",
     225           0 :                    err.message());
     226       21904 :   };
     227             : 
     228             :   // Remove unwanted files
     229       10603 :   if (_file_names.size() > _num_files)
     230             :   {
     231             :     // Extract the filenames to be removed
     232        5720 :     CheckpointFileNames delete_files = _file_names.front();
     233             : 
     234             :     // Remove these filenames from the list
     235        5720 :     _file_names.pop_front();
     236             : 
     237             :     // Delete restartable data
     238       27624 :     for (const auto & path : delete_files.restart)
     239       21904 :       remove_file(path);
     240             : 
     241             :     // Delete checkpoint files
     242             :     // This file may not exist so don't worry about checking for success
     243        5720 :     if (processor_id() == 0)
     244        5232 :       CheckpointIO::cleanup(delete_files.checkpoint,
     245        5232 :                             _problem_ptr->mesh().isDistributedMesh() ? comm().size() : 1);
     246        5720 :   }
     247       10603 : }
     248             : 
     249             : void
     250       45430 : Checkpoint::validateExecuteOn() const
     251             : {
     252       45430 :   const auto & execute_on = getParam<ExecFlagEnum>("execute_on");
     253      181720 :   const std::set<ExecFlagType> allowed = {EXEC_INITIAL, EXEC_TIMESTEP_END, EXEC_FINAL};
     254       90856 :   for (const auto & value : execute_on)
     255       45430 :     if (!allowed.count(value))
     256           4 :       paramError("execute_on",
     257             :                  "The exec flag ",
     258             :                  value,
     259             :                  " is not allowed. Allowed flags are INITIAL, TIMESTEP_END, and FINAL.");
     260       90856 : }
     261             : 
     262             : std::stringstream
     263       68927 : Checkpoint::checkpointInfo() const
     264             : {
     265             :   static const unsigned int console_field_width = 27;
     266       68927 :   std::stringstream checkpoint_info;
     267             : 
     268       68927 :   std::string interval_info;
     269       68927 :   if (getParam<bool>("wall_time_checkpoint"))
     270             :   {
     271       68887 :     std::stringstream interval_info_ss;
     272       68887 :     interval_info_ss << "Every " << std::defaultfloat << _wall_time_interval << " s";
     273       68887 :     interval_info = interval_info_ss.str();
     274       68887 :   }
     275             :   else
     276          40 :     interval_info = "Disabled";
     277             : 
     278       68927 :   checkpoint_info << std::left << std::setw(console_field_width)
     279       68927 :                   << "  Wall Time Interval:" << interval_info << "\n";
     280             : 
     281       68927 :   std::string user_info;
     282       68927 :   if (_checkpoint_type == CheckpointType::SYSTEM_CREATED)
     283       63284 :     user_info = "Disabled";
     284             :   else
     285        5643 :     user_info = "Outputs/" + name();
     286             : 
     287       68927 :   checkpoint_info << std::left << std::setw(console_field_width)
     288       68927 :                   << "  User Checkpoint:" << user_info << "\n";
     289             : 
     290       68927 :   if (!((interval_info == "Disabled") && (user_info == "Disabled")))
     291             :   {
     292       68907 :     checkpoint_info << std::left << std::setw(console_field_width)
     293       68907 :                     << "  # Checkpoints Kept:" << std::to_string(_num_files) << "\n";
     294       68907 :     std::string exec_on_values = "";
     295      137814 :     for (const auto & item : _execute_on)
     296       68907 :       exec_on_values += item.name() + " ";
     297       68907 :     checkpoint_info << std::left << std::setw(console_field_width)
     298       68907 :                     << "  Execute On:" << exec_on_values << "\n";
     299       68907 :   }
     300             : 
     301      137854 :   return checkpoint_info;
     302       68927 : }

Generated by: LCOV version 1.14