https://mooseframework.inl.gov
Checkpoint.C
Go to the documentation of this file.
1 //* This file is part of the MOOSE framework
2 //* https://mooseframework.inl.gov
3 //*
4 //* All rights reserved, see COPYRIGHT for full restrictions
5 //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 //*
7 //* Licensed under LGPL 2.1, please see LICENSE for details
8 //* https://www.gnu.org/licenses/lgpl-2.1.html
9 
10 // C++ standard library and POSIX includes
11 #include <sstream>
12 #include <sys/stat.h>
13 
14 #include <system_error>
15 
16 // Moose includes
17 #include "Checkpoint.h"
18 #include "FEProblem.h"
19 #include "MooseApp.h"
21 #include "MooseMesh.h"
22 #include "MeshMetaDataInterface.h"
23 #include "RestartableDataWriter.h"
24 
25 #include "libmesh/checkpoint_io.h"
26 #include "libmesh/enum_xdr_mode.h"
27 #include "libmesh/utility.h"
28 
29 using namespace libMesh;
30 
31 registerMooseObject("MooseApp", Checkpoint);
32 
// Body of Checkpoint::validParams(): declares the input parameters accepted by the
// Checkpoint output ("num_files", "suffix", "wall_time_interval") and changes the
// default of "execute_on" to TIMESTEP_END before returning the parameter set.
// NOTE(review): the signature (listing lines 33-34) and the statement that declares
// `params` (listing line 37) are missing from this extract; confirm against the
// original Checkpoint.C.
35 {
36  // Get the parameters from the base classes
38 
39  params.addClassDescription("Output for MOOSE recovery checkpoint files.");
40 
41  // Typical checkpoint options
42  params.addParam<unsigned int>("num_files", 2, "Number of the restart files to save");
43  params.addParam<std::string>(
44  "suffix",
45  "cp",
46  "This will be appended to the file_base to create the directory name for checkpoint files.");
47  // For checkpoints, set the wall time output interval to a default of 1 hour (3600 s)
48  params.addParam<Real>(
49  "wall_time_interval", 3600, "The target wall time interval (in seconds) at which to output");
50 
51  // Since it makes the most sense to write checkpoints at the end of time steps,
52  // change the default value of execute_on to TIMESTEP_END
53  ExecFlagEnum & exec_enum = params.set<ExecFlagEnum>("execute_on", true);
54  exec_enum = {EXEC_TIMESTEP_END};
55 
56  return params;
57 }
58 
// Checkpoint constructor (initializer list and body): forwards the parameters to
// FileOutput and caches the "num_files" and "suffix" parameters in _num_files and _suffix.
// NOTE(review): the signature (listing line 59) and the two statements that the comments
// below describe (listing lines 66 and 73) are missing from this extract. They are
// presumably calls to validateExecuteOn() and setWallTimeIntervalFromCommandLineParam();
// confirm against the original Checkpoint.C.
60  : FileOutput(parameters),
61  _num_files(getParam<unsigned int>("num_files")),
62  _suffix(getParam<std::string>("suffix"))
63 {
64  // Prevent the checkpoint from executing at any time other than INITIAL,
65  // TIMESTEP_END, and FINAL
67 
68  // The following updates the value of _wall_time_interval if the
69  // '--output-wall-time-interval' command line parameter is used.
70  // If it is not used, _wall_time_interval keeps its current value.
71  // The '--output-wall-time-interval' parameter is necessary for testing
72  // and should only be used in the test suite.
74 }
75 
// Checkpoint::filename(): returns "<directory()>/<time step>", where the time step is
// right-aligned and zero-filled to a width of _padding characters.
// NOTE(review): the signature line (listing line 77) is missing from this extract.
76 std::string
78 {
79  // Get the time step with correct zero padding
80  std::ostringstream output;
81  output << directory() << "/" << std::setw(_padding) << std::setprecision(0) << std::setfill('0')
82  << std::right << timeStep();
83 
84  return output.str();
85 }
86 
// Checkpoint::directory(): returns the checkpoint output directory name, built as
// _file_base + "_" + _suffix.
// NOTE(review): the signature line (listing line 88) is missing from this extract.
87 std::string
89 {
90  return _file_base + "_" + _suffix;
91 }
92 
// Checkpoint::shouldOutput(): decides whether a checkpoint is written now.
// Returns false as soon as FileOutput::shouldOutput() declines. Otherwise returns true if
// an interrupt signal is pending (and the time step is past 0), if the wall-time interval
// has elapsed, if onInterval() holds, or if the current execute flag is EXEC_FINAL.
// NOTE(review): the signature line (listing line 94) is missing from this extract.
93 bool
95 {
96  // should_output_parent ensures that we output only when _execute_on contains
97  // _current_execute_flag (see Output::shouldOutput), ensuring that we wait
98  // until the end of the timestep to write, preventing the output of an
99  // unconverged solution.
100  const bool should_output_parent = FileOutput::shouldOutput();
101  if (!should_output_parent)
102  return false; // No point in continuing
103 
104  // Check for signal
105  // Reading checkpoint on time step 0 is not supported
106  const bool should_output_signal = (Moose::interrupt_signal_number != 0) && (timeStep() > 0);
107  if (should_output_signal)
108  {
  // NOTE(review): the message names SIGUSR1, but the test above only checks that the
  // signal number is non-zero; confirm which signals can set it.
109  _console << "Unix signal SIGUSR1 detected. Outputting checkpoint file.\n";
110  // Reset signal number since we output
  // NOTE(review): the statement performing this reset (listing line 111) is missing from
  // this extract; without it the signal branch would fire on every later call.
112  return true;
113  }
114 
115  // Check if enough wall time has elapsed to output
116  const bool should_output_wall_time = _wall_time_since_last_output >= _wall_time_interval;
117  if (should_output_wall_time)
118  return true;
119 
120  // Check if the checkpoint should "normally" output, i.e. if it was created
121  // through the input file
122  const bool should_output = (onInterval() || _current_execute_flag == EXEC_FINAL);
123 
124  return should_output;
125 }
126 
// Checkpoint::output(): writes one checkpoint. It creates the checkpoint directory,
// writes the mesh through a libMesh CheckpointIO object, writes restartable meta data on
// processor 0, backs up the application, and finally calls updateCheckpointFiles() with
// the collected file names so that old checkpoints can be removed.
// NOTE(review): the signature line (listing line 128) is missing from this extract.
127 void
129 {
130  // Create the output directory
131  const auto cp_dir = directory();
132  Utility::mkdir(cp_dir.c_str());
133 
134  // Create the output filename
135  const auto current_file = filename();
136 
137  // Create the libMesh Checkpoint_IO object
  // NOTE(review): the meaning of the second constructor argument (false) is not visible
  // here; presumably it selects non-binary output — confirm against libMesh CheckpointIO.
138  MeshBase & mesh = _es_ptr->get_mesh();
139  CheckpointIO io(mesh, false);
140 
141  // Create checkpoint file structure
142  CheckpointFileNames curr_file_struct;
143 
144  curr_file_struct.checkpoint = current_file + _app.checkpointSuffix();
145 
146  // Write the checkpoint file
147  io.write(curr_file_struct.checkpoint);
148 
149  // Write out meta data if there is any (only on processor zero)
  // The returned paths are recorded in curr_file_struct.restart for later deletion.
150  if (processor_id() == 0)
151  {
152  const auto paths = _app.writeRestartableMetaData(curr_file_struct.checkpoint);
153  curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
154  }
155 
156  // Write out the backup
  // The backup paths are inserted at the front of curr_file_struct.restart as well.
157  const auto paths = _app.backup(_app.restartFolderBase(current_file));
158  curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
159 
160  // Remove old checkpoint files
161  updateCheckpointFiles(curr_file_struct);
162 }
163 
// Checkpoint::updateCheckpointFiles(file_struct): records file_struct as the most recent
// checkpoint in _file_names and, when more than _num_files entries are stored, deletes
// the files of the oldest entry.
// NOTE(review): the signature line (listing line 165) is missing from this extract.
164 void
166 {
167  // It is possible to have already written a checkpoint with the same file
168  // names contained in file_struct. If this is the case, file_struct will
169  // already be stored in _file_names. When this happens, the current state of
170  // the simulation is likely different than the state when the duplicately
171  // named checkpoint was last written. Because of this, we want to go ahead and
172  // rewrite the duplicately named checkpoint, overwriting the files
173  // representing the old state. For accurate bookkeeping, we will delete the
174  // existing instance of file_struct from _file_names and re-append it to the
175  // end of _file_names (to keep the order in which checkpoints are written
176  // accurate).
177 
178  const auto it = std::find(_file_names.begin(), _file_names.end(), file_struct);
179  // file_struct was found in _file_names.
180  // Delete it so it can be re-added as the last element.
181  if (it != _file_names.end())
182  _file_names.erase(it);
183 
184  _file_names.push_back(file_struct);
185 
186  // Remove the file and the corresponding directory if it's empty
  // A failed file removal only warns, while a failed directory removal is a hard error.
187  const auto remove_file = [this](const std::filesystem::path & path)
188  {
189  std::error_code err;
190 
  // NOTE(review): std::filesystem::remove also returns false (with no error set) when the
  // file did not exist, so this warning can fire without a real error — confirm intent.
191  if (!std::filesystem::remove(path, err))
192  mooseWarning("Error during the deletion of checkpoint file\n",
193  std::filesystem::absolute(path),
194  "\n\n",
195  err.message());
196 
  // NOTE(review): this is_empty overload takes no error_code and throws on failure
  // (e.g. if the directory no longer exists) — confirm that is acceptable here.
197  const auto dir = path.parent_path();
198  if (std::filesystem::is_empty(dir))
199  if (!std::filesystem::remove(dir, err))
200  mooseError("Error during the deletion of checkpoint directory\n",
201  std::filesystem::absolute(dir),
202  "\n\n",
203  err.message());
204  };
205 
206  // Remove un-wanted files
  // At most one entry (the oldest) is removed per call.
207  if (_file_names.size() > _num_files)
208  {
209  // Extract the filenames to be removed
210  CheckpointFileNames delete_files = _file_names.front();
211 
212  // Remove these filenames from the list
213  _file_names.pop_front();
214 
215  // Delete restartable data
216  for (const auto & path : delete_files.restart)
217  remove_file(path);
218 
219  // Delete checkpoint files
220  // This file may not exist so don't worry about checking for success
  // Only processor 0 cleans up; the count passed is comm().size() for a distributed
  // mesh and 1 otherwise.
221  if (processor_id() == 0)
222  CheckpointIO::cleanup(delete_files.checkpoint,
223  _problem_ptr->mesh().isDistributedMesh() ? comm().size() : 1);
224  }
225 }
226 
// Checkpoint::validateExecuteOn(): checks every flag in the "execute_on" parameter and
// raises a paramError for any flag other than EXEC_INITIAL, EXEC_TIMESTEP_END or
// EXEC_FINAL.
// NOTE(review): the signature line (listing line 228) is missing from this extract.
227 void
229 {
230  const auto & execute_on = getParam<ExecFlagEnum>("execute_on");
231  const std::set<ExecFlagType> allowed = {EXEC_INITIAL, EXEC_TIMESTEP_END, EXEC_FINAL};
232  for (const auto & value : execute_on)
233  if (!allowed.count(value))
234  paramError("execute_on",
235  "The exec flag ",
236  value,
237  " is not allowed. Allowed flags are INITIAL, TIMESTEP_END, and FINAL.");
238 }
239 
// Checkpoint::checkpointInfo(): builds and returns a stringstream with four lines of
// left-aligned "label value" text: the wall time interval, the checkpoint's
// "Outputs/<name>" path, the number of checkpoints kept, and the execute_on flags.
// NOTE(review): the signature line (listing line 241) is missing from this extract.
240 std::stringstream
242 {
  // Column width used for every label written below.
243  static const unsigned int console_field_width = 27;
244  std::stringstream checkpoint_info;
245 
  // Format the interval separately so std::defaultfloat only affects this value.
246  std::stringstream interval_info_ss;
247  interval_info_ss << "Every " << std::defaultfloat << _wall_time_interval << " s";
248  const std::string interval_info = interval_info_ss.str();
249 
250  checkpoint_info << std::left << std::setw(console_field_width)
251  << " Wall Time Interval:" << interval_info << "\n";
252 
253  const std::string user_info = "Outputs/" + name();
254 
255  checkpoint_info << std::left << std::setw(console_field_width) << " Checkpoint:" << user_info
256  << "\n";
257 
258  checkpoint_info << std::left << std::setw(console_field_width)
259  << " # Checkpoints Kept:" << std::to_string(_num_files) << "\n";
  // Join the execute_on flag names, each followed by a single space.
260  std::string exec_on_values = "";
261  for (const auto & item : _execute_on)
262  exec_on_values += item.name() + " ";
263  checkpoint_info << std::left << std::setw(console_field_width)
264  << " Execute On:" << exec_on_values << "\n";
265 
266  return checkpoint_info;
267 }
OStreamProxy err
static const std::string & checkpointSuffix()
The file suffix for the checkpoint mesh.
Definition: MooseApp.C:2766
virtual bool shouldOutput() override
Checks if the output method should be executed.
Definition: FileOutput.C:80
virtual void write(const std::string &name) override
KOKKOS_INLINE_FUNCTION const T * find(const T &target, const T *const begin, const T *const end)
Find a value in an array.
Definition: KokkosUtils.h:40
A MultiMooseEnum object to hold "execute_on" flags.
Definition: ExecFlagEnum.h:21
Checkpoint(const InputParameters &parameters)
Class constructor.
Definition: Checkpoint.C:59
ExecFlagEnum _execute_on
The common Execution types; this is used as the default execution type for everything except system i...
Definition: Output.h:203
A structure for storing the various output files associated with checkpoint output.
Definition: Checkpoint.h:24
void paramError(const std::string &param, Args... args) const
Emits an error prefixed with the file and line number of the given param (from the input file) along ...
Definition: MooseBase.h:467
virtual bool onInterval()
Returns true if the output interval is satisfied.
Definition: Output.C:276
T & set(const std::string &name, bool quiet_mode=false)
Returns a writable reference to the named parameters.
std::filesystem::path restartFolderBase(const std::filesystem::path &folder_base) const
The file suffix for restartable data.
Definition: MooseApp.C:2781
MeshBase & mesh
std::string checkpoint
Filename for CheckpointIO file (the mesh)
Definition: Checkpoint.h:27
The main MOOSE class responsible for handling user-defined parameters in almost every MOOSE system...
const Parallel::Communicator & comm() const
std::string _file_base
The base filename from the input parameters.
Definition: FileOutput.h:89
void updateCheckpointFiles(CheckpointFileNames file_struct)
Definition: Checkpoint.C:165
const std::string _suffix
Directory suffix.
Definition: Checkpoint.h:105
std::vector< std::filesystem::path > writeRestartableMetaData(const RestartableDataMapName &name, const std::filesystem::path &folder_base)
Writes the restartable meta data for name with a folder base of folder_base.
Definition: MooseApp.C:2279
const ExecFlagType EXEC_TIMESTEP_END
Definition: Moose.C:36
registerMooseObject("MooseApp", Checkpoint)
The following methods are specializations for using the libMesh::Parallel::packed_range_* routines fo...
virtual bool shouldOutput() override
Determines if the checkpoint should write out to a file.
Definition: Checkpoint.C:94
Real _wall_time_since_last_output
time in seconds since last output
Definition: Output.h:286
void validateExecuteOn() const
Determines if the requested values of execute_on are valid for checkpoints.
Definition: Checkpoint.C:228
static const unsigned int console_field_width
Width used for printing simulation information.
Definition: ConsoleUtils.h:30
unsigned int _padding
Number of digits to pad the extensions.
Definition: FileOutput.h:83
processor_id_type size() const
virtual std::string filename() override
Returns the base filename for the checkpoint files.
Definition: Checkpoint.C:77
void mooseWarning(Args &&... args) const
const std::string & name() const
Get the name of the class.
Definition: MooseBase.h:103
Real value(unsigned n, unsigned alpha, unsigned beta, Real x)
virtual void output() override
Outputs a checkpoint file.
Definition: Checkpoint.C:128
Writes out three things:
Definition: Checkpoint.h:48
std::vector< std::filesystem::path > backup(const std::filesystem::path &folder_base)
Backs up the application to the folder folder_base.
Definition: MooseApp.C:1540
ExecFlagType _current_execute_flag
Current execute on flag.
Definition: Output.h:211
FEProblemBase * _problem_ptr
Pointer to the FEProblemBase object for output object (use this)
Definition: Output.h:185
static InputParameters validParams()
Definition: FileOutput.C:24
std::vector< std::filesystem::path > restart
Filenames for restartable data.
Definition: Checkpoint.h:30
MooseApp & _app
The MOOSE application this is associated with.
Definition: MooseBase.h:385
Real _wall_time_interval
Target wall time between outputs in seconds.
Definition: Output.h:241
void setWallTimeIntervalFromCommandLineParam()
Function to set the wall time interval based on value of command line parameter (used for testing onl...
Definition: Output.C:332
DIE A HORRIBLE DEATH HERE typedef LIBMESH_DEFAULT_SCALAR_TYPE Real
libMesh::EquationSystems * _es_ptr
Reference to the libMesh::EquationSystems object that contains the data.
Definition: Output.h:194
const MeshBase & get_mesh() const
std::deque< CheckpointFileNames > _file_names
Deque of checkpoint filename structures, ordered from oldest to newest.
Definition: Checkpoint.h:108
virtual MooseMesh & mesh() override
void mooseError(Args &&... args) const
Emits an error prefixed with object name and type and optionally a file path to the top-level block p...
Definition: MooseBase.h:281
int interrupt_signal_number
Used by the signal handler to determine if we should write a checkpoint file out at any point during ...
Definition: Moose.C:847
void addClassDescription(const std::string &doc_string)
This method adds a description of the class that will be displayed in the input file syntax dump...
void addParam(const std::string &name, const S &value, const std::string &doc_string)
These methods add an optional parameter and a documentation string to the InputParameters object...
const ConsoleStream _console
An instance of helper class to write streams to the Console objects.
An outputter with filename support.
Definition: FileOutput.h:20
virtual int timeStep()
Get the current time step.
Definition: Output.C:383
std::stringstream checkpointInfo() const
Gathers and records information used later for console output.
Definition: Checkpoint.C:241
processor_id_type processor_id() const
virtual bool isDistributedMesh() const
Returns the final Mesh distribution type.
Definition: MooseMesh.h:1140
std::string directory() const
Retrieve the checkpoint output directory.
Definition: Checkpoint.C:88
const ExecFlagType EXEC_FINAL
Definition: Moose.C:46
static InputParameters validParams()
Definition: Checkpoint.C:34
void ErrorVector unsigned int
unsigned int _num_files
Max no. of output files to store.
Definition: Checkpoint.h:102
const ExecFlagType EXEC_INITIAL
Definition: Moose.C:30