https://mooseframework.inl.gov
Checkpoint.C
Go to the documentation of this file.
1 //* This file is part of the MOOSE framework
2 //* https://mooseframework.inl.gov
3 //*
4 //* All rights reserved, see COPYRIGHT for full restrictions
5 //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 //*
7 //* Licensed under LGPL 2.1, please see LICENSE for details
8 //* https://www.gnu.org/licenses/lgpl-2.1.html
9 
10 // C POSIX includes
11 #include <sstream>
12 #include <sys/stat.h>
13 
14 #include <system_error>
15 
16 // Moose includes
17 #include "Checkpoint.h"
18 #include "FEProblem.h"
19 #include "MooseApp.h"
21 #include "MooseMesh.h"
22 #include "MeshMetaDataInterface.h"
23 #include "RestartableDataWriter.h"
24 
25 #include "libmesh/checkpoint_io.h"
26 #include "libmesh/enum_xdr_mode.h"
27 #include "libmesh/utility.h"
28 
29 using namespace libMesh;
30 
31 registerMooseObject("MooseApp", Checkpoint);
32 
35 {
36  // Get the parameters from the base classes
38 
39  // Controls whether the checkpoint will actually run. Should only ever be changed by the
40  // auto-checkpoint created by AutoCheckpointAction, which does not write unless a signal
41  // is received.
43 
44  params.addClassDescription("Output for MOOSE recovery checkpoint files.");
45 
46  // Typical checkpoint options
47  params.addParam<unsigned int>("num_files", 2, "Number of the restart files to save");
48  params.addParam<std::string>(
49  "suffix",
50  "cp",
51  "This will be appended to the file_base to create the directory name for checkpoint files.");
52  // For checkpoints, set the wall time output interval to a default of 1 hour (3600 s)
53  params.addParam<Real>(
54  "wall_time_interval", 3600, "The target wall time interval (in seconds) at which to output");
55 
56  // Parameter to turn off wall time checkpoints
57  params.addParam<bool>(
58  "wall_time_checkpoint", true, "Whether to enable checkpoints based on elapsed wall time");
59 
60  // Since it makes the most sense to write checkpoints at the end of time steps,
61  // change the default value of execute_on to TIMESTEP_END
62  ExecFlagEnum & exec_enum = params.set<ExecFlagEnum>("execute_on", true);
63  exec_enum = {EXEC_TIMESTEP_END};
64 
65  return params;
66 }
67 
69  : FileOutput(parameters),
70  _checkpoint_type(getParam<CheckpointType>("checkpoint_type")),
71  _num_files(getParam<unsigned int>("num_files")),
72  _suffix(getParam<std::string>("suffix"))
73 {
74  // Prevent the checkpoint from executing at any time other than INITIAL,
75  // TIMESTEP_END, and FINAL
77 
78  // The following updates the value of _wall_time_interval if the
79  // '--output-wall-time-interval' command line parameter is used.
80  // If it is not used, _wall_time_interval keeps its current value.
81  // The '--output-wall-time-interval' parameter is necessary for testing
82  // and should only be used in the test suite.
84 
85  // We want to do this here so it overrides --output-wall-time-interval
86  if (!getParam<bool>("wall_time_checkpoint"))
88 }
89 
90 std::string
92 {
93  // Get the time step with correct zero padding
94  std::ostringstream output;
95  output << directory() << "/" << std::setw(_padding) << std::setprecision(0) << std::setfill('0')
96  << std::right << timeStep();
97 
98  return output.str();
99 }
100 
101 std::string
103 {
104  return _file_base + "_" + _suffix;
105 }
106 
107 bool
109 {
110  // should_output_parent ensures that we output only when _execute_on contains
111  // _current_execute_flag (see Output::shouldOutput), ensuring that we wait
112  // until the end of the timestep to write, preventing the output of an
113  // unconverged solution.
114  const bool should_output_parent = FileOutput::shouldOutput();
115  if (!should_output_parent)
116  return false; // No point in continuing
117 
118  // Check for signal
119  // Reading checkpoint on time step 0 is not supported
120  const bool should_output_signal = (Moose::interrupt_signal_number != 0) && (timeStep() > 0);
121  if (should_output_signal)
122  {
123  _console << "Unix signal SIGUSR1 detected. Outputting checkpoint file.\n";
124  // Reset signal number since we output
126  return true;
127  }
128 
129  // Check if enough wall time has elapsed to output
130  const bool should_output_wall_time = _wall_time_since_last_output >= _wall_time_interval;
131  if (should_output_wall_time)
132  return true;
133 
134  // At this point, we have checked all automatic checkpoint options. If none
135  // of those triggered, then the only way a checkpoint will still be written
136  // is if the user defined it. If the checkpoint is purely system-created,
137  // go ahead and return false (circumvents default time_step_interval = 1 for
138  // auto checkpoints).
140  return false;
141 
142  // Check if the checkpoint should "normally" output, i.e. if it was created
143  // through the input file
144  const bool should_output = (onInterval() || _current_execute_flag == EXEC_FINAL);
145 
146  return should_output;
147 }
148 
149 void
151 {
152  // Create the output directory
153  const auto cp_dir = directory();
154  Utility::mkdir(cp_dir.c_str());
155 
156  // Create the output filename
157  const auto current_file = filename();
158 
159  // Create the libMesh Checkpoint_IO object
160  MeshBase & mesh = _es_ptr->get_mesh();
161  CheckpointIO io(mesh, false);
162 
163  // Create checkpoint file structure
164  CheckpointFileNames curr_file_struct;
165 
166  curr_file_struct.checkpoint = current_file + _app.checkpointSuffix();
167 
168  // Write the checkpoint file
169  io.write(curr_file_struct.checkpoint);
170 
171  // Write out meta data if there is any (only on processor zero)
172  if (processor_id() == 0)
173  {
174  const auto paths = _app.writeRestartableMetaData(curr_file_struct.checkpoint);
175  curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
176  }
177 
178  // Write out the backup
179  const auto paths = _app.backup(_app.restartFolderBase(current_file));
180  curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
181 
182  // Remove old checkpoint files
183  updateCheckpointFiles(curr_file_struct);
184 }
185 
186 void
188 {
189  // It is possible to have already written a checkpoint with the same file
190  // names contained in file_struct. If this is the case, file_struct will
191  // already be stored in _file_names. When this happens, the current state of
192  // the simulation is likely different than the state when the duplicately
193  // named checkpoint was last written. Because of this, we want to go ahead and
194  // rewrite the duplicately named checkpoint, overwritting the files
195  // representing the old state. For accurate bookkeeping, we will delete the
196  // existing instance of file_struct from _file_names and re-append it to the
197  // end of _file_names (to keep the order in which checkpoints are written
198  // accurate).
199 
200  const auto it = std::find(_file_names.begin(), _file_names.end(), file_struct);
201  // file_struct was found in _file_names.
202  // Delete it so it can be re-added as the last element.
203  if (it != _file_names.end())
204  _file_names.erase(it);
205 
206  _file_names.push_back(file_struct);
207 
208  // Remove the file and the corresponding directory if it's empty
209  const auto remove_file = [this](const std::filesystem::path & path)
210  {
211  std::error_code err;
212 
213  if (!std::filesystem::remove(path, err))
214  mooseWarning("Error during the deletion of checkpoint file\n",
215  std::filesystem::absolute(path),
216  "\n\n",
217  err.message());
218 
219  const auto dir = path.parent_path();
220  if (std::filesystem::is_empty(dir))
221  if (!std::filesystem::remove(dir, err))
222  mooseError("Error during the deletion of checkpoint directory\n",
223  std::filesystem::absolute(dir),
224  "\n\n",
225  err.message());
226  };
227 
228  // Remove un-wanted files
229  if (_file_names.size() > _num_files)
230  {
231  // Extract the filenames to be removed
232  CheckpointFileNames delete_files = _file_names.front();
233 
234  // Remove these filenames from the list
235  _file_names.pop_front();
236 
237  // Delete restartable data
238  for (const auto & path : delete_files.restart)
239  remove_file(path);
240 
241  // Delete checkpoint files
242  // This file may not exist so don't worry about checking for success
243  if (processor_id() == 0)
244  CheckpointIO::cleanup(delete_files.checkpoint,
245  _problem_ptr->mesh().isDistributedMesh() ? comm().size() : 1);
246  }
247 }
248 
249 void
251 {
252  const auto & execute_on = getParam<ExecFlagEnum>("execute_on");
253  const std::set<ExecFlagType> allowed = {EXEC_INITIAL, EXEC_TIMESTEP_END, EXEC_FINAL};
254  for (const auto & value : execute_on)
255  if (!allowed.count(value))
256  paramError("execute_on",
257  "The exec flag ",
258  value,
259  " is not allowed. Allowed flags are INITIAL, TIMESTEP_END, and FINAL.");
260 }
261 
262 std::stringstream
264 {
265  static const unsigned int console_field_width = 27;
266  std::stringstream checkpoint_info;
267 
268  std::string interval_info;
269  if (getParam<bool>("wall_time_checkpoint"))
270  {
271  std::stringstream interval_info_ss;
272  interval_info_ss << "Every " << std::defaultfloat << _wall_time_interval << " s";
273  interval_info = interval_info_ss.str();
274  }
275  else
276  interval_info = "Disabled";
277 
278  checkpoint_info << std::left << std::setw(console_field_width)
279  << " Wall Time Interval:" << interval_info << "\n";
280 
281  std::string user_info;
283  user_info = "Disabled";
284  else
285  user_info = "Outputs/" + name();
286 
287  checkpoint_info << std::left << std::setw(console_field_width)
288  << " User Checkpoint:" << user_info << "\n";
289 
290  if (!((interval_info == "Disabled") && (user_info == "Disabled")))
291  {
292  checkpoint_info << std::left << std::setw(console_field_width)
293  << " # Checkpoints Kept:" << std::to_string(_num_files) << "\n";
294  std::string exec_on_values = "";
295  for (const auto & item : _execute_on)
296  exec_on_values += item.name() + " ";
297  checkpoint_info << std::left << std::setw(console_field_width)
298  << " Execute On:" << exec_on_values << "\n";
299  }
300 
301  return checkpoint_info;
302 }
OStreamProxy err
static const std::string & checkpointSuffix()
The file suffix for the checkpoint mesh.
Definition: MooseApp.C:3044
virtual bool shouldOutput() override
Checks if the output method should be executed.
Definition: FileOutput.C:80
virtual void write(const std::string &name) override
A MultiMooseEnum object to hold "execute_on" flags.
Definition: ExecFlagEnum.h:21
Checkpoint(const InputParameters &parameters)
Class constructor.
Definition: Checkpoint.C:68
ExecFlagEnum _execute_on
The common Execution types; this is used as the default execution type for everything except system i...
Definition: Output.h:203
A structure for storing the various output files associated with checkpoint output.
Definition: Checkpoint.h:39
CheckpointType
Enumerated type for determining what type of checkpoint this is.
Definition: Checkpoint.h:28
virtual bool onInterval()
Returns true if the output interval is satisfied.
Definition: Output.C:285
void addPrivateParam(const std::string &name, const T &value)
These method add a parameter to the InputParameters object which can be retrieved like any other para...
T & set(const std::string &name, bool quiet_mode=false)
Returns a writable reference to the named parameters.
std::filesystem::path restartFolderBase(const std::filesystem::path &folder_base) const
The file suffix for restartable data.
Definition: MooseApp.C:3059
MeshBase & mesh
std::string checkpoint
Filename for CheckpointIO file (the mesh)
Definition: Checkpoint.h:42
The main MOOSE class responsible for handling user-defined parameters in almost every MOOSE system...
const Parallel::Communicator & comm() const
std::string _file_base
The base filename from the input parameters.
Definition: FileOutput.h:89
void updateCheckpointFiles(CheckpointFileNames file_struct)
Definition: Checkpoint.C:187
const std::string _suffix
Directory suffix.
Definition: Checkpoint.h:126
std::vector< std::filesystem::path > writeRestartableMetaData(const RestartableDataMapName &name, const std::filesystem::path &folder_base)
Writes the restartable meta data for name with a folder base of folder_base.
Definition: MooseApp.C:2557
const ExecFlagType EXEC_TIMESTEP_END
Definition: Moose.C:34
registerMooseObject("MooseApp", Checkpoint)
The following methods are specializations for using the libMesh::Parallel::packed_range_* routines fo...
virtual bool shouldOutput() override
Determines if the checkpoint should write out to a file.
Definition: Checkpoint.C:108
Real _wall_time_since_last_output
time in seconds since last output
Definition: Output.h:289
virtual const std::string & name() const
Get the name of the class.
Definition: MooseBase.h:57
void mooseWarning(Args &&... args) const
Emits a warning prefixed with object name and type.
void validateExecuteOn() const
Determines if the requested values of execute_on are valid for checkpoints.
Definition: Checkpoint.C:250
static const unsigned int console_field_width
Width used for printing simulation information.
Definition: ConsoleUtils.h:30
auto max(const L &left, const R &right)
unsigned int _padding
Number of digits to pad the extensions.
Definition: FileOutput.h:83
processor_id_type size() const
virtual std::string filename() override
Returns the base filename for the checkpoint files.
Definition: Checkpoint.C:91
CheckpointType _checkpoint_type
Determines if this checkpoint is an autosave, and what kind of autosave it is.
Definition: Checkpoint.h:120
Real value(unsigned n, unsigned alpha, unsigned beta, Real x)
virtual void output() override
Outputs a checkpoint file.
Definition: Checkpoint.C:150
Writes out three things:
Definition: Checkpoint.h:63
std::vector< std::filesystem::path > backup(const std::filesystem::path &folder_base)
Backs up the application to the folder folder_base.
Definition: MooseApp.C:1849
ExecFlagType _current_execute_flag
Current execute on flag.
Definition: Output.h:211
FEProblemBase * _problem_ptr
Pointer to the FEProblemBase object for the output object (use this)
Definition: Output.h:185
static InputParameters validParams()
Definition: FileOutput.C:24
std::vector< std::filesystem::path > restart
Filenames for restartable data.
Definition: Checkpoint.h:45
MooseApp & _app
The MOOSE application this is associated with.
Definition: MooseBase.h:84
void paramError(const std::string &param, Args... args) const
Emits an error prefixed with the file and line number of the given param (from the input file) along ...
Real _wall_time_interval
Target wall time between outputs in seconds.
Definition: Output.h:244
void setWallTimeIntervalFromCommandLineParam()
Function to set the wall time interval based on value of command line parameter (used for testing onl...
Definition: Output.C:338
DIE A HORRIBLE DEATH HERE typedef LIBMESH_DEFAULT_SCALAR_TYPE Real
libMesh::EquationSystems * _es_ptr
Reference to the libMesh::EquationSystems object that contains the data.
Definition: Output.h:194
const MeshBase & get_mesh() const
std::deque< CheckpointFileNames > _file_names
Deque of checkpoint filename structures.
Definition: Checkpoint.h:129
virtual MooseMesh & mesh() override
int interrupt_signal_number
Used by the signal handler to determine if we should write a checkpoint file out at any point during ...
Definition: Moose.C:764
void mooseError(Args &&... args) const
Emits an error prefixed with object name and type.
void addClassDescription(const std::string &doc_string)
This method adds a description of the class that will be displayed in the input file syntax dump...
void addParam(const std::string &name, const S &value, const std::string &doc_string)
These methods add an optional parameter and a documentation string to the InputParameters object...
const ConsoleStream _console
An instance of helper class to write streams to the Console objects.
An outputter with filename support.
Definition: FileOutput.h:20
virtual int timeStep()
Get the current time step.
Definition: Output.C:389
std::stringstream checkpointInfo() const
Gathers and records information used later for console output.
Definition: Checkpoint.C:263
processor_id_type processor_id() const
virtual bool isDistributedMesh() const
Returns the final Mesh distribution type.
Definition: MooseMesh.h:1001
std::string directory() const
Retrieve the checkpoint output directory.
Definition: Checkpoint.C:102
const ExecFlagType EXEC_FINAL
Definition: Moose.C:44
static InputParameters validParams()
Definition: Checkpoint.C:34
void ErrorVector unsigned int
unsigned int _num_files
Max no. of output files to store.
Definition: Checkpoint.h:123
const ExecFlagType EXEC_INITIAL
Definition: Moose.C:28