www.mooseframework.org
Checkpoint.C
Go to the documentation of this file.
1 //* This file is part of the MOOSE framework
2 //* https://www.mooseframework.org
3 //*
4 //* All rights reserved, see COPYRIGHT for full restrictions
5 //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 //*
7 //* Licensed under LGPL 2.1, please see LICENSE for details
8 //* https://www.gnu.org/licenses/lgpl-2.1.html
9 
10 // C POSIX includes
11 #include <sys/stat.h>
12 
13 #include <system_error>
14 
15 // Moose includes
16 #include "Checkpoint.h"
17 #include "FEProblem.h"
18 #include "MooseApp.h"
20 #include "MooseMesh.h"
21 #include "MeshMetaDataInterface.h"
22 #include "RestartableDataWriter.h"
23 
24 #include "libmesh/checkpoint_io.h"
25 #include "libmesh/enum_xdr_mode.h"
26 #include "libmesh/utility.h"
27 
28 registerMooseObject("MooseApp", Checkpoint);
29 
32 {
33  // Get the parameters from the base classes
35 
36  // Controls whether the checkpoint will actually run. Should only ever be changed by the
37  // auto-checkpoint created by AutoCheckpointAction, which does not write unless a signal
38  // is received.
39  params.addPrivateParam<CheckpointType>("checkpoint_type", CheckpointType::NONE);
40 
41  params.addClassDescription("Output for MOOSE recovery checkpoint files.");
42 
43  // Typical checkpoint options
44  params.addParam<unsigned int>("num_files", 2, "Number of the restart files to save");
45  params.addParam<std::string>(
46  "suffix",
47  "cp",
48  "This will be appended to the file_base to create the directory name for checkpoint files.");
49  // For checkpoints, set the wall time output interval to a default of 10 minutes (600 s)
50  params.addParam<Real>(
51  "wall_time_interval", 600, "The target wall time interval (in seconds) at which to output");
52 
53  // Since it makes the most sense to write checkpoints at the end of time steps,
54  // change the default value of execute_on to TIMESTEP_END
55  ExecFlagEnum & exec_enum = params.set<ExecFlagEnum>("execute_on", true);
56  exec_enum = {EXEC_TIMESTEP_END};
57 
58  return params;
59 }
60 
62  : FileOutput(parameters),
63  _checkpoint_type(getParam<CheckpointType>("checkpoint_type")),
64  _num_files(getParam<unsigned int>("num_files")),
65  _suffix(getParam<std::string>("suffix"))
66 {
67  // Prevent the checkpoint from executing at any time other than INITIAL,
68  // TIMESTEP_END, and FINAL
69  const auto & execute_on = getParam<ExecFlagEnum>("execute_on");
70 
71  // Create a vector containing all valid values of execute_on
72  std::vector<ExecFlagEnum> valid_execute_on_values(7);
73  {
74  ExecFlagEnum valid_execute_on_value = execute_on;
75  valid_execute_on_value = {EXEC_INITIAL};
76  valid_execute_on_values[0] = valid_execute_on_value;
77  valid_execute_on_value = {EXEC_TIMESTEP_END};
78  valid_execute_on_values[1] = valid_execute_on_value;
79  valid_execute_on_value = {EXEC_FINAL};
80  valid_execute_on_values[2] = valid_execute_on_value;
81  valid_execute_on_value = {EXEC_INITIAL, EXEC_TIMESTEP_END};
82  valid_execute_on_values[3] = valid_execute_on_value;
83  valid_execute_on_value = {EXEC_TIMESTEP_END, EXEC_FINAL};
84  valid_execute_on_values[4] = valid_execute_on_value;
85  valid_execute_on_value = {EXEC_INITIAL, EXEC_FINAL};
86  valid_execute_on_values[5] = valid_execute_on_value;
87  valid_execute_on_value = {EXEC_INITIAL, EXEC_TIMESTEP_END, EXEC_FINAL};
88  valid_execute_on_values[6] = valid_execute_on_value;
89  }
90 
91  // Check if the value of execute_on is valid
92  auto it = std::find(valid_execute_on_values.begin(), valid_execute_on_values.end(), execute_on);
93  const bool is_valid_value = (it != valid_execute_on_values.end());
94  if (!is_valid_value)
95  paramError("execute_on",
96  "The checkpoint system may only be used with execute_on values ",
97  "INITIAL, TIMESTEP_END, and FINAL, not '",
98  execute_on,
99  "'.");
100 
101  // The following updates the value of _wall_time_interval if the
102  // '--output-wall-time-interval' command line parameter is used.
103  // If it is not used, _wall_time_interval keeps its current value.
104  // The '--output-wall-time-interval' parameter is necessary for testing
105  // and should only be used in the test suite.
107 }
108 
109 std::string
111 {
112  // Get the time step with correct zero padding
113  std::ostringstream output;
114  output << directory() << "/" << std::setw(_padding) << std::setprecision(0) << std::setfill('0')
115  << std::right << timeStep();
116 
117  return output.str();
118 }
119 
120 std::string
122 {
123  return _file_base + "_" + _suffix;
124 }
125 
126 bool
128 {
129  // should_output_parent ensures that we output only when _execute_on contains
130  // _current_execute_flag (see Output::shouldOutput), ensuring that we wait
131  // until the end of the timestep to write, preventing the output of an
132  // unconverged solution.
133  const bool should_output_parent = FileOutput::shouldOutput();
134  if (!should_output_parent)
135  return false; // No point in continuing
136 
137  // Check for signal
138  // Reading checkpoint on time step 0 is not supported
139  const bool should_output_signal = (Moose::interrupt_signal_number != 0) && (timeStep() > 0);
140  if (should_output_signal)
141  {
142  _console << "Unix signal SIGUSR1 detected. Outputting checkpoint file.\n";
143  // Reset signal number since we output
145  return true;
146  }
147 
148  // Check if enough wall time has elapsed to output
149  const bool should_output_wall_time = _wall_time_since_last_output >= _wall_time_interval;
150  if (should_output_wall_time)
151  return true;
152 
153  // At this point, we have checked all automatic checkpoint options. If none
154  // of those triggered, then the only way a checkpoint will still be written
155  // is if the user defined it. If the checkpoint is purely system-created,
156  // go ahead and return false (circumvents default time_step_interval = 1 for
157  // auto checkpoints).
159  return false;
160 
161  // Check if the checkpoint should "normally" output, i.e. if it was created
162  // through the input file
163  const bool should_output = (onInterval() || _current_execute_flag == EXEC_FINAL);
164 
165  return should_output;
166 }
167 
168 void
170 {
171  // Create the output directory
172  const auto cp_dir = directory();
173  Utility::mkdir(cp_dir.c_str());
174 
175  // Create the output filename
176  const auto current_file = filename();
177 
178  // Create the libMesh Checkpoint_IO object
179  MeshBase & mesh = _es_ptr->get_mesh();
180  CheckpointIO io(mesh, true);
181 
182  // Create checkpoint file structure
183  CheckpointFileNames curr_file_struct;
184 
185  curr_file_struct.checkpoint = current_file + _app.checkpointSuffix();
186 
187  // Write the checkpoint file
188  io.write(curr_file_struct.checkpoint);
189 
190  // Write out meta data if there is any (only on processor zero)
191  if (processor_id() == 0)
192  {
193  const auto paths = _app.writeRestartableMetaData(curr_file_struct.checkpoint);
194  curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
195  }
196 
197  // Write out the backup
198  const auto paths = _app.backup(_app.restartFolderBase(current_file));
199  curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
200 
201  // Remove old checkpoint files
202  updateCheckpointFiles(curr_file_struct);
203 }
204 
205 void
207 {
208  // Update the list of stored files
209  _file_names.push_back(file_struct);
210 
211  // Remove the file and the corresponding directory if it's empty
212  const auto remove_file = [this](const std::filesystem::path & path)
213  {
214  std::error_code err;
215 
216  if (!std::filesystem::remove(path, err))
217  mooseWarning("Error during the deletion of checkpoint file\n",
218  std::filesystem::absolute(path),
219  "\n\n",
220  err.message());
221 
222  const auto dir = path.parent_path();
223  if (std::filesystem::is_empty(dir))
224  if (!std::filesystem::remove(dir, err))
225  mooseError("Error during the deletion of checkpoint directory\n",
226  std::filesystem::absolute(dir),
227  "\n\n",
228  err.message());
229  };
230 
231  // Remove un-wanted files
232  if (_file_names.size() > _num_files)
233  {
234  // Extract the filenames to be removed
235  CheckpointFileNames delete_files = _file_names.front();
236 
237  // Remove these filenames from the list
238  _file_names.pop_front();
239 
240  // Delete restartable data
241  for (const auto & path : delete_files.restart)
242  remove_file(path);
243 
244  // Delete checkpoint files
245  // This file may not exist so don't worry about checking for success
246  if (processor_id() == 0)
247  CheckpointIO::cleanup(delete_files.checkpoint,
248  _problem_ptr->mesh().isDistributedMesh() ? comm().size() : 1);
249  }
250 }
OStreamProxy err
static const std::string & checkpointSuffix()
The file suffix for the checkpoint mesh.
Definition: MooseApp.C:2396
virtual bool shouldOutput() override
Checks if the output method should be executed.
Definition: FileOutput.C:79
A MultiMooseEnum object to hold "execute_on" flags.
Definition: ExecFlagEnum.h:21
Checkpoint(const InputParameters &parameters)
Class constructor.
Definition: Checkpoint.C:61
A structure for storing the various output files associated with checkpoint output.
Definition: Checkpoint.h:40
CheckpointType
Enumerated type for determining what type of checkpoint this is.
Definition: Checkpoint.h:28
virtual bool onInterval()
Returns true if the output interval is satisfied.
Definition: Output.C:286
void addPrivateParam(const std::string &name, const T &value)
These method add a parameter to the InputParameters object which can be retrieved like any other para...
bool isDistributedMesh() const
Returns the final Mesh distribution type.
Definition: MooseMesh.h:984
T & set(const std::string &name, bool quiet_mode=false)
Returns a writable reference to the named parameters.
std::filesystem::path restartFolderBase(const std::filesystem::path &folder_base) const
The file suffix for restartable data.
Definition: MooseApp.C:2411
MeshBase & mesh
std::string checkpoint
Filename for CheckpointIO file (the mesh)
Definition: Checkpoint.h:43
The main MOOSE class responsible for handling user-defined parameters in almost every MOOSE system...
const Parallel::Communicator & comm() const
std::string _file_base
The base filename from the input parameters.
Definition: FileOutput.h:89
void updateCheckpointFiles(CheckpointFileNames file_struct)
Definition: Checkpoint.C:206
const std::string _suffix
Directory suffix.
Definition: Checkpoint.h:106
std::vector< std::filesystem::path > writeRestartableMetaData(const RestartableDataMapName &name, const std::filesystem::path &folder_base)
Writes the restartable meta data for name with a folder base of folder_base.
Definition: MooseApp.C:1909
const ExecFlagType EXEC_TIMESTEP_END
Definition: Moose.C:32
registerMooseObject("MooseApp", Checkpoint)
virtual bool shouldOutput() override
Determines if the checkpoint should write out to a file.
Definition: Checkpoint.C:127
Real _wall_time_since_last_output
time in seconds since last output
Definition: Output.h:283
void mooseWarning(Args &&... args) const
Emits a warning prefixed with object name and type.
unsigned int _padding
Number of digits to pad the extensions.
Definition: FileOutput.h:83
processor_id_type size() const
virtual std::string filename() override
Returns the base filename for the checkpoint files.
Definition: Checkpoint.C:110
CheckpointType _checkpoint_type
Determines if this checkpoint is an autosave, and what kind of autosave it is.
Definition: Checkpoint.h:100
virtual void output() override
Outputs a checkpoint file.
Definition: Checkpoint.C:169
EquationSystems * _es_ptr
Reference to the libMesh::EquationSystems object that contains the data.
Definition: Output.h:188
Writes out three things:
Definition: Checkpoint.h:58
std::vector< std::filesystem::path > backup(const std::filesystem::path &folder_base)
Backs up the application to the folder folder_base.
Definition: MooseApp.C:1214
ExecFlagType _current_execute_flag
Current execute on flag.
Definition: Output.h:205
FEProblemBase * _problem_ptr
Pointer to the FEProblemBase object for the output object (use this)
Definition: Output.h:179
static InputParameters validParams()
Definition: FileOutput.C:24
std::vector< std::filesystem::path > restart
Filenames for restartable data.
Definition: Checkpoint.h:46
MooseApp & _app
The MOOSE application this is associated with.
Definition: MooseBase.h:69
void paramError(const std::string &param, Args... args) const
Emits an error prefixed with the file and line number of the given param (from the input file) along ...
Real _wall_time_interval
Target wall time between outputs in seconds.
Definition: Output.h:238
void setWallTimeIntervalFromCommandLineParam()
Function to set the wall time interval based on value of command line parameter (used for testing onl...
Definition: Output.C:336
DIE A HORRIBLE DEATH HERE typedef LIBMESH_DEFAULT_SCALAR_TYPE Real
std::deque< CheckpointFileNames > _file_names
Deque of checkpoint filename structures.
Definition: Checkpoint.h:109
virtual MooseMesh & mesh() override
int interrupt_signal_number
Used by the signal handler to determine if we should write a checkpoint file out at any point during ...
Definition: Moose.C:641
void mooseError(Args &&... args) const
Emits an error prefixed with object name and type.
void addClassDescription(const std::string &doc_string)
This method adds a description of the class that will be displayed in the input file syntax dump...
void addParam(const std::string &name, const S &value, const std::string &doc_string)
These methods add an option parameter and a documentation string to the InputParameters object...
const ConsoleStream _console
An instance of helper class to write streams to the Console objects.
An outputter with filename support.
Definition: FileOutput.h:20
virtual int timeStep()
Get the current time step.
Definition: Output.C:387
processor_id_type processor_id() const
std::string directory() const
Retrieve the checkpoint output directory.
Definition: Checkpoint.C:121
const ExecFlagType EXEC_FINAL
Definition: Moose.C:38
static InputParameters validParams()
Definition: Checkpoint.C:31
void ErrorVector unsigned int
unsigned int _num_files
Max no. of output files to store.
Definition: Checkpoint.h:103
const ExecFlagType EXEC_INITIAL
Definition: Moose.C:28