Line data Source code
1 : //* This file is part of the MOOSE framework
2 : //* https://mooseframework.inl.gov
3 : //*
4 : //* All rights reserved, see COPYRIGHT for full restrictions
5 : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 : //*
7 : //* Licensed under LGPL 2.1, please see LICENSE for details
8 : //* https://www.gnu.org/licenses/lgpl-2.1.html
9 :
10 : // C POSIX includes
11 : #include <sstream>
12 : #include <sys/stat.h>
13 :
14 : #include <system_error>
15 :
16 : // Moose includes
17 : #include "Checkpoint.h"
18 : #include "FEProblem.h"
19 : #include "MooseApp.h"
20 : #include "MaterialPropertyStorage.h"
21 : #include "MooseMesh.h"
22 : #include "MeshMetaDataInterface.h"
23 : #include "RestartableDataWriter.h"
24 :
25 : #include "libmesh/checkpoint_io.h"
26 : #include "libmesh/enum_xdr_mode.h"
27 : #include "libmesh/utility.h"
28 :
29 : using namespace libMesh;
30 :
31 : registerMooseObject("MooseApp", Checkpoint);
32 :
33 : InputParameters
34 116432 : Checkpoint::validParams()
35 : {
36 : // Get the parameters from the base classes
37 116432 : InputParameters params = FileOutput::validParams();
38 :
39 : // Controls whether the checkpoint will actually run. Should only ever be changed by the
40 : // auto-checkpoint created by AutoCheckpointAction, which does not write unless a signal
41 : // is received.
42 116432 : params.addPrivateParam<CheckpointType>("checkpoint_type", CheckpointType::USER_CREATED);
43 :
44 116432 : params.addClassDescription("Output for MOOSE recovery checkpoint files.");
45 :
46 : // Typical checkpoint options
47 116432 : params.addParam<unsigned int>("num_files", 2, "Number of the restart files to save");
48 116432 : params.addParam<std::string>(
49 : "suffix",
50 : "cp",
51 : "This will be appended to the file_base to create the directory name for checkpoint files.");
52 : // For checkpoints, set the wall time output interval to defualt of 1 hour (3600 s)
53 349296 : params.addParam<Real>(
54 232864 : "wall_time_interval", 3600, "The target wall time interval (in seconds) at which to output");
55 :
56 : // Parameter to turn off wall time checkpoints
57 349296 : params.addParam<bool>(
58 232864 : "wall_time_checkpoint", true, "Whether to enable checkpoints based on elapsed wall time");
59 :
60 : // Since it makes the most sense to write checkpoints at the end of time steps,
61 : // change the default value of execute_on to TIMESTEP_END
62 116432 : ExecFlagEnum & exec_enum = params.set<ExecFlagEnum>("execute_on", true);
63 232864 : exec_enum = {EXEC_TIMESTEP_END};
64 :
65 116432 : return params;
66 116432 : }
67 :
68 45430 : Checkpoint::Checkpoint(const InputParameters & parameters)
69 : : FileOutput(parameters),
70 45430 : _checkpoint_type(getParam<CheckpointType>("checkpoint_type")),
71 45430 : _num_files(getParam<unsigned int>("num_files")),
72 90860 : _suffix(getParam<std::string>("suffix"))
73 : {
74 : // Prevent the checkpoint from executing at any time other than INITIAL,
75 : // TIMESTEP_END, and FINAL
76 45430 : validateExecuteOn();
77 :
78 : // The following updates the value of _wall_time_interval if the
79 : // '--output-wall-time-interval' command line parameter is used.
80 : // If it is not used, _wall_time_interval keeps its current value.
81 : // 'The --output-wall-time-interval parameter is necessary for testing
82 : // and should only be used in the test suite.
83 45426 : Output::setWallTimeIntervalFromCommandLineParam();
84 :
85 : // We want to do this here so it overrides --output-wall-time-interval
86 45426 : if (!getParam<bool>("wall_time_checkpoint"))
87 20 : _wall_time_interval = std::numeric_limits<Real>::max();
88 45426 : }
89 :
90 : std::string
91 55120 : Checkpoint::filename()
92 : {
93 : // Get the time step with correct zero padding
94 55120 : std::ostringstream output;
95 110240 : output << directory() << "/" << std::setw(_padding) << std::setprecision(0) << std::setfill('0')
96 55120 : << std::right << timeStep();
97 :
98 110240 : return output.str();
99 55120 : }
100 :
101 : std::string
102 65727 : Checkpoint::directory() const
103 : {
104 131454 : return _file_base + "_" + _suffix;
105 : }
106 :
107 : bool
108 3222516 : Checkpoint::shouldOutput()
109 : {
110 : // should_output_parent ensures that we output only when _execute_on contains
111 : // _current_execute_flag (see Output::shouldOutput), ensuring that we wait
112 : // until the end of the timestep to write, preventing the output of an
113 : // unconverged solution.
114 3222516 : const bool should_output_parent = FileOutput::shouldOutput();
115 3222516 : if (!should_output_parent)
116 3047609 : return false; // No point in continuing
117 :
118 : // Check for signal
119 : // Reading checkpoint on time step 0 is not supported
120 174907 : const bool should_output_signal = (Moose::interrupt_signal_number != 0) && (timeStep() > 0);
121 174907 : if (should_output_signal)
122 : {
123 28 : _console << "Unix signal SIGUSR1 detected. Outputting checkpoint file.\n";
124 : // Reset signal number since we output
125 28 : Moose::interrupt_signal_number = 0;
126 28 : return true;
127 : }
128 :
129 : // Check if enough wall time has elapsed to output
130 174879 : const bool should_output_wall_time = _wall_time_since_last_output >= _wall_time_interval;
131 174879 : if (should_output_wall_time)
132 374 : return true;
133 :
134 : // At this point, we have checked all automatic checkpoint options. If none
135 : // of those triggered, then the only way a checkpoint will still be written
136 : // is if the user defined it. If the checkpoint is purely system-created,
137 : // go ahead and return false (circumvents default time_step_interval = 1 for
138 : // auto checkpoints).
139 174505 : if (_checkpoint_type == CheckpointType::SYSTEM_CREATED)
140 164300 : return false;
141 :
142 : // Check if the checkpoint should "normally" output, i.e. if it was created
143 : // through the input file
144 10205 : const bool should_output = (onInterval() || _current_execute_flag == EXEC_FINAL);
145 :
146 10205 : return should_output;
147 : }
148 :
149 : void
150 10607 : Checkpoint::output()
151 : {
152 : // Create the output directory
153 10607 : const auto cp_dir = directory();
154 10607 : Utility::mkdir(cp_dir.c_str());
155 :
156 : // Create the output filename
157 10607 : const auto current_file = filename();
158 :
159 : // Create the libMesh Checkpoint_IO object
160 10607 : MeshBase & mesh = _es_ptr->get_mesh();
161 10607 : CheckpointIO io(mesh, false);
162 :
163 : // Create checkpoint file structure
164 10607 : CheckpointFileNames curr_file_struct;
165 :
166 10607 : curr_file_struct.checkpoint = current_file + _app.checkpointSuffix();
167 :
168 : // Write the checkpoint file
169 10607 : io.write(curr_file_struct.checkpoint);
170 :
171 : // Write out meta data if there is any (only on processor zero)
172 10607 : if (processor_id() == 0)
173 : {
174 9508 : const auto paths = _app.writeRestartableMetaData(curr_file_struct.checkpoint);
175 9508 : curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
176 9508 : }
177 :
178 : // Write out the backup
179 10607 : const auto paths = _app.backup(_app.restartFolderBase(current_file));
180 10603 : curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
181 :
182 : // Remove old checkpoint files
183 10603 : updateCheckpointFiles(curr_file_struct);
184 10603 : }
185 :
186 : void
187 10603 : Checkpoint::updateCheckpointFiles(CheckpointFileNames file_struct)
188 : {
189 : // It is possible to have already written a checkpoint with the same file
190 : // names contained in file_struct. If this is the case, file_struct will
191 : // already be stored in _file_names. When this happens, the current state of
192 : // the simulation is likely different than the state when the duplicately
193 : // named checkpoint was last written. Because of this, we want to go ahead and
194 : // rewrite the duplicately named checkpoint, overwritting the files
195 : // representing the old state. For accurate bookkeeping, we will delete the
196 : // existing instance of file_struct from _file_names and re-append it to the
197 : // end of _file_names (to keep the order in which checkpoints are written
198 : // accurate).
199 :
200 10603 : const auto it = std::find(_file_names.begin(), _file_names.end(), file_struct);
201 : // file_struct was found in _file_names.
202 : // Delete it so it can be re-added as the last element.
203 10603 : if (it != _file_names.end())
204 120 : _file_names.erase(it);
205 :
206 10603 : _file_names.push_back(file_struct);
207 :
208 : // Remove the file and the corresponding directory if it's empty
209 21904 : const auto remove_file = [this](const std::filesystem::path & path)
210 : {
211 21904 : std::error_code err;
212 :
213 21904 : if (!std::filesystem::remove(path, err))
214 0 : mooseWarning("Error during the deletion of checkpoint file\n",
215 0 : std::filesystem::absolute(path),
216 : "\n\n",
217 0 : err.message());
218 :
219 21904 : const auto dir = path.parent_path();
220 21904 : if (std::filesystem::is_empty(dir))
221 10952 : if (!std::filesystem::remove(dir, err))
222 0 : mooseError("Error during the deletion of checkpoint directory\n",
223 0 : std::filesystem::absolute(dir),
224 : "\n\n",
225 0 : err.message());
226 21904 : };
227 :
228 : // Remove un-wanted files
229 10603 : if (_file_names.size() > _num_files)
230 : {
231 : // Extract the filenames to be removed
232 5720 : CheckpointFileNames delete_files = _file_names.front();
233 :
234 : // Remove these filenames from the list
235 5720 : _file_names.pop_front();
236 :
237 : // Delete restartable data
238 27624 : for (const auto & path : delete_files.restart)
239 21904 : remove_file(path);
240 :
241 : // Delete checkpoint files
242 : // This file may not exist so don't worry about checking for success
243 5720 : if (processor_id() == 0)
244 5232 : CheckpointIO::cleanup(delete_files.checkpoint,
245 5232 : _problem_ptr->mesh().isDistributedMesh() ? comm().size() : 1);
246 5720 : }
247 10603 : }
248 :
249 : void
250 45430 : Checkpoint::validateExecuteOn() const
251 : {
252 45430 : const auto & execute_on = getParam<ExecFlagEnum>("execute_on");
253 181720 : const std::set<ExecFlagType> allowed = {EXEC_INITIAL, EXEC_TIMESTEP_END, EXEC_FINAL};
254 90856 : for (const auto & value : execute_on)
255 45430 : if (!allowed.count(value))
256 4 : paramError("execute_on",
257 : "The exec flag ",
258 : value,
259 : " is not allowed. Allowed flags are INITIAL, TIMESTEP_END, and FINAL.");
260 90856 : }
261 :
262 : std::stringstream
263 68927 : Checkpoint::checkpointInfo() const
264 : {
265 : static const unsigned int console_field_width = 27;
266 68927 : std::stringstream checkpoint_info;
267 :
268 68927 : std::string interval_info;
269 68927 : if (getParam<bool>("wall_time_checkpoint"))
270 : {
271 68887 : std::stringstream interval_info_ss;
272 68887 : interval_info_ss << "Every " << std::defaultfloat << _wall_time_interval << " s";
273 68887 : interval_info = interval_info_ss.str();
274 68887 : }
275 : else
276 40 : interval_info = "Disabled";
277 :
278 68927 : checkpoint_info << std::left << std::setw(console_field_width)
279 68927 : << " Wall Time Interval:" << interval_info << "\n";
280 :
281 68927 : std::string user_info;
282 68927 : if (_checkpoint_type == CheckpointType::SYSTEM_CREATED)
283 63284 : user_info = "Disabled";
284 : else
285 5643 : user_info = "Outputs/" + name();
286 :
287 68927 : checkpoint_info << std::left << std::setw(console_field_width)
288 68927 : << " User Checkpoint:" << user_info << "\n";
289 :
290 68927 : if (!((interval_info == "Disabled") && (user_info == "Disabled")))
291 : {
292 68907 : checkpoint_info << std::left << std::setw(console_field_width)
293 68907 : << " # Checkpoints Kept:" << std::to_string(_num_files) << "\n";
294 68907 : std::string exec_on_values = "";
295 137814 : for (const auto & item : _execute_on)
296 68907 : exec_on_values += item.name() + " ";
297 68907 : checkpoint_info << std::left << std::setw(console_field_width)
298 68907 : << " Execute On:" << exec_on_values << "\n";
299 68907 : }
300 :
301 137854 : return checkpoint_info;
302 68927 : }
|