Line data Source code
1 : //* This file is part of the MOOSE framework
2 : //* https://mooseframework.inl.gov
3 : //*
4 : //* All rights reserved, see COPYRIGHT for full restrictions
5 : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT
6 : //*
7 : //* Licensed under LGPL 2.1, please see LICENSE for details
8 : //* https://www.gnu.org/licenses/lgpl-2.1.html
9 :
10 : // C POSIX includes
11 : #include <sstream>
12 : #include <sys/stat.h>
13 :
14 : #include <system_error>
15 :
16 : // Moose includes
17 : #include "Checkpoint.h"
18 : #include "FEProblem.h"
19 : #include "MooseApp.h"
20 : #include "MaterialPropertyStorage.h"
21 : #include "MooseMesh.h"
22 : #include "MeshMetaDataInterface.h"
23 : #include "RestartableDataWriter.h"
24 :
25 : #include "libmesh/checkpoint_io.h"
26 : #include "libmesh/enum_xdr_mode.h"
27 : #include "libmesh/utility.h"
28 :
29 : using namespace libMesh;
30 :
31 : registerMooseObject("MooseApp", Checkpoint);
32 :
33 : InputParameters
34 124504 : Checkpoint::validParams()
35 : {
36 : // Get the parameters from the base classes
37 124504 : InputParameters params = FileOutput::validParams();
38 :
39 : // Controls whether the checkpoint will actually run. Should only ever be changed by the
40 : // auto-checkpoint created by AutoCheckpointAction, which does not write unless a signal
41 : // is received.
42 124504 : params.addPrivateParam<CheckpointType>("checkpoint_type", CheckpointType::USER_CREATED);
43 :
44 124504 : params.addClassDescription("Output for MOOSE recovery checkpoint files.");
45 :
46 : // Typical checkpoint options
47 124504 : params.addParam<unsigned int>("num_files", 2, "Number of the restart files to save");
48 124504 : params.addParam<std::string>(
49 : "suffix",
50 : "cp",
51 : "This will be appended to the file_base to create the directory name for checkpoint files.");
52 : // For checkpoints, set the wall time output interval to defualt of 1 hour (3600 s)
53 373512 : params.addParam<Real>(
54 249008 : "wall_time_interval", 3600, "The target wall time interval (in seconds) at which to output");
55 :
56 : // Parameter to turn off wall time checkpoints
57 373512 : params.addParam<bool>(
58 249008 : "wall_time_checkpoint", true, "Whether to enable checkpoints based on elapsed wall time");
59 :
60 : // Since it makes the most sense to write checkpoints at the end of time steps,
61 : // change the default value of execute_on to TIMESTEP_END
62 124504 : ExecFlagEnum & exec_enum = params.set<ExecFlagEnum>("execute_on", true);
63 249008 : exec_enum = {EXEC_TIMESTEP_END};
64 :
65 124504 : return params;
66 124504 : }
67 :
68 48982 : Checkpoint::Checkpoint(const InputParameters & parameters)
69 : : FileOutput(parameters),
70 48982 : _checkpoint_type(getParam<CheckpointType>("checkpoint_type")),
71 48982 : _num_files(getParam<unsigned int>("num_files")),
72 97964 : _suffix(getParam<std::string>("suffix"))
73 : {
74 : // Prevent the checkpoint from executing at any time other than INITIAL,
75 : // TIMESTEP_END, and FINAL
76 48982 : validateExecuteOn();
77 :
78 : // The following updates the value of _wall_time_interval if the
79 : // '--output-wall-time-interval' command line parameter is used.
80 : // If it is not used, _wall_time_interval keeps its current value.
81 : // 'The --output-wall-time-interval parameter is necessary for testing
82 : // and should only be used in the test suite.
83 48978 : Output::setWallTimeIntervalFromCommandLineParam();
84 :
85 : // We want to do this here so it overrides --output-wall-time-interval
86 48978 : if (!getParam<bool>("wall_time_checkpoint"))
87 20 : _wall_time_interval = std::numeric_limits<Real>::max();
88 48978 : }
89 :
90 : std::string
91 58937 : Checkpoint::filename()
92 : {
93 : // Get the time step with correct zero padding
94 58937 : std::ostringstream output;
95 117874 : output << directory() << "/" << std::setw(_padding) << std::setprecision(0) << std::setfill('0')
96 58937 : << std::right << timeStep();
97 :
98 117874 : return output.str();
99 58937 : }
100 :
101 : std::string
102 69813 : Checkpoint::directory() const
103 : {
104 139626 : return _file_base + "_" + _suffix;
105 : }
106 :
107 : bool
108 3539963 : Checkpoint::shouldOutput()
109 : {
110 : // should_output_parent ensures that we output only when _execute_on contains
111 : // _current_execute_flag (see Output::shouldOutput), ensuring that we wait
112 : // until the end of the timestep to write, preventing the output of an
113 : // unconverged solution.
114 3539963 : const bool should_output_parent = FileOutput::shouldOutput();
115 3539963 : if (!should_output_parent)
116 3349655 : return false; // No point in continuing
117 :
118 : // Check for signal
119 : // Reading checkpoint on time step 0 is not supported
120 190308 : const bool should_output_signal = (Moose::interrupt_signal_number != 0) && (timeStep() > 0);
121 190308 : if (should_output_signal)
122 : {
123 32 : _console << "Unix signal SIGUSR1 detected. Outputting checkpoint file.\n";
124 : // Reset signal number since we output
125 32 : Moose::interrupt_signal_number = 0;
126 32 : return true;
127 : }
128 :
129 : // Check if enough wall time has elapsed to output
130 190276 : const bool should_output_wall_time = _wall_time_since_last_output >= _wall_time_interval;
131 190276 : if (should_output_wall_time)
132 385 : return true;
133 :
134 : // At this point, we have checked all automatic checkpoint options. If none
135 : // of those triggered, then the only way a checkpoint will still be written
136 : // is if the user defined it. If the checkpoint is purely system-created,
137 : // go ahead and return false (circumvents default time_step_interval = 1 for
138 : // auto checkpoints).
139 189891 : if (_checkpoint_type == CheckpointType::SYSTEM_CREATED)
140 179432 : return false;
141 :
142 : // Check if the checkpoint should "normally" output, i.e. if it was created
143 : // through the input file
144 10459 : const bool should_output = (onInterval() || _current_execute_flag == EXEC_FINAL);
145 :
146 10459 : return should_output;
147 : }
148 :
149 : void
150 10876 : Checkpoint::output()
151 : {
152 : // Create the output directory
153 10876 : const auto cp_dir = directory();
154 10876 : Utility::mkdir(cp_dir.c_str());
155 :
156 : // Create the output filename
157 10876 : const auto current_file = filename();
158 :
159 : // Create the libMesh Checkpoint_IO object
160 10876 : MeshBase & mesh = _es_ptr->get_mesh();
161 10876 : CheckpointIO io(mesh, false);
162 :
163 : // Create checkpoint file structure
164 10876 : CheckpointFileNames curr_file_struct;
165 :
166 10876 : curr_file_struct.checkpoint = current_file + _app.checkpointSuffix();
167 :
168 : // Write the checkpoint file
169 10876 : io.write(curr_file_struct.checkpoint);
170 :
171 : // Write out meta data if there is any (only on processor zero)
172 10876 : if (processor_id() == 0)
173 : {
174 9759 : const auto paths = _app.writeRestartableMetaData(curr_file_struct.checkpoint);
175 9759 : curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
176 9759 : }
177 :
178 : // Write out the backup
179 10876 : const auto paths = _app.backup(_app.restartFolderBase(current_file));
180 10872 : curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end());
181 :
182 : // Remove old checkpoint files
183 10872 : updateCheckpointFiles(curr_file_struct);
184 10872 : }
185 :
186 : void
187 10872 : Checkpoint::updateCheckpointFiles(CheckpointFileNames file_struct)
188 : {
189 : // It is possible to have already written a checkpoint with the same file
190 : // names contained in file_struct. If this is the case, file_struct will
191 : // already be stored in _file_names. When this happens, the current state of
192 : // the simulation is likely different than the state when the duplicately
193 : // named checkpoint was last written. Because of this, we want to go ahead and
194 : // rewrite the duplicately named checkpoint, overwritting the files
195 : // representing the old state. For accurate bookkeeping, we will delete the
196 : // existing instance of file_struct from _file_names and re-append it to the
197 : // end of _file_names (to keep the order in which checkpoints are written
198 : // accurate).
199 :
200 10872 : const auto it = std::find(_file_names.begin(), _file_names.end(), file_struct);
201 : // file_struct was found in _file_names.
202 : // Delete it so it can be re-added as the last element.
203 10872 : if (it != _file_names.end())
204 133 : _file_names.erase(it);
205 :
206 10872 : _file_names.push_back(file_struct);
207 :
208 : // Remove the file and the corresponding directory if it's empty
209 22456 : const auto remove_file = [this](const std::filesystem::path & path)
210 : {
211 22456 : std::error_code err;
212 :
213 22456 : if (!std::filesystem::remove(path, err))
214 0 : mooseWarning("Error during the deletion of checkpoint file\n",
215 0 : std::filesystem::absolute(path),
216 : "\n\n",
217 0 : err.message());
218 :
219 22456 : const auto dir = path.parent_path();
220 22456 : if (std::filesystem::is_empty(dir))
221 11228 : if (!std::filesystem::remove(dir, err))
222 0 : mooseError("Error during the deletion of checkpoint directory\n",
223 0 : std::filesystem::absolute(dir),
224 : "\n\n",
225 0 : err.message());
226 22456 : };
227 :
228 : // Remove un-wanted files
229 10872 : if (_file_names.size() > _num_files)
230 : {
231 : // Extract the filenames to be removed
232 5862 : CheckpointFileNames delete_files = _file_names.front();
233 :
234 : // Remove these filenames from the list
235 5862 : _file_names.pop_front();
236 :
237 : // Delete restartable data
238 28318 : for (const auto & path : delete_files.restart)
239 22456 : remove_file(path);
240 :
241 : // Delete checkpoint files
242 : // This file may not exist so don't worry about checking for success
243 5862 : if (processor_id() == 0)
244 5366 : CheckpointIO::cleanup(delete_files.checkpoint,
245 5366 : _problem_ptr->mesh().isDistributedMesh() ? comm().size() : 1);
246 5862 : }
247 10872 : }
248 :
249 : void
250 48982 : Checkpoint::validateExecuteOn() const
251 : {
252 48982 : const auto & execute_on = getParam<ExecFlagEnum>("execute_on");
253 195928 : const std::set<ExecFlagType> allowed = {EXEC_INITIAL, EXEC_TIMESTEP_END, EXEC_FINAL};
254 97960 : for (const auto & value : execute_on)
255 48982 : if (!allowed.count(value))
256 4 : paramError("execute_on",
257 : "The exec flag ",
258 : value,
259 : " is not allowed. Allowed flags are INITIAL, TIMESTEP_END, and FINAL.");
260 97960 : }
261 :
262 : std::stringstream
263 74600 : Checkpoint::checkpointInfo() const
264 : {
265 : static const unsigned int console_field_width = 27;
266 74600 : std::stringstream checkpoint_info;
267 :
268 74600 : std::string interval_info;
269 74600 : if (getParam<bool>("wall_time_checkpoint"))
270 : {
271 74560 : std::stringstream interval_info_ss;
272 74560 : interval_info_ss << "Every " << std::defaultfloat << _wall_time_interval << " s";
273 74560 : interval_info = interval_info_ss.str();
274 74560 : }
275 : else
276 40 : interval_info = "Disabled";
277 :
278 74600 : checkpoint_info << std::left << std::setw(console_field_width)
279 74600 : << " Wall Time Interval:" << interval_info << "\n";
280 :
281 74600 : std::string user_info;
282 74600 : if (_checkpoint_type == CheckpointType::SYSTEM_CREATED)
283 68872 : user_info = "Disabled";
284 : else
285 5728 : user_info = "Outputs/" + name();
286 :
287 74600 : checkpoint_info << std::left << std::setw(console_field_width)
288 74600 : << " User Checkpoint:" << user_info << "\n";
289 :
290 74600 : if (!((interval_info == "Disabled") && (user_info == "Disabled")))
291 : {
292 74580 : checkpoint_info << std::left << std::setw(console_field_width)
293 74580 : << " # Checkpoints Kept:" << std::to_string(_num_files) << "\n";
294 74580 : std::string exec_on_values = "";
295 149160 : for (const auto & item : _execute_on)
296 74580 : exec_on_values += item.name() + " ";
297 74580 : checkpoint_info << std::left << std::setw(console_field_width)
298 74580 : << " Execute On:" << exec_on_values << "\n";
299 74580 : }
300 :
301 149200 : return checkpoint_info;
302 74600 : }
|