Line data Source code
1 : //* This file is part of the MOOSE framework 2 : //* https://mooseframework.inl.gov 3 : //* 4 : //* All rights reserved, see COPYRIGHT for full restrictions 5 : //* https://github.com/idaholab/moose/blob/master/COPYRIGHT 6 : //* 7 : //* Licensed under LGPL 2.1, please see LICENSE for details 8 : //* https://www.gnu.org/licenses/lgpl-2.1.html 9 : 10 : // C POSIX includes 11 : #include <sstream> 12 : #include <sys/stat.h> 13 : 14 : #include <system_error> 15 : 16 : // Moose includes 17 : #include "Checkpoint.h" 18 : #include "FEProblem.h" 19 : #include "MooseApp.h" 20 : #include "MaterialPropertyStorage.h" 21 : #include "MooseMesh.h" 22 : #include "MeshMetaDataInterface.h" 23 : #include "RestartableDataWriter.h" 24 : 25 : #include "libmesh/checkpoint_io.h" 26 : #include "libmesh/enum_xdr_mode.h" 27 : #include "libmesh/utility.h" 28 : 29 : using namespace libMesh; 30 : 31 : registerMooseObject("MooseApp", Checkpoint); 32 : 33 : InputParameters 34 99712 : Checkpoint::validParams() 35 : { 36 : // Get the parameters from the base classes 37 99712 : InputParameters params = FileOutput::validParams(); 38 : 39 199424 : params.addClassDescription("Output for MOOSE recovery checkpoint files."); 40 : 41 : // Typical checkpoint options 42 398848 : params.addParam<unsigned int>("num_files", 2, "Number of the restart files to save"); 43 398848 : params.addParam<std::string>( 44 : "suffix", 45 : "cp", 46 : "This will be appended to the file_base to create the directory name for checkpoint files."); 47 : // For checkpoints, set the wall time output interval to defualt of 1 hour (3600 s) 48 299136 : params.addParam<Real>( 49 199424 : "wall_time_interval", 3600, "The target wall time interval (in seconds) at which to output"); 50 : 51 : // Since it makes the most sense to write checkpoints at the end of time steps, 52 : // change the default value of execute_on to TIMESTEP_END 53 99712 : ExecFlagEnum & exec_enum = params.set<ExecFlagEnum>("execute_on", true); 54 199424 : exec_enum = {EXEC_TIMESTEP_END}; 55 : 56 99712 : return params; 57 99712 : } 58 : 59 48327 : Checkpoint::Checkpoint(const InputParameters & parameters) 60 : : FileOutput(parameters), 61 48327 : _num_files(getParam<unsigned int>("num_files")), 62 193308 : _suffix(getParam<std::string>("suffix")) 63 : { 64 : // Prevent the checkpoint from executing at any time other than INITIAL, 65 : // TIMESTEP_END, and FINAL 66 48327 : validateExecuteOn(); 67 : 68 : // The following updates the value of _wall_time_interval if the 69 : // '--output-wall-time-interval' command line parameter is used. 70 : // If it is not used, _wall_time_interval keeps its current value. 71 : // 'The --output-wall-time-interval parameter is necessary for testing 72 : // and should only be used in the test suite. 73 48324 : Output::setWallTimeIntervalFromCommandLineParam(); 74 48324 : } 75 : 76 : std::string 77 60446 : Checkpoint::filename() 78 : { 79 : // Get the time step with correct zero padding 80 60446 : std::ostringstream output; 81 120892 : output << directory() << "/" << std::setw(_padding) << std::setprecision(0) << std::setfill('0') 82 60446 : << std::right << timeStep(); 83 : 84 120892 : return output.str(); 85 60446 : } 86 : 87 : std::string 88 73304 : Checkpoint::directory() const 89 : { 90 146608 : return _file_base + "_" + _suffix; 91 : } 92 : 93 : bool 94 303828 : Checkpoint::shouldOutput() 95 : { 96 : // should_output_parent ensures that we output only when _execute_on contains 97 : // _current_execute_flag (see Output::shouldOutput), ensuring that we wait 98 : // until the end of the timestep to write, preventing the output of an 99 : // unconverged solution. 100 303828 : const bool should_output_parent = FileOutput::shouldOutput(); 101 303828 : if (!should_output_parent) 102 290970 : return false; // No point in continuing 103 : 104 : // Check for signal 105 : // Reading checkpoint on time step 0 is not supported 106 12858 : const bool should_output_signal = (Moose::interrupt_signal_number != 0) && (timeStep() > 0); 107 12858 : if (should_output_signal) 108 : { 109 22 : _console << "Unix signal SIGUSR1 detected. Outputting checkpoint file.\n"; 110 : // Reset signal number since we output 111 22 : Moose::interrupt_signal_number = 0; 112 22 : return true; 113 : } 114 : 115 : // Check if enough wall time has elapsed to output 116 12836 : const bool should_output_wall_time = _wall_time_since_last_output >= _wall_time_interval; 117 12836 : if (should_output_wall_time) 118 275 : return true; 119 : 120 : // Check if the checkpoint should "normally" output, i.e. if it was created 121 : // through the input file 122 12561 : const bool should_output = (onInterval() || _current_execute_flag == EXEC_FINAL); 123 : 124 12561 : return should_output; 125 : } 126 : 127 : void 128 12858 : Checkpoint::output() 129 : { 130 : // Create the output directory 131 12858 : const auto cp_dir = directory(); 132 12858 : Utility::mkdir(cp_dir.c_str()); 133 : 134 : // Create the output filename 135 12858 : const auto current_file = filename(); 136 : 137 : // Create the libMesh Checkpoint_IO object 138 12858 : MeshBase & mesh = _es_ptr->get_mesh(); 139 12858 : CheckpointIO io(mesh, false); 140 : 141 : // Create checkpoint file structure 142 12858 : CheckpointFileNames curr_file_struct; 143 : 144 12858 : curr_file_struct.checkpoint = current_file + _app.checkpointSuffix(); 145 : 146 : // Write the checkpoint file 147 12858 : io.write(curr_file_struct.checkpoint); 148 : 149 : // Write out meta data if there is any (only on processor zero) 150 12858 : if (processor_id() == 0) 151 : { 152 11261 : const auto paths = _app.writeRestartableMetaData(curr_file_struct.checkpoint); 153 11261 : curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end()); 154 11261 : } 155 : 156 : // Write out the backup 157 12858 : const auto paths = _app.backup(_app.restartFolderBase(current_file)); 158 12855 : curr_file_struct.restart.insert(curr_file_struct.restart.begin(), paths.begin(), paths.end()); 159 : 160 : // Remove old checkpoint files 161 12855 : updateCheckpointFiles(curr_file_struct); 162 12855 : } 163 : 164 : void 165 12855 : Checkpoint::updateCheckpointFiles(CheckpointFileNames file_struct) 166 : { 167 : // It is possible to have already written a checkpoint with the same file 168 : // names contained in file_struct. If this is the case, file_struct will 169 : // already be stored in _file_names. When this happens, the current state of 170 : // the simulation is likely different than the state when the duplicately 171 : // named checkpoint was last written. Because of this, we want to go ahead and 172 : // rewrite the duplicately named checkpoint, overwritting the files 173 : // representing the old state. For accurate bookkeeping, we will delete the 174 : // existing instance of file_struct from _file_names and re-append it to the 175 : // end of _file_names (to keep the order in which checkpoints are written 176 : // accurate). 177 : 178 12855 : const auto it = std::find(_file_names.begin(), _file_names.end(), file_struct); 179 : // file_struct was found in _file_names. 180 : // Delete it so it can be re-added as the last element. 181 12855 : if (it != _file_names.end()) 182 121 : _file_names.erase(it); 183 : 184 12855 : _file_names.push_back(file_struct); 185 : 186 : // Remove the file and the corresponding directory if it's empty 187 23138 : const auto remove_file = [this](const std::filesystem::path & path) 188 : { 189 23138 : std::error_code err; 190 : 191 23138 : if (!std::filesystem::remove(path, err)) 192 0 : mooseWarning("Error during the deletion of checkpoint file\n", 193 0 : std::filesystem::absolute(path), 194 : "\n\n", 195 0 : err.message()); 196 : 197 23138 : const auto dir = path.parent_path(); 198 23138 : if (std::filesystem::is_empty(dir)) 199 11569 : if (!std::filesystem::remove(dir, err)) 200 0 : mooseError("Error during the deletion of checkpoint directory\n", 201 0 : std::filesystem::absolute(dir), 202 : "\n\n", 203 0 : err.message()); 204 23138 : }; 205 : 206 : // Remove un-wanted files 207 12855 : if (_file_names.size() > _num_files) 208 : { 209 : // Extract the filenames to be removed 210 6055 : CheckpointFileNames delete_files = _file_names.front(); 211 : 212 : // Remove these filenames from the list 213 6055 : _file_names.pop_front(); 214 : 215 : // Delete restartable data 216 29193 : for (const auto & path : delete_files.restart) 217 23138 : remove_file(path); 218 : 219 : // Delete checkpoint files 220 : // This file may not exist so don't worry about checking for success 221 6055 : if (processor_id() == 0) 222 5514 : CheckpointIO::cleanup(delete_files.checkpoint, 223 5514 : _problem_ptr->mesh().isDistributedMesh() ? comm().size() : 1); 224 6055 : } 225 12855 : } 226 : 227 : void 228 48327 : Checkpoint::validateExecuteOn() const 229 : { 230 96654 : const auto & execute_on = getParam<ExecFlagEnum>("execute_on"); 231 241635 : const std::set<ExecFlagType> allowed = {EXEC_INITIAL, EXEC_TIMESTEP_END, EXEC_FINAL}; 232 97273 : for (const auto & value : execute_on) 233 48949 : if (!allowed.count(value)) 234 6 : paramError("execute_on", 235 : "The exec flag ", 236 : value, 237 : " is not allowed. Allowed flags are INITIAL, TIMESTEP_END, and FINAL."); 238 96651 : } 239 : 240 : std::stringstream 241 73507 : Checkpoint::checkpointInfo() const 242 : { 243 : static const unsigned int console_field_width = 27; 244 73507 : std::stringstream checkpoint_info; 245 : 246 73507 : std::stringstream interval_info_ss; 247 73507 : interval_info_ss << "Every " << std::defaultfloat << _wall_time_interval << " s"; 248 73507 : const std::string interval_info = interval_info_ss.str(); 249 : 250 73507 : checkpoint_info << std::left << std::setw(console_field_width) 251 73507 : << " Wall Time Interval:" << interval_info << "\n"; 252 : 253 73507 : const std::string user_info = "Outputs/" + name(); 254 : 255 73507 : checkpoint_info << std::left << std::setw(console_field_width) << " Checkpoint:" << user_info 256 73507 : << "\n"; 257 : 258 73507 : checkpoint_info << std::left << std::setw(console_field_width) 259 73507 : << " # Checkpoints Kept:" << std::to_string(_num_files) << "\n"; 260 73507 : std::string exec_on_values = ""; 261 148043 : for (const auto & item : _execute_on) 262 74536 : exec_on_values += item.name() + " "; 263 73507 : checkpoint_info << std::left << std::setw(console_field_width) 264 73507 : << " Execute On:" << exec_on_values << "\n"; 265 : 266 147014 : return checkpoint_info; 267 73507 : }