//* This file is part of the MOOSE framework
//* https://mooseframework.inl.gov
//*
//* All rights reserved, see COPYRIGHT for full restrictions
//* https://github.com/idaholab/moose/blob/master/COPYRIGHT
//*
//* Licensed under LGPL 2.1, please see LICENSE for details
//* https://www.gnu.org/licenses/lgpl-2.1.html

#ifdef LIBTORCH_ENABLED

#include "LibtorchArtificialNeuralNetTrainer.h"

namespace Moose
{

template <typename SamplerType>
LibtorchArtificialNeuralNetTrainer<SamplerType>::LibtorchArtificialNeuralNetTrainer(
    LibtorchArtificialNeuralNet & nn, const Parallel::Communicator & comm)
  : libMesh::ParallelObject(comm), _nn(nn)
{
}

template <typename SamplerType>
unsigned int
LibtorchArtificialNeuralNetTrainer<SamplerType>::computeBatchSize(const unsigned int num_samples,
                                                                  const unsigned int num_batches)
{
  // If we have more requested batches than the number of samples, we automatically decrease
  // the number of batches and put one sample in each
  if (num_samples < num_batches)
    return 1;
  // If the samples can be divided between the batches equally, we do that
  else if (num_samples % num_batches == 0)
    return num_samples / num_batches;
  // In all other cases, we compute the batch size for the specified number of batches
  // and we check if we could divide the data more evenly by putting one less sample in each
  // batch and potentially creating a new batch.
  else
  {
    const unsigned int sample_per_batch_1 = num_samples / num_batches;
    const unsigned int remainder_1 = num_samples % num_batches;
    const unsigned int sample_per_batch_2 = sample_per_batch_1 - 1;
    const unsigned int remainder_2 =
        num_samples - (num_samples / sample_per_batch_2) * sample_per_batch_2;

    const Real rel_remainder1 = Real(remainder_1) / Real(sample_per_batch_1);
    const Real rel_remainder2 = Real(remainder_2) / Real(sample_per_batch_2);

    return rel_remainder2 > rel_remainder1 ? sample_per_batch_2 : sample_per_batch_1;
  }
}
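
// Illustration of the heuristic above (commentary only, not additional behavior): for 7 samples
// and 2 requested batches, 3 samples per batch leaves a trailing batch of only 1 sample
// (relative remainder 1/3), while 2 samples per batch leaves a trailing batch of 1 out of 2
// (relative remainder 1/2), so the more evenly filled option of 2 samples per batch is returned.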

template <typename SamplerType>
unsigned int
LibtorchArtificialNeuralNetTrainer<SamplerType>::computeLocalBatchSize(
    const unsigned int batch_size, const unsigned int num_ranks)
{
  // If we have more processors than the number of samples in this batch, we error out. We
  // do not support idle processors at the moment (at least not this way).
  if (batch_size < num_ranks)
    mooseError("The number of used processors is greater than the number of samples in the batch!");
  else if (batch_size % num_ranks == 0)
    return batch_size / num_ranks;
  else
    return batch_size / num_ranks + 1;
}

template <typename SamplerType>
std::unique_ptr<torch::optim::Optimizer>
LibtorchArtificialNeuralNetTrainer<SamplerType>::createOptimizer(
    const LibtorchArtificialNeuralNet & nn, const LibtorchTrainingOptions & options)
{
  std::unique_ptr<torch::optim::Optimizer> optimizer;
  switch (options.optimizer_type)
  {
    case 0:
      optimizer = std::make_unique<torch::optim::Adam>(
          nn.parameters(), torch::optim::AdamOptions(options.learning_rate));
      break;
    case 1:
      optimizer = std::make_unique<torch::optim::Adagrad>(nn.parameters(), options.learning_rate);
      break;
    case 2:
      optimizer = std::make_unique<torch::optim::RMSprop>(nn.parameters(), options.learning_rate);
      break;
    case 3:
      optimizer = std::make_unique<torch::optim::SGD>(nn.parameters(), options.learning_rate);
      break;
  }
  return optimizer;
}

template <typename SamplerType>
void
LibtorchArtificialNeuralNetTrainer<SamplerType>::train(LibtorchDataset & dataset,
                                                       const LibtorchTrainingOptions & options)
{
  // This is used to measure the training time. We would not like to inherit from
  // PerfGraphInterface; other objects can time this process from the outside.
  const auto t_begin = MPI_Wtime();

  /*
   * It might happen that we limit the number of processors that can be used for the training
   * through the options argument. In this case every additional rank beyond the maximum will
   * behave as rank 0. This is necessary to avoid cases when the
   * (number of MPI processes)*(num_batches) exceeds the number of samples.
   */
  int num_ranks = std::min(n_processors(), options.parallel_processes);
  // The real rank of the current process
  int real_rank = processor_id();
  // The capped rank (or used rank) of the current process
  int used_rank = real_rank < num_ranks ? real_rank : 0;

  const auto num_samples = dataset.size().value();

  if (num_ranks * options.num_batches > num_samples)
    mooseError("The number of used processors * the number of requested batches " +
               std::to_string(num_ranks * options.num_batches) +
               " is greater than the number of samples used for the training!");

  // Compute the number of samples in each batch
  const unsigned int sample_per_batch = computeBatchSize(num_samples, options.num_batches);

  // Compute the number of samples for this process
  const unsigned int sample_per_proc = computeLocalBatchSize(sample_per_batch, num_ranks);

  // Transform the dataset so that the loader has an easier time
  auto transformed_data_set = dataset.map(torch::data::transforms::Stack<>());

  // Create a sampler; this is mainly here to enable random sampling. The default is sequential.
  SamplerType sampler(num_samples, num_ranks, used_rank, options.allow_duplicates);

  // Generate a dataloader which will build our batches for training
  auto data_loader =
      torch::data::make_data_loader(std::move(transformed_data_set), sampler, sample_per_proc);
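
  // Note (added for clarity; this describes the expected behavior of the libtorch distributed
  // samplers rather than logic in this file): the sampler above partitions the sample indices
  // across the num_ranks used ranks, so each used rank iterates over its own shard of the data,
  // and allow_duplicates controls whether indices may be repeated to make the shards equal in size.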

  // Setup the optimizer
  std::unique_ptr<torch::optim::Optimizer> optimizer = createOptimizer(_nn, options);

  Real rel_loss = 1.0;
  Real initial_loss = 1.0;
  Real epoch_loss = 0.0;

  // Begin training loop
  unsigned int epoch = 1;
  while (epoch <= options.num_epochs && rel_loss > options.rel_loss_tol)
  {
    epoch_loss = 0.0;

    for (auto & batch : *data_loader)
    {
      // Reset gradients
      optimizer->zero_grad();

      // Compute prediction
      torch::Tensor prediction = _nn.forward(batch.data);

      // Compute loss values using MSE (mean squared error)
      torch::Tensor loss = torch::mse_loss(prediction, batch.target);

      // Propagate error back
      loss.backward();

      // If we are on a process whose rank is below the allowed limit, we collect the loss
      if (real_rank == used_rank)
        epoch_loss += loss.item<double>();

      // To enable parallel training, we compute the gradients of the neural net parameters
      // using backpropagation on each process and then average the gradients across processes.
      // For this we sum the data on every processor and then divide it by the number of active
      // processors. Note: we need to zero out the gradients for inactive processors (which are
      // beyond the predefined limit).
      for (auto & param : _nn.named_parameters())
      {
        if (real_rank != used_rank)
          param.value().grad().data() = param.value().grad().data() * 0.0;

        MPI_Allreduce(MPI_IN_PLACE,
                      param.value().grad().data_ptr(),
                      param.value().grad().numel(),
                      MPI_DOUBLE,
                      MPI_SUM,
                      _communicator.get());

        param.value().grad().data() = param.value().grad().data() / num_ranks;
      }

      // Use the new gradients to update the parameters
      optimizer->step();
    }

    // We also reduce the loss value to make sure every process runs the same number of epochs and
    // does not exit the loop due to hitting the relative error condition
    _communicator.sum(epoch_loss);

    epoch_loss = epoch_loss / options.num_batches / num_ranks;

    if (epoch == 1)
      initial_loss = epoch_loss;

    rel_loss = epoch_loss / initial_loss;

    // Print training information if requested
    if (options.print_loss)
      if (epoch % options.print_epoch_loss == 0 || epoch == 1)
        Moose::out << "Epoch: " << epoch << " | Loss: " << COLOR_GREEN << epoch_loss
                   << COLOR_DEFAULT << " | Rel. loss: " << COLOR_GREEN << rel_loss << COLOR_DEFAULT
                   << std::endl;

    epoch += 1;
  }

  // This is used to measure the training time (see the note at the beginning of this function)
  auto t_end = MPI_Wtime();

  if (options.print_loss && used_rank == 0)
    Moose::out << "Neural net training time: " << COLOR_GREEN << (t_end - t_begin) << COLOR_DEFAULT
               << " s" << std::endl;
}
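
// Summary of the parallel scheme in train() (commentary only): each of the num_ranks used ranks
// computes the gradient of the MSE loss on its local shard of a batch, the MPI_Allreduce sums
// these gradients in place, and the division by num_ranks turns the sum into an average, so every
// rank applies the same averaged gradient in optimizer->step(). Ranks beyond the cap zero their
// gradients beforehand and therefore do not contribute to the sum.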

// Explicitly instantiate for Random and Sequential samplers
template class LibtorchArtificialNeuralNetTrainer<torch::data::samplers::DistributedRandomSampler>;

template class LibtorchArtificialNeuralNetTrainer<
    torch::data::samplers::DistributedSequentialSampler>;
} // namespace Moose

#endif
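
// A minimal usage sketch (illustration only, not part of the framework source). It assumes that
// LibtorchTrainingOptions exposes the fields referenced above as writable members and that a
// neural net, a dataset, and a communicator are already available; the construction of those
// objects is MOOSE-specific and is elided here.
//
//   Moose::LibtorchTrainingOptions options;
//   options.optimizer_type = 0;      // Adam, see createOptimizer()
//   options.learning_rate = 1e-3;
//   options.num_epochs = 100;
//   options.num_batches = 4;
//   options.rel_loss_tol = 1e-6;
//   options.parallel_processes = 2;
//   options.allow_duplicates = false;
//   options.print_loss = true;
//   options.print_epoch_loss = 10;
//
//   Moose::LibtorchArtificialNeuralNetTrainer<torch::data::samplers::DistributedRandomSampler>
//       trainer(my_neural_net, my_communicator); // hypothetical net and communicator objects
//   trainer.train(my_dataset, options);          // hypothetical LibtorchDataset instance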