LibtorchArtificialNeuralNetTrainer.C
//* This file is part of the MOOSE framework
//* https://mooseframework.inl.gov
//*
//* All rights reserved, see COPYRIGHT for full restrictions
//* https://github.com/idaholab/moose/blob/master/COPYRIGHT
//*
//* Licensed under LGPL 2.1, please see LICENSE for details
//* https://www.gnu.org/licenses/lgpl-2.1.html

#ifdef LIBTORCH_ENABLED

#include "LibtorchArtificialNeuralNetTrainer.h"

namespace Moose
{

template <typename SamplerType>
LibtorchArtificialNeuralNetTrainer<SamplerType>::LibtorchArtificialNeuralNetTrainer(
    LibtorchArtificialNeuralNet & nn, const Parallel::Communicator & comm)
  : libMesh::ParallelObject(comm), _nn(nn)
{
}

template <typename SamplerType>
unsigned int
LibtorchArtificialNeuralNetTrainer<SamplerType>::computeBatchSize(const unsigned int num_samples,
                                                                  const unsigned int num_batches)
{
  // If we have more requested batches than the number of samples, we automatically decrease
  // the number of batches and put one sample in each
  if (num_samples < num_batches)
    return 1;
  // If the samples can be divided between the batches equally, we do that
  else if (num_samples % num_batches == 0)
    return num_samples / num_batches;
  // In all other cases, we compute the batch sizes with the specified number of batches
  // and check if we could divide the data more evenly if we put one less sample in each
  // batch and potentially create a new batch.
  else
  {
    const unsigned int sample_per_batch_1 = num_samples / num_batches;
    const unsigned int remainder_1 = num_samples % num_batches;
    const unsigned int sample_per_batch_2 = sample_per_batch_1 - 1;
    const unsigned int remainder_2 =
        num_samples - (num_samples / sample_per_batch_2) * sample_per_batch_2;

    const Real rel_remainder1 = Real(remainder_1) / Real(sample_per_batch_1);
    const Real rel_remainder2 = Real(remainder_2) / Real(sample_per_batch_2);
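
    // For example: num_samples = 100 and num_batches = 7 give sample_per_batch_1 = 14
    // (remainder 2) and sample_per_batch_2 = 13 (remainder 9). Since 9/13 > 2/14, the
    // trailing partial batch is fuller with 13 samples per batch, so 13 is chosen.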
    return rel_remainder2 > rel_remainder1 ? sample_per_batch_2 : sample_per_batch_1;
  }
}

template <typename SamplerType>
unsigned int
LibtorchArtificialNeuralNetTrainer<SamplerType>::computeLocalBatchSize(
    const unsigned int batch_size, const unsigned int num_ranks)
{
  // If we have more processors than the number of samples in this batch, we error out. We
  // do not support idle processors at the moment (at least not this way).
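  // The division rounds up: e.g. batch_size = 10 with num_ranks = 4 yields ceil(10/4) = 3
  // samples per rank.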
  if (batch_size < num_ranks)
    mooseError("The number of used processors is greater than the number of samples in the batch!");
  else if (batch_size % num_ranks == 0)
    return batch_size / num_ranks;
  else
    return batch_size / num_ranks + 1;
}

template <typename SamplerType>
std::unique_ptr<torch::optim::Optimizer>
LibtorchArtificialNeuralNetTrainer<SamplerType>::createOptimizer(
    const LibtorchArtificialNeuralNet & nn, const LibtorchTrainingOptions & options)
{
  std::unique_ptr<torch::optim::Optimizer> optimizer;
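  // options.optimizer_type is a MooseEnum; the cases below map 0 -> Adam, 1 -> Adagrad,
  // 2 -> RMSprop and 3 -> SGD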
  switch (options.optimizer_type)
  {
    case 0:
      optimizer = std::make_unique<torch::optim::Adam>(
          nn.parameters(), torch::optim::AdamOptions(options.learning_rate));
      break;
    case 1:
      optimizer = std::make_unique<torch::optim::Adagrad>(nn.parameters(), options.learning_rate);
      break;
    case 2:
      optimizer = std::make_unique<torch::optim::RMSprop>(nn.parameters(), options.learning_rate);
      break;
    case 3:
      optimizer = std::make_unique<torch::optim::SGD>(nn.parameters(), options.learning_rate);
      break;
  }
  return optimizer;
}

template <typename SamplerType>
void
LibtorchArtificialNeuralNetTrainer<SamplerType>::train(LibtorchDataset & dataset,
                                                       const LibtorchTrainingOptions & options)
{
  // This is used to measure the training time. We would not like to inherit from
  // PerfGraphInterface; other objects can time this process from the outside.
  const auto t_begin = MPI_Wtime();

  /*
   * It might happen that we limit the number of processors that can be used for the training
   * through the options argument. In this case every additional rank beyond the maximum will
   * behave as rank 0. This is necessary to avoid cases when the
   * (number of MPI processes) * (num_batches) exceeds the number of samples.
   */
  int num_ranks = std::min(n_processors(), options.parallel_processes);
  // The real rank of the current process
  int real_rank = processor_id();
  // The capped (or used) rank of the current process
  int used_rank = real_rank < num_ranks ? real_rank : 0;
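
  // Example (illustration only): with 8 MPI ranks and options.parallel_processes = 4,
  // ranks 0-3 train on distinct shards of the data, while ranks 4-7 mirror rank 0's shard
  // and have their gradient contributions zeroed out before the all-reduce below.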

  const auto num_samples = dataset.size().value();

  if (num_ranks * options.num_batches > num_samples)
    mooseError("The number of used processors * the number of requested batches " +
               std::to_string(num_ranks * options.num_batches) +
               " is greater than the number of samples used for the training!");

  // Compute the number of samples in each batch
  const unsigned int sample_per_batch = computeBatchSize(num_samples, options.num_batches);

  // Compute the number of samples for this process
  const unsigned int sample_per_proc = computeLocalBatchSize(sample_per_batch, num_ranks);

  // Transform the dataset so that the loader has an easier time
  auto transformed_data_set = dataset.map(torch::data::transforms::Stack<>());

  // Create a sampler; this is mainly here to enable random sampling. The default is sequential
  SamplerType sampler(num_samples, num_ranks, used_rank, options.allow_duplicates);

  // Generate a data loader which will build our batches for training
  auto data_loader =
      torch::data::make_data_loader(std::move(transformed_data_set), sampler, sample_per_proc);

  // Set up the optimizer
  std::unique_ptr<torch::optim::Optimizer> optimizer = createOptimizer(_nn, options);

  Real rel_loss = 1.0;
  Real initial_loss = 1.0;
  Real epoch_loss = 0.0;

  // Begin training loop
  unsigned int epoch = 1;
  while (epoch <= options.num_epochs && rel_loss > options.rel_loss_tol)
  {
    epoch_loss = 0.0;

    for (auto & batch : *data_loader)
    {
      // Reset gradients
      optimizer->zero_grad();

      // Compute prediction
      torch::Tensor prediction = _nn.forward(batch.data);

      // Compute loss values using MSE (mean squared error)
      torch::Tensor loss = torch::mse_loss(prediction, batch.target);

      // Propagate error back
      loss.backward();

      // If we are on a process whose rank is below the allowed limit, we actually collect
      // the loss
      if (real_rank == used_rank)
        epoch_loss += loss.item<double>();

      // To enable parallel training, we compute the gradients of the neural net parameters
      // using backpropagation on each process and then average the gradients across processes.
      // For this we sum the data on every processor and then divide it by the number of active
      // processors. Note: we need to zero out the gradients of inactive processors (which are
      // beyond the predefined limit)
      for (auto & param : _nn.named_parameters())
      {
        if (real_rank != used_rank)
          param.value().grad().data() = param.value().grad().data() * 0.0;
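
        // Sum the local gradients across all ranks in place; using MPI_DOUBLE assumes the
        // network parameters are stored in double precision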
        MPI_Allreduce(MPI_IN_PLACE,
                      param.value().grad().data_ptr(),
                      param.value().grad().numel(),
                      MPI_DOUBLE,
                      MPI_SUM,
                      _communicator.get());

        param.value().grad().data() = param.value().grad().data() / num_ranks;
      }

      // Use the new gradients to update the parameters
      optimizer->step();
    }

    // We also reduce the loss value to make sure every process runs the same number of epochs
    // and does not exit the loop due to hitting the relative error condition
    _communicator.sum(epoch_loss);

    epoch_loss = epoch_loss / options.num_batches / num_ranks;

    if (epoch == 1)
      initial_loss = epoch_loss;

    rel_loss = epoch_loss / initial_loss;

    // Print training information if requested
    if (options.print_loss)
      if (epoch % options.print_epoch_loss == 0 || epoch == 1)
        Moose::out << "Epoch: " << epoch << " | Loss: " << COLOR_GREEN << epoch_loss
                   << COLOR_DEFAULT << " | Rel. loss: " << COLOR_GREEN << rel_loss << COLOR_DEFAULT
                   << std::endl;

    epoch += 1;
  }
  // This is used to measure the training time. We would not like to inherit from
  // PerfGraphInterface; other objects can time this process from the outside.
  auto t_end = MPI_Wtime();

  if (options.print_loss && used_rank == 0)
    Moose::out << "Neural net training time: " << COLOR_GREEN << (t_end - t_begin) << COLOR_DEFAULT
               << " s" << std::endl;
}
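
// A minimal usage sketch (illustration only; assumes an already-constructed
// LibtorchArtificialNeuralNet `nn`, a filled LibtorchDataset `dataset` and a
// Parallel::Communicator `comm`, with the remaining options left at their defaults):
//
//   LibtorchTrainingOptions options;
//   options.learning_rate = 1e-3;
//   options.num_epochs = 100;
//   options.num_batches = 10;
//
//   LibtorchArtificialNeuralNetTrainer<torch::data::samplers::DistributedRandomSampler>
//       trainer(nn, comm);
//   trainer.train(dataset, options);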

// Explicitly instantiate for Random and Sequential samplers
template class LibtorchArtificialNeuralNetTrainer<torch::data::samplers::DistributedRandomSampler>;
template class LibtorchArtificialNeuralNetTrainer<
    torch::data::samplers::DistributedSequentialSampler>;
}

#endif