#ifdef LIBTORCH_ENABLED

#include "LibtorchArtificialNeuralNetTrainer.h"

namespace Moose
{

template <typename SamplerType>
LibtorchArtificialNeuralNetTrainer<SamplerType>::LibtorchArtificialNeuralNetTrainer(
    LibtorchArtificialNeuralNet & nn, const Parallel::Communicator & comm)
  : libMesh::ParallelObject(comm), _nn(nn)
{
}
template <typename SamplerType>
unsigned int
LibtorchArtificialNeuralNetTrainer<SamplerType>::computeBatchSize(const unsigned int num_samples,
                                                                  const unsigned int num_batches)
{
  // If there are fewer samples than requested batches, fall back to one sample per batch
  if (num_samples < num_batches)
    return 1;
  // If the samples can be divided equally between the batches, do so
  else if (num_samples % num_batches == 0)
    return num_samples / num_batches;
  // Otherwise, compare two candidate batch sizes and pick the one whose trailing
  // (partial) batch is the fullest, i.e. whose batches are the most evenly loaded
  else
  {
    const unsigned int sample_per_batch_1 = num_samples / num_batches;
    const unsigned int remainder_1 = num_samples % num_batches;
    const unsigned int sample_per_batch_2 = sample_per_batch_1 - 1;
    const unsigned int remainder_2 =
        num_samples - (num_samples / sample_per_batch_2) * sample_per_batch_2;

    const Real rel_remainder1 = Real(remainder_1) / Real(sample_per_batch_1);
    const Real rel_remainder2 = Real(remainder_2) / Real(sample_per_batch_2);

    return rel_remainder2 > rel_remainder1 ? sample_per_batch_2 : sample_per_batch_1;
  }
}
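// Worked example (illustrative only, not part of the original source):
// computeBatchSize(22, 4) compares two candidate batch sizes:
//   sample_per_batch_1 = 22 / 4 = 5, remainder_1 = 22 % 4 = 2      -> rel_remainder1 = 2/5 = 0.4
//   sample_per_batch_2 = 4,          remainder_2 = 22 - (22/4)*4 = 2 -> rel_remainder2 = 2/4 = 0.5
// Since 0.5 > 0.4, the smaller batch size (4) is returned: its trailing partial
// batch is half full instead of 40% full, so the load is more even.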
template <typename SamplerType>
unsigned int
LibtorchArtificialNeuralNetTrainer<SamplerType>::computeLocalBatchSize(
    const unsigned int batch_size, const unsigned int num_ranks)
{
  // Error out if there are more processors than samples in the batch
  if (batch_size < num_ranks)
    mooseError(
        "The number of used processors is greater than the number of samples in the batch!");
  // If the samples can be divided equally among the ranks, do so
  else if (batch_size % num_ranks == 0)
    return batch_size / num_ranks;
  // Otherwise, round up so that every sample in the batch is covered
  else
    return batch_size / num_ranks + 1;
}
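// Worked example (illustrative only): computeLocalBatchSize(10, 4) = 10/4 + 1 = 3,
// so each of the 4 ranks processes at most 3 samples and the batch of 10 is covered.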
template <typename SamplerType>
std::unique_ptr<torch::optim::Optimizer>
LibtorchArtificialNeuralNetTrainer<SamplerType>::createOptimizer(
    const LibtorchArtificialNeuralNet & nn, const LibtorchTrainingOptions & options)
{
  std::unique_ptr<torch::optim::Optimizer> optimizer;
  switch (options.optimizer_type)
  {
    case 0:
      optimizer = std::make_unique<torch::optim::Adam>(
          nn.parameters(), torch::optim::AdamOptions(options.learning_rate));
      break;
    case 1:
      optimizer = std::make_unique<torch::optim::Adagrad>(nn.parameters(), options.learning_rate);
      break;
    case 2:
      optimizer = std::make_unique<torch::optim::RMSprop>(nn.parameters(), options.learning_rate);
      break;
    case 3:
      optimizer = std::make_unique<torch::optim::SGD>(nn.parameters(), options.learning_rate);
      break;
  }
  return optimizer;
}
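// Usage sketch (hypothetical, not part of the original source): requesting an
// optimizer through the options struct. The MooseEnum value name below is an
// assumption based on the case labels above; "my_net" is an assumed,
// already-constructed network.
//
//   LibtorchTrainingOptions my_options;
//   my_options.optimizer_type = "adam"; // assumed to map to case 0 above
//   my_options.learning_rate = 1e-3;    // used by all four optimizer types
//   auto opt = createOptimizer(my_net, my_options);
//   opt->zero_grad();                   // standard torch::optim::Optimizer interface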
template <typename SamplerType>
void
LibtorchArtificialNeuralNetTrainer<SamplerType>::train(LibtorchDataset & dataset,
                                                       const LibtorchTrainingOptions & options)
{
  // Measure the training time; this is a standalone utility, so we do not rely on
  // the performance logger
  const auto t_begin = MPI_Wtime();

  // The number of ranks that actually participate in the training
  const int num_ranks = std::min<unsigned int>(n_processors(), options.parallel_processes);
  // The real rank of the current process
  int real_rank = processor_id();
  // The capped (used) rank of the current process; ranks beyond the allowed number
  // of parallel processes are mapped back to rank 0
  int used_rank = real_rank < num_ranks ? real_rank : 0;

  const auto num_samples = dataset.size().value();

  if (num_ranks * options.num_batches > num_samples)
    mooseError("The number of used processors * number of requested batches (" +
               std::to_string(num_ranks * options.num_batches) +
               ") is greater than the number of samples used for the training!");

  // Compute the number of samples in each batch
  const unsigned int sample_per_batch = computeBatchSize(num_samples, options.num_batches);

  // Compute the number of samples each process works on within a batch
  const unsigned int sample_per_proc = computeLocalBatchSize(sample_per_batch, num_ranks);

  // Transform the dataset so that the data loader can stack the samples into tensors
  auto transformed_data_set = dataset.map(torch::data::transforms::Stack<>());

  // Create the distributed sampler; depending on SamplerType this yields sequential
  // or random sampling across the participating ranks
  SamplerType sampler(num_samples, num_ranks, used_rank, options.allow_duplicates);

  // Generate a data loader which builds the local batches for training
  auto data_loader =
      torch::data::make_data_loader(std::move(transformed_data_set), sampler, sample_per_proc);

  // Setup the optimizer based on the provided options
  std::unique_ptr<torch::optim::Optimizer> optimizer = createOptimizer(_nn, options);

  Real rel_loss = 1.0;
  Real initial_loss = 1.0;
  Real epoch_loss = 0.0;

  // Begin the training loop; stop when the maximum number of epochs is reached or
  // the relative loss drops below the prescribed tolerance
  unsigned int epoch = 1;
  while (epoch <= options.num_epochs && rel_loss > options.rel_loss_tol)
  {
    // Reset the accumulated loss for this epoch
    epoch_loss = 0.0;
    // Loop over the local batches produced by the data loader
    for (auto & batch : *data_loader)
    {
      // Reset the gradients accumulated in the previous step
      optimizer->zero_grad();

      // Compute the prediction of the neural net
      torch::Tensor prediction = _nn.forward(batch.data);

      // Compute the loss using the mean squared error (MSE)
      torch::Tensor loss = torch::mse_loss(prediction, batch.target);

      // Propagate the error backward to populate the parameter gradients
      loss.backward();

      // Only ranks that actually own samples accumulate the loss
      if (real_rank == used_rank)
        epoch_loss += loss.item<double>();

      // To enable parallel training, sum the parameter gradients across the
      // processes and distribute the averaged gradient back to every rank
      for (auto & param : _nn.named_parameters())
      {
        // Ranks that did not participate zero their gradients so they do not
        // pollute the sum
        if (real_rank != used_rank)
          param.value().grad().data() = param.value().grad().data() * 0.0;

        MPI_Allreduce(MPI_IN_PLACE,
                      param.value().grad().data_ptr(),
                      param.value().grad().numel(),
                      MPI_DOUBLE,
                      MPI_SUM,
                      _communicator.get());

        param.value().grad().data() = param.value().grad().data() / num_ranks;
      }

      // Use the averaged gradients to update the parameters
      optimizer->step();
    }
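    // Added explanatory note (not in the original source): the reduction above is
    // synchronous data-parallel training. With the per-rank gradients g_i, the
    // in-place MPI_Allreduce leaves sum_i(g_i) on every rank, and the division by
    // num_ranks yields the average g = (1/num_ranks) * sum_i(g_i), so all ranks
    // apply identical updates in optimizer->step() and stay synchronized.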
    // Reduce the loss across all ranks so every process sees the same value and
    // exits the loop at the same epoch
    _communicator.sum(epoch_loss);

    // Normalize to get the average loss per batch and rank
    epoch_loss = epoch_loss / options.num_batches / num_ranks;

    if (epoch == 1)
      initial_loss = epoch_loss;

    rel_loss = epoch_loss / initial_loss;

    // Print training information if requested
    if (options.print_loss)
      if (epoch % options.print_epoch_loss == 0 || epoch == 1)
        Moose::out << "Epoch: " << epoch << " | Loss: " << COLOR_GREEN << epoch_loss
                   << COLOR_DEFAULT << " | Rel. loss: " << COLOR_GREEN << rel_loss << COLOR_DEFAULT
                   << std::endl;

    epoch += 1;
  }

  const auto t_end = MPI_Wtime();

  if (options.print_loss)
    Moose::out << "Neural net training time: " << COLOR_GREEN << (t_end - t_begin) << COLOR_DEFAULT
               << " s" << std::endl;
}
// Explicit instantiation for the distributed sequential sampler
template class LibtorchArtificialNeuralNetTrainer<
    torch::data::samplers::DistributedSequentialSampler>;

} // namespace Moose

#endif