10 #ifdef MOOSE_LIBTORCH_ENABLED    17 template <
typename SamplerType>
    20   : 
libMesh::ParallelObject(comm), _nn(nn)
    24 template <
typename SamplerType>
    27                                                                   const unsigned int num_batches)
    31   if (num_samples < num_batches)
    34   else if (num_samples % num_batches == 0)
    35     return num_samples / num_batches;
    41     const unsigned int sample_per_batch_1 = num_samples / num_batches;
    42     const unsigned int remainder_1 = num_samples % num_batches;
    43     const unsigned int sample_per_batch_2 = sample_per_batch_1 - 1;
    44     const unsigned int remainder_2 =
    45         num_samples - (num_samples / sample_per_batch_2) * sample_per_batch_2;
    47     const Real rel_remainder1 = 
Real(remainder_1) / 
Real(sample_per_batch_1);
    48     const Real rel_remainder2 = 
Real(remainder_2) / 
Real(sample_per_batch_2);
    50     return rel_remainder2 > rel_remainder1 ? sample_per_batch_2 : sample_per_batch_1;
    54 template <
typename SamplerType>
    57     const unsigned int batch_size, 
const unsigned int num_ranks)
    61   if (batch_size < num_ranks)
    62     mooseError(
"The number of used processors is greater than the number of samples in the batch!");
    63   else if (batch_size % num_ranks == 0)
    64     return batch_size / num_ranks;
    66     return batch_size / num_ranks + 1;
    69 template <
typename SamplerType>
    70 std::unique_ptr<torch::optim::Optimizer>
    74   std::unique_ptr<torch::optim::Optimizer> optimizer;
    78       optimizer = std::make_unique<torch::optim::Adam>(
    79           nn.parameters(), torch::optim::AdamOptions(options.
learning_rate));
    82       optimizer = std::make_unique<torch::optim::Adagrad>(nn.parameters(), options.
learning_rate);
    85       optimizer = std::make_unique<torch::optim::RMSprop>(nn.parameters(), options.
learning_rate);
    88       optimizer = std::make_unique<torch::optim::SGD>(nn.parameters(), options.
learning_rate);
    94 template <
typename SamplerType>
   101   const auto t_begin = MPI_Wtime();
   111   int real_rank = processor_id();
   113   int used_rank = real_rank < num_ranks ? real_rank : 0;
   115   const auto num_samples = dataset.
size().value();
   118     mooseError(
"The number of used processors* number of requestedf batches " +
   120                " is greater than the number of samples used for the training!");
   123   const unsigned int sample_per_batch = computeBatchSize(num_samples, options.
num_batches);
   126   const unsigned int sample_per_proc = computeLocalBatchSize(sample_per_batch, num_ranks);
   129   auto transformed_data_set = dataset.map(torch::data::transforms::Stack<>());
   132   SamplerType sampler(num_samples, num_ranks, used_rank, options.
allow_duplicates);
   136       torch::data::make_data_loader(std::move(transformed_data_set), sampler, sample_per_proc);
   139   std::unique_ptr<torch::optim::Optimizer> optimizer = createOptimizer(_nn, options);
   142   Real initial_loss = 1.0;
   143   Real epoch_loss = 0.0;
   146   unsigned int epoch = 1;
   147   while (epoch <= options.num_epochs && rel_loss > options.
rel_loss_tol)
   151     for (
auto & batch : *data_loader)
   154       optimizer->zero_grad();
   157       torch::Tensor prediction = _nn.forward(batch.data);
   160       torch::Tensor loss = torch::mse_loss(prediction, batch.target);
   166       if (real_rank == used_rank)
   167         epoch_loss += loss.item<
double>();
   174       for (
auto & param : _nn.named_parameters())
   176         if (real_rank != used_rank)
   177           param.value().grad().data() = param.value().grad().data() * 0.0;
   179         MPI_Allreduce(MPI_IN_PLACE,
   180                       param.value().grad().data_ptr(),
   181                       param.value().grad().numel(),
   184                       _communicator.get());
   186         param.value().grad().data() = param.value().grad().data() / num_ranks;
   195     _communicator.sum(epoch_loss);
   197     epoch_loss = epoch_loss / options.
num_batches / num_ranks;
   200       initial_loss = epoch_loss;
   202     rel_loss = epoch_loss / initial_loss;
   207         Moose::out << 
"Epoch: " << epoch << 
" | Loss: " << COLOR_GREEN << epoch_loss
   208                    << COLOR_DEFAULT << 
" | Rel. loss: " << COLOR_GREEN << rel_loss << COLOR_DEFAULT
   215   auto t_end = MPI_Wtime();
   218     Moose::out << 
"Neural net training time: " << COLOR_GREEN << (t_end - t_begin) << COLOR_DEFAULT
   219                << 
" s" << std::endl;
   225     torch::data::samplers::DistributedSequentialSampler>;
 LibtorchArtificialNeuralNetTrainer(LibtorchArtificialNeuralNet &nn, const Parallel::Communicator &comm)
Construct using the neural network and a parallel communicator. 
 
unsigned int num_batches
Number of batches we want to split the dataset into. 
 
bool print_loss
Whether to print additional information during training. 
 
torch::optional< size_t > size() const override
Return the number of samples this data set contains. 
 
void mooseError(Args &&... args)
Emit an error message with the given stringified, concatenated args and terminate the application...
 
Templated class which is responsible for training LibtorchArtificialNeuralNets. 
 
static unsigned int computeLocalBatchSize(const unsigned int batch_size, const unsigned int num_ranks)
Computes the number of local samples. 
 
This class is a wrapper around a libtorch dataset which can be used by the data loaders in the neural...
 
The following methods are specializations for using the libMesh::Parallel::packed_range_* routines fo...
 
unsigned int parallel_processes
The number of allowed parallel processes. 
 
bool allow_duplicates
Parameter for sampling. 
 
Real learning_rate
The learning rate for the optimizers. 
 
MooseEnum optimizer_type
The type of optimizer we want to use for training; Adam is the default due to its robustness and fast...
 
Real rel_loss_tol
The relative loss tolerance where the training shall stop. 
 
DIE A HORRIBLE DEATH HERE typedef LIBMESH_DEFAULT_SCALAR_TYPE Real
 
virtual void train(LibtorchDataset &dataset, const LibtorchTrainingOptions &options)
Train the neural network using the given (serialized) dataset and the options for the training process...
 
MOOSE now contains C++17 code, so give a reasonable error message stating what the user can do to add...
 
static unsigned int computeBatchSize(const unsigned int num_samples, const unsigned int num_batches)
Computes the number of samples used for each batch. 
 
A struct containing necessary information for training neural networks. 
 
auto min(const L &left, const R &right)
 
unsigned int print_epoch_loss
How frequently the training loss is printed to the console. 
 
static std::unique_ptr< torch::optim::Optimizer > createOptimizer(const LibtorchArtificialNeuralNet &nn, const LibtorchTrainingOptions &options)
Setup the optimizer based on the provided options.