//* This file is part of the MOOSE framework
//* https://mooseframework.inl.gov
//*
//* All rights reserved, see COPYRIGHT for full restrictions
//* https://github.com/idaholab/moose/blob/master/COPYRIGHT
//*
//* Licensed under LGPL 2.1, please see LICENSE for details
//* https://www.gnu.org/licenses/lgpl-2.1.html

#ifdef LIBTORCH_ENABLED

#include "LibtorchArtificialNeuralNetTrainer.h"

namespace Moose
{

template <typename SamplerType>
LibtorchArtificialNeuralNetTrainer<SamplerType>::LibtorchArtificialNeuralNetTrainer(
    LibtorchArtificialNeuralNet & nn, const Parallel::Communicator & comm)
  : libMesh::ParallelObject(comm), _nn(nn)
{
}

template <typename SamplerType>
unsigned int
LibtorchArtificialNeuralNetTrainer<SamplerType>::computeBatchSize(const unsigned int num_samples,
                                                                  const unsigned int num_batches)
{
  // If more batches are requested than we have samples, we automatically decrease
  // the number of batches and put one sample in each
  if (num_samples < num_batches)
    return 1;
  // If the samples can be divided equally among the batches, we do that
  else if (num_samples % num_batches == 0)
    return num_samples / num_batches;
  // In all other cases, we compute the batch size for the requested number of batches
  // and check whether the data could be divided more evenly by putting one less sample
  // in each batch and potentially creating a new batch.
  else
  {
    const unsigned int sample_per_batch_1 = num_samples / num_batches;
    const unsigned int remainder_1 = num_samples % num_batches;
    const unsigned int sample_per_batch_2 = sample_per_batch_1 - 1;
    const unsigned int remainder_2 =
        num_samples - (num_samples / sample_per_batch_2) * sample_per_batch_2;

    const Real rel_remainder1 = Real(remainder_1) / Real(sample_per_batch_1);
    const Real rel_remainder2 = Real(remainder_2) / Real(sample_per_batch_2);

    return rel_remainder2 > rel_remainder1 ? sample_per_batch_2 : sample_per_batch_1;
  }
}
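
// Worked example: with num_samples = 21 and num_batches = 4, neither candidate
// divides the data exactly. A batch size of 21 / 4 = 5 leaves a remainder of 1
// (relative remainder 1/5 = 0.2), while a batch size of 4 also leaves a
// remainder of 1 (relative remainder 1/4 = 0.25). The larger relative remainder
// indicates the more even split, so computeBatchSize(21, 4) returns 4.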

template <typename SamplerType>
unsigned int
LibtorchArtificialNeuralNetTrainer<SamplerType>::computeLocalBatchSize(
    const unsigned int batch_size, const unsigned int num_ranks)
{
  // If we have more processors than the number of samples in this batch, we error out. We
  // do not support idle processors at the moment (at least not this way).
  if (batch_size < num_ranks)
    mooseError("The number of used processors is greater than the number of samples in the batch!");
  else if (batch_size % num_ranks == 0)
    return batch_size / num_ranks;
  else
    return batch_size / num_ranks + 1;
}
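
// Example: batch_size = 10 across num_ranks = 4 does not divide evenly, so each
// rank is assigned 10 / 4 + 1 = 3 samples per batch (a ceiling division). The
// distributed sampler created in train() then uses its allow_duplicates option
// to determine how the surplus slots are filled.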

template <typename SamplerType>
std::unique_ptr<torch::optim::Optimizer>
LibtorchArtificialNeuralNetTrainer<SamplerType>::createOptimizer(
    const LibtorchArtificialNeuralNet & nn, const LibtorchTrainingOptions & options)
{
  std::unique_ptr<torch::optim::Optimizer> optimizer;
  switch (options.optimizer_type)
  {
    case 0:
      optimizer = std::make_unique<torch::optim::Adam>(
          nn.parameters(), torch::optim::AdamOptions(options.learning_rate));
      break;
    case 1:
      optimizer = std::make_unique<torch::optim::Adagrad>(nn.parameters(), options.learning_rate);
      break;
    case 2:
      optimizer = std::make_unique<torch::optim::RMSprop>(nn.parameters(), options.learning_rate);
      break;
    case 3:
      optimizer = std::make_unique<torch::optim::SGD>(nn.parameters(), options.learning_rate);
      break;
  }
  return optimizer;
}
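
// The optimizer_type mapping above is: 0 = Adam, 1 = Adagrad, 2 = RMSprop, 3 = SGD.
// Any other value falls through the switch, so the returned unique_ptr is null;
// callers are expected to pass a valid type.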

template <typename SamplerType>
void
LibtorchArtificialNeuralNetTrainer<SamplerType>::train(LibtorchDataset & dataset,
                                                       const LibtorchTrainingOptions & options)
{
  // This is used to measure the training time. We would rather not inherit from
  // PerfGraphInterface; other objects can time this process from the outside.
  const auto t_begin = MPI_Wtime();

  /*
   * The options argument may limit the number of processors that can be used for the
   * training. In this case, every additional rank beyond the maximum behaves as rank 0.
   * This is necessary to avoid cases where (number of MPI processes) * (num_batches)
   * exceeds the number of samples.
   */
  int num_ranks = std::min(n_processors(), options.parallel_processes);
  // The real rank of the current process
  int real_rank = processor_id();
  // The capped (or used) rank of the current process
  int used_rank = real_rank < num_ranks ? real_rank : 0;

  const auto num_samples = dataset.size().value();

  if (num_ranks * options.num_batches > num_samples)
    mooseError("The number of used processors times the number of requested batches (" +
               std::to_string(num_ranks * options.num_batches) +
               ") is greater than the number of samples used for the training!");

  // Compute the number of samples in each batch
  const unsigned int sample_per_batch = computeBatchSize(num_samples, options.num_batches);

  // Compute the number of samples for this process
  const unsigned int sample_per_proc = computeLocalBatchSize(sample_per_batch, num_ranks);

  // Transform the dataset so that the loader has an easier time
  auto transformed_data_set = dataset.map(torch::data::transforms::Stack<>());

  // Create a sampler; this is mainly here to enable random sampling. The default is sequential.
  SamplerType sampler(num_samples, num_ranks, used_rank, options.allow_duplicates);

  // Generate a data loader which will build our batches for training
  auto data_loader =
      torch::data::make_data_loader(std::move(transformed_data_set), sampler, sample_per_proc);
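
  // With this setup, each rank's loader iterates only over its own shard of the
  // dataset: a global batch of sample_per_batch samples is effectively assembled
  // from the sample_per_proc-sized local batches across the num_ranks processes.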

  // Setup the optimizer
  std::unique_ptr<torch::optim::Optimizer> optimizer = createOptimizer(_nn, options);

  Real rel_loss = 1.0;
  Real initial_loss = 1.0;
  Real epoch_loss = 0.0;

  // Begin training loop
  unsigned int epoch = 1;
  while (epoch <= options.num_epochs && rel_loss > options.rel_loss_tol)
  {
    epoch_loss = 0.0;

    for (auto & batch : *data_loader)
    {
      // Reset gradients
      optimizer->zero_grad();

      // Compute prediction
      torch::Tensor prediction = _nn.forward(batch.data);

      // Compute loss values using the MSE (mean squared error)
      torch::Tensor loss = torch::mse_loss(prediction, batch.target);

      // Propagate the error back
      loss.backward();

      // If we are on a process whose rank is below the allowed limit, we actually collect the loss
      if (real_rank == used_rank)
        epoch_loss += loss.item<double>();

      // To enable parallel training, we compute the gradients of the neural net parameters
      // using backpropagation on each process and then average the gradients across processes,
      // i.e., g_avg = (1 / num_ranks) * sum of the per-rank gradients. For this, we sum the
      // gradients on every processor and then divide by the number of active processors.
      // Note: we need to zero out the gradients of the inactive processors (those beyond the
      // predefined limit) so they do not pollute the sum.
      for (auto & param : _nn.named_parameters())
      {
        if (real_rank != used_rank)
          param.value().grad().data() = param.value().grad().data() * 0.0;

        MPI_Allreduce(MPI_IN_PLACE,
                      param.value().grad().data_ptr(),
                      param.value().grad().numel(),
                      MPI_DOUBLE,
                      MPI_SUM,
                      _communicator.get());

        param.value().grad().data() = param.value().grad().data() / num_ranks;
      }

      // Use the new gradients to update the parameters
      optimizer->step();
    }

    // We also reduce the loss value to make sure every process runs the same number of epochs
    // and does not exit the loop early by hitting the relative error condition
    _communicator.sum(epoch_loss);

    epoch_loss = epoch_loss / options.num_batches / num_ranks;

    if (epoch == 1)
      initial_loss = epoch_loss;

    rel_loss = epoch_loss / initial_loss;

    // Print training information if requested
    if (options.print_loss)
      if (epoch % options.print_epoch_loss == 0 || epoch == 1)
        Moose::out << "Epoch: " << epoch << " | Loss: " << COLOR_GREEN << epoch_loss
                   << COLOR_DEFAULT << " | Rel. loss: " << COLOR_GREEN << rel_loss << COLOR_DEFAULT
                   << std::endl;

    epoch += 1;
  }
  // This is used to measure the training time. We would rather not inherit from
  // PerfGraphInterface; other objects can time this process from the outside.
  auto t_end = MPI_Wtime();

  if (options.print_loss && used_rank == 0)
    Moose::out << "Neural net training time: " << COLOR_GREEN << (t_end - t_begin) << COLOR_DEFAULT
               << " s" << std::endl;
}
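
// A minimal usage sketch (hypothetical setup: the network 'nn', communicator 'comm',
// and 'dataset' must be created by the caller; option values are illustrative):
//
//   Moose::LibtorchTrainingOptions options;
//   options.optimizer_type = 0; // Adam
//   options.learning_rate = 1e-3;
//   options.num_epochs = 100;
//   options.num_batches = 4;
//   options.parallel_processes = 1;
//
//   Moose::LibtorchArtificialNeuralNetTrainer<torch::data::samplers::DistributedRandomSampler>
//       trainer(nn, comm);
//   trainer.train(dataset, options);
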
// Explicitly instantiate for the Random and Sequential samplers
template class LibtorchArtificialNeuralNetTrainer<torch::data::samplers::DistributedRandomSampler>;

template class LibtorchArtificialNeuralNetTrainer<
    torch::data::samplers::DistributedSequentialSampler>;
} // namespace Moose

#endif
