# Source code for acm.model.train

import numpy as np
from pathlib import Path
import logging

import torch
from pytorch_lightning import seed_everything
from sunbird.emulators import FCN, train
from sunbird.data import ArrayDataModule

from acm.utils.logging import setup_logging
# Runs at import time. NOTE(review): the handlers/format it installs are defined in
# acm.utils.logging.setup_logging, which is not visible from this file.
setup_logging()
# Module-level logger used by everything in this file.
logger = logging.getLogger('ACM trainer')


# [docs]
def TrainFCN(
    # Data from DataObservable
    lhc_y: np.ndarray,
    lhc_x: np.ndarray,
    lhc_x_names: list,
    covariance_matrix: np.ndarray,
    stat_name: str,
    model_dir: str,
    n_test: int|list,
    # Hyperparameters
    learning_rate: float,
    n_hidden: list,
    dropout_rate: float,
    weight_decay: float,
    act_fn: str = 'learned_sigmoid',
    loss: str = 'mae',
    # Training
    max_epochs: int = 5000,
    log_dir: str = None,
    seed: int = None,
    # Data transforms
    transform = None, 
    )-> float:
    """
    Train a Fully Connected Neural Network (FCN) emulator for the given statistic, with the given hyperparameters.
    This function expects the LHC data and the covariance matrix to be in the same format as the one used in the ACM pipeline.

    Parameters
    ----------
    lhc_y : np.ndarray
        LHC y data for the statistic to train on (outputs)
    lhc_x : np.ndarray
        LHC x data for the statistic to train on (inputs).
    lhc_x_names : list
        List of the names of the input parameters.
    covariance_matrix : np.ndarray
        Covariance matrix for the statistic to train on.
    stat_name : str
        Statistic to train on.
    model_dir : str, optional
        Directory to save the model. 
    n_test : int|list
        Number of training samples to select from the LHC data. Must be smaller than the total number of samples.
        If a list is provided, those indexes are used to select the test samples (excluded from the training set).
        If an integer is provided, the first n_test samples are used for testing.
        Set to 0 to use all the samples for training.
    learning_rate : float
        Learning rate for the optimizer.
    n_hidden : list
        List of integers, number of neurons in each hidden layer.
    dropout_rate : float
        Dropout rate for the hidden layers.
    weight_decay : float
        Weight decay for the optimizer.
    act_fn : str, optional
        Activation function for the hidden layers. Defaults to 'learned_sigmoid'.
    loss : str, optional
        Loss function to use. Defaults to 'mae'.
    max_epochs : int, optional
        Maximum number of epochs to train the model. Defaults to 5000.
    log_dir : str, optional
        Directory to save the pytorch lightning logs.
        If set to None, the logs are saved in the current directory. Defaults to None.
    transform : callable, optional
        Transform to apply to the output features, from the `sunbird.data.transforms` or `sunbird.data.transforms_array` modules. Defaults to None.

    Returns
    -------
    float
        Validation loss of the model.
    """
    logger.info(f'Loaded LHC with shape: {lhc_x.shape}, {lhc_y.shape}')

    logger.info(f'Loaded covariance matrix with shape: {covariance_matrix.shape}')

    if transform: 
        logger.info(f'Applying transform: {type(transform).__name__}')
        try: # Handle sunbird.data.transforms
            lhc_y = transform.fit_transform(lhc_y)
        except: # Handle sunbird.data.transforms_array
            lhc_y = transform.transform(lhc_y) 
    
    # Set the first n_test samples to the testing set 
    n_tot = len(lhc_y) # Total number of data points
    if isinstance(n_test, int):
        idx_train = list(range(n_test, n_tot))
    elif isinstance(n_test, list):
        idx_train = list(set(range(n_tot)) - set(n_test))
    
    if len(idx_train) > n_tot:
        raise ValueError(f'Number of training samples ({n_test=}) is larger than the total number of samples ({n_tot=})')

    logger.info(f'Using {len(idx_train)} last samples for training')

    lhc_train_x = lhc_x[idx_train]
    lhc_train_y = lhc_y[idx_train]

    train_mean = np.mean(lhc_train_y, axis=0)
    train_std = np.std(lhc_train_y, axis=0)

    train_mean_x = np.mean(lhc_train_x, axis=0)
    train_std_x = np.std(lhc_train_x, axis=0)

    data = ArrayDataModule(
        x=torch.Tensor(lhc_train_x),
        y=torch.Tensor(lhc_train_y), 
        val_fraction=0.2, # NOTE : Hardcoded values here : Ok ?
        batch_size=128,
        num_workers=0)
    data.setup()
    
    if seed is not None:
        logger.info(f'Setting seed to {seed}')
        seed_everything(seed, workers=True)

    model = FCN(
            n_input=data.n_input,
            n_output=data.n_output,
            n_hidden=n_hidden,
            dropout_rate=dropout_rate, 
            learning_rate=learning_rate,
            scheduler_patience=10, # NOTE : Hardcoded values here : Ok ?
            scheduler_factor=0.5,
            scheduler_threshold=1.e-6,
            weight_decay=weight_decay,
            act_fn=act_fn,
            loss=loss,
            training=True,
            mean_output=train_mean,
            std_output=train_std,
            mean_input=train_mean_x,
            std_input=train_std_x,
            transform_output=transform,
            standarize_output=True,
            coordinates=lhc_x_names,
            covariance_matrix=covariance_matrix,
        )
    
    if model_dir is not None: # To avoid some errors with Path
        model_dir = Path(model_dir) / f'{stat_name}/'
        Path(model_dir).mkdir(parents=True, exist_ok=True)

    val_loss, model, early_stop_callback = train.fit(
        data=data, model=model,
        model_dir=model_dir,
        max_epochs=max_epochs,
        log_dir=log_dir,
        devices=1,
    )
    
    return val_loss



# NOTE : toy example running TrainFCN end to end on one observable
if __name__ == '__main__':

    # Transform applied to the emulator outputs
    from sunbird.data.transforms import Log
    transform = Log()

    # Observable supplying the LHC data, the covariance and the paths (no filters for now)
    from acm.observables.emc import GalaxyCorrelationFunctionMultipoles
    tpcf = GalaxyCorrelationFunctionMultipoles()

    # Everything TrainFCN needs from the observable, read in one place
    stat_name = tpcf.stat_name
    observable_inputs = dict(
        lhc_y=tpcf.y,
        lhc_x=tpcf.x,
        lhc_x_names=tpcf.x_names,
        covariance_matrix=tpcf.get_covariance_matrix(),
        stat_name=stat_name,
        model_dir=tpcf.paths['model_dir'],
    )

    logger.info(f'Training {stat_name}')

    # Network and optimizer settings
    hyperparameters = dict(
        learning_rate=1.0e-3,
        n_hidden=[512, 512, 512, 512],
        dropout_rate=0,
        weight_decay=0,
    )

    val_loss = TrainFCN(
        **observable_inputs,
        n_test=600, # 6 first cosmologies held out for testing
        **hyperparameters,
        transform=transform,
    )

    logger.info(f'Best validation loss for {stat_name}: {val_loss}')