Source code for pipeline

import warnings
from typing import Dict, List

from cetaceo.data import BaseDataset
from cetaceo.models import Model
from cetaceo.optimization import OptunaOptimizer


class Pipeline:
    r"""
    Pipeline class to train and evaluate models.

    To optimize a model, provide an optimizer and model class. To train a model
    with fixed parameters, provide a model and training parameters.

    Args:
        train_dataset (BaseDataset): The training dataset.
        valid_dataset (BaseDataset, optional): The validation dataset. Default is `None`.
        test_dataset (BaseDataset, optional): The test dataset. Default is `None`.
        model (Model, optional): The model to train. Default is `None`.
            If optimizer and model_class are provided, this is not used.
        training_params (Dict, optional): The parameters for training the model. Default is `None`.
            If optimizer and model_class are provided, this is not used.
        optimizer (OptunaOptimizer, optional): The optimizer to use for optimization. Default is `None`.
        model_class (Model, optional): The model class to use for optimization. Default is `None`.
        evaluators (List, optional): The evaluators to use for evaluating the model. Default is `[]`.

    Raises:
        AssertionError: If neither model and training_params nor optimizer and model_class are provided.
    """

    def __init__(
        self,
        train_dataset: BaseDataset,
        valid_dataset: BaseDataset = None,
        test_dataset: BaseDataset = None,
        model: Model = None,
        training_params: Dict = None,
        optimizer: OptunaOptimizer = None,
        model_class=None,
        evaluators: List = [],
    ):
        self._model = model
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.valid_dataset = valid_dataset
        self.optimizer = optimizer
        self.training_params = training_params
        self.model_class = model_class
        self.evaluators = evaluators

        assert (self.optimizer is not None and self.model_class is not None) or (
            self._model is not None and self.training_params is not None
        ), "Either model and training_params or optimizer and model_class must be provided"

    @property
    def model(self) -> Model:
        """
        Get the trained model.
        """
        return self._model

    def run(self):
        """
        Run the pipeline.

        Returns:
            model_output: The output of the model's fit method.
        """
        if self.optimizer is not None:
            if self.valid_dataset is None:
                self.valid_dataset = self.train_dataset
                warnings.warn(
                    "Validation dataset not provided, using train dataset for evaluation during optimization"
                )
            self._model, self.training_params = self.model_class.create_optimized_model(
                train_dataset=self.train_dataset,
                eval_dataset=self.valid_dataset,
                optuna_optimizer=self.optimizer,
            )

        model_output = self._model.fit(
            self.train_dataset, eval_dataset=self.test_dataset, **self.training_params
        )

        if len(self.evaluators) > 0 and self.test_dataset is not None:
            print(f"{'-'*50}\nMetrics on test data:\n{'-'*50}")
            self.evaluate(self.test_dataset)

        return model_output

    def evaluate(self, dataset: BaseDataset) -> Dict[str, float]:
        """
        Evaluate the model on a dataset.

        Args:
            dataset (BaseDataset): The dataset to evaluate the model on.

        Returns:
            metrics (Dict[str, float]): The metrics evaluated on the dataset.
        """
        # if the dataset is scaled, then there is no need to rescale the output
        rescale_output = dataset.isscaled[1]
        # TODO: rescale_output should always be False and should be removed;
        # it is kept like this for now because of the MLP, but it has to be changed
        y_pred = self._model.predict(dataset, rescale_output=False, **self.training_params)
        if rescale_output:
            y_pred = dataset.rescale_y(y_pred)
        # dataset.rescale_data()
        try:
            x, y_true = dataset[:]
            x = dataset.rescale_x(x)
            if rescale_output:
                y_true = dataset.rescale_y(y_true)
        except ValueError:
            raise ValueError("The dataset must have input and output data")

        metrics = {}
        for evaluator in self.evaluators:
            metrics.update(evaluator(y_true, y_pred, x))
            evaluator.print_metrics()

        return metrics
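
For reference, a minimal usage sketch of the two supported modes follows. Every object in it (MyModel, my_train_ds, my_valid_ds, my_test_ds, my_evaluator, my_optuna_optimizer) and the training parameters are hypothetical placeholders; the keys actually accepted depend on the concrete Model subclass's fit method, so this illustrates the call pattern rather than any specific cetaceo model.

# Sketch only: all names and parameters below are hypothetical, not part of the cetaceo API.

# Mode 1: train a model with fixed parameters.
pipeline = Pipeline(
    train_dataset=my_train_ds,
    test_dataset=my_test_ds,
    model=MyModel(),                      # a Model instance
    training_params={"epochs": 10},       # forwarded to model.fit(...) via **training_params
    evaluators=[my_evaluator],            # optional metric callables (see the sketch below)
)
output = pipeline.run()                   # fits the model and prints test metrics
trained_model = pipeline.model            # trained model, exposed through the `model` property

# Mode 2: optimize hyperparameters first, then train with the resulting parameters.
pipeline = Pipeline(
    train_dataset=my_train_ds,
    valid_dataset=my_valid_ds,            # used to score optimization trials
    test_dataset=my_test_ds,
    optimizer=my_optuna_optimizer,        # an OptunaOptimizer
    model_class=MyModel,                  # class exposing create_optimized_model(...)
)
output = pipeline.run()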
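
The evaluate method implies a small interface for the objects passed in evaluators: each must be callable as evaluator(y_true, y_pred, x), return a dictionary mapping metric names to values, and expose a print_metrics() method. A minimal sketch under those assumptions (MSEEvaluator is hypothetical, not a cetaceo class):

import numpy as np

class MSEEvaluator:
    """Hypothetical evaluator showing the interface Pipeline.evaluate relies on."""

    def __init__(self):
        self._metrics = {}

    def __call__(self, y_true, y_pred, x):
        # Called by Pipeline.evaluate with the true targets, the predictions and
        # the (rescaled) inputs; must return a dict of metric name -> value.
        error = np.asarray(y_true) - np.asarray(y_pred)
        self._metrics = {"mse": float(np.mean(error ** 2))}
        return self._metrics

    def print_metrics(self):
        # Also called by Pipeline.evaluate, right after the evaluator itself.
        for name, value in self._metrics.items():
            print(f"{name}: {value:.6f}")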