# Source code for pipeline_study

import warnings
from typing import Dict, List

from cetaceo.data import BaseDataset
from cetaceo.models import Model
from cetaceo.optimization import OptunaOptimizer


class Pipeline:
    r"""
    Pipeline class to train and evaluate models.

    To optimize a model, provide an optimizer and model class. To train a model
    with fixed parameters, provide a model and training parameters.

    Args:
        train_dataset (BaseDataset): The training dataset.
        valid_dataset (BaseDataset, optional): The validation dataset. Default is ``None``.
        test_dataset (BaseDataset, optional): The test dataset. Default is ``None``.
        model (Model, optional): The model to train. Default is ``None``. If optimizer
            and model_class are provided, this is not used.
        training_params (Dict, optional): The parameters for training the model.
            Default is ``None``. If optimizer and model_class are provided, this
            is not used.
        optimizer (OptunaOptimizer, optional): The optimizer to use for optimization.
            Default is ``None``.
        model_class (Model, optional): The model class to use for optimization.
            Default is ``None``.
        evaluators (List, optional): The evaluators to use for evaluating the model.
            Default is ``None``, which is treated as an empty list.

    Raises:
        AssertionError: If neither model and training_params nor optimizer and
            model_class are provided.
    """

    def __init__(
        self,
        train_dataset: BaseDataset,
        valid_dataset: BaseDataset = None,
        test_dataset: BaseDataset = None,
        model: Model = None,
        training_params: Dict = None,
        optimizer: OptunaOptimizer = None,
        model_class=None,
        evaluators: List = None,
    ):
        self._model = model
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.valid_dataset = valid_dataset
        self.optimizer = optimizer
        self.training_params = training_params
        self.model_class = model_class
        # A mutable default argument (``evaluators=[]``) would be shared across
        # every Pipeline instance; use None as the sentinel instead.
        self.evaluators = evaluators if evaluators is not None else []

        assert (self.optimizer is not None and self.model_class is not None) or (
            self._model is not None and self.training_params is not None
        ), "Either model and training_params or optimizer and model_class must be provided"

    @property
    def model(self) -> Model:
        """
        Get the trained model.
        """
        return self._model

    def run(self):
        """
        Run the pipeline.

        If an optimizer was provided, the model and its training parameters are
        obtained from ``model_class.create_optimized_model`` first; otherwise the
        model and training parameters given to the constructor are used directly.

        Returns:
            model_output: The output of the model's fit method.
        """
        if self.optimizer is not None:
            if self.valid_dataset is None:
                # Fall back to the training data so optimization can still score trials.
                self.valid_dataset = self.train_dataset
                warnings.warn(
                    "Validation dataset not provided, using train dataset for evaluation on optimization"
                )
            self._model, self.training_params = self.model_class.create_optimized_model(
                train_dataset=self.train_dataset,
                eval_dataset=self.valid_dataset,
                optuna_optimizer=self.optimizer,
            )
        model_output = self._model.fit(
            self.train_dataset, eval_dataset=self.test_dataset, **self.training_params
        )
        if len(self.evaluators) > 0 and self.test_dataset is not None:
            print(f"{'-'*50}\nMetrics on test data:\n{'-'*50}")
            self.evaluate(self.test_dataset)
        return model_output

    def evaluate(self, dataset) -> Dict[str, float]:
        """
        Evaluate the model on a dataset (or list of datasets).

        Args:
            dataset (BaseDataset or List[BaseDataset]): The dataset(s) to evaluate
                the model on. A single dataset is accepted and treated as a
                one-element list.

        Returns:
            metrics (Dict[str, float]): The aggregated metrics evaluated on the
                dataset(s). NOTE(review): metric keys appear to be shared across
                datasets, so later datasets overwrite earlier entries in the
                returned dict — confirm whether per-dataset keys are intended.
        """
        # ``run`` passes a single test dataset while the loops below expect a
        # sequence, so normalize a lone dataset into a list first.
        if not isinstance(dataset, (list, tuple)):
            dataset = [dataset]

        metrics = {}
        y_trues = []
        xs = []
        y_preds = []
        for ds in dataset:
            # Predict on each dataset individually. (The previous code predicted
            # only on dataset[0] and then zipped those prediction rows against
            # every dataset, pairing unrelated predictions and targets.)
            y_pred = self._model.predict(ds, rescale_output=False, **self.training_params)
            rescale_output = ds.isscaled[1]
            if rescale_output:
                y_pred = ds.rescale_y(y_pred)
            try:
                x, y_true = ds[:]
                x = ds.rescale_x(x)
                if rescale_output:
                    y_true = ds.rescale_y(y_true)
            except ValueError:
                raise ValueError("Each dataset must have input and output data")
            # Store the aligned (prediction, target, input) triple for this dataset.
            y_preds.append(y_pred)
            y_trues.append(y_true)
            xs.append(x)

        # Apply every evaluator to every dataset's (y_true, y_pred, x) triple.
        for evaluator in self.evaluators:
            for j in range(len(dataset)):
                metrics.update(evaluator(y_trues[j], y_preds[j], xs[j]))
            evaluator.print_metrics()
        return metrics