import warnings
from typing import Dict, List
from cetaceo.data import BaseDataset
from cetaceo.models import Model
from cetaceo.optimization import OptunaOptimizer
class Pipeline:
    r"""
    Pipeline class to train and evaluate models.

    To optimize a model, provide an optimizer and model class.
    To train a model with fixed parameters, provide a model and training parameters.

    Args:
        train_dataset (BaseDataset): The training dataset.
        valid_dataset (BaseDataset, optional): The validation dataset. Default is `None`.
        test_dataset (BaseDataset, optional): The test dataset. Default is `None`.
        model (Model, optional): The model to train. Default is `None`.
            If optimizer and model_class are provided, this is not used.
        training_params (Dict, optional): The parameters for training the model. Default is `None`.
            If optimizer and model_class are provided, this is not used.
        optimizer (OptunaOptimizer, optional): The optimizer to use for optimization. Default is `None`.
        model_class (Model, optional): The model class to use for optimization. Default is `None`.
        evaluators (List, optional): The evaluators to use for evaluating the model. Default is `[]`.

    Raises:
        AssertionError: If neither model and training_params nor optimizer and model_class are provided.
    """

    def __init__(
        self,
        train_dataset: BaseDataset,
        valid_dataset: BaseDataset = None,
        test_dataset: BaseDataset = None,
        model: Model = None,
        training_params: Dict = None,
        optimizer: OptunaOptimizer = None,
        model_class=None,
        evaluators: List = None,
    ):
        self._model = model
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.valid_dataset = valid_dataset
        self.optimizer = optimizer
        self.training_params = training_params
        self.model_class = model_class
        # `None` sentinel instead of a mutable `[]` default: a shared default
        # list would be silently mutated across every Pipeline instance.
        self.evaluators = evaluators if evaluators is not None else []
        # Raise explicitly rather than via `assert` so the validation survives
        # `python -O`; AssertionError is kept to honor the documented contract.
        if not (
            (self.optimizer is not None and self.model_class is not None)
            or (self._model is not None and self.training_params is not None)
        ):
            raise AssertionError(
                "Either model and training_params or optimizer and model_class must be provided"
            )

    @property
    def model(self) -> Model:
        """
        Get the trained model.
        """
        return self._model

    def run(self):
        """
        Run the pipeline.

        If an optimizer was provided, first search for optimized hyperparameters
        (falling back to the train dataset for validation, with a warning, when no
        validation dataset was given), then fit the resulting model. Finally, if
        evaluators and a test dataset are available, report test metrics.

        Returns:
            model_output: The output of the model's fit method.
        """
        if self.optimizer is not None:
            if self.valid_dataset is None:
                self.valid_dataset = self.train_dataset
                warnings.warn(
                    "Validation dataset not provided, using train dataset for evaluation on optimization"
                )
            # The optimization path replaces both the model and its training params.
            self._model, self.training_params = self.model_class.create_optimized_model(
                train_dataset=self.train_dataset,
                eval_dataset=self.valid_dataset,
                optuna_optimizer=self.optimizer,
            )
        model_output = self._model.fit(
            self.train_dataset, eval_dataset=self.test_dataset, **self.training_params
        )
        if len(self.evaluators) > 0 and self.test_dataset is not None:
            print(f"{'-'*50}\nMetrics on test data:\n{'-'*50}")
            self.evaluate(self.test_dataset)
        return model_output

    def evaluate(self, dataset) -> Dict[str, float]:
        """
        Evaluate the model on a dataset (or list of datasets).

        Args:
            dataset (BaseDataset or List[BaseDataset]): The dataset(s) to evaluate the model on.

        Returns:
            metrics (Dict[str, float]): The aggregated metrics evaluated on the dataset(s).

        Raises:
            ValueError: If a dataset does not provide both input and output data.
        """
        metrics = {}
        y_trues = []
        xs = []
        y_preds_unscaled = []
        # NOTE(review): predictions are computed from dataset[0] only, yet the loop
        # below pairs each dataset with one element of y_preds — confirm this is the
        # intended convention for multi-dataset evaluation.
        y_preds = self._model.predict(dataset[0], rescale_output=False, **self.training_params)
        for ds, y_pred in zip(dataset, y_preds):  # iterate over the datasets
            # isscaled[1] flags whether the targets were scaled and need rescaling.
            rescale_output = ds.isscaled[1]
            if rescale_output:
                y_pred = ds.rescale_y(y_pred)
            try:
                x, y_true = ds[:]
                x = ds.rescale_x(x)
                if rescale_output:
                    y_true = ds.rescale_y(y_true)
            except ValueError:
                raise ValueError("Each dataset must have input and output data")
            y_preds_unscaled.append(y_pred)
            y_trues.append(y_true)
            xs.append(x)
        # Apply each evaluator to every (y_true, y_pred, x) triple, then let the
        # evaluator print its own per-evaluator summary.
        for evaluator in self.evaluators:
            for j in range(len(dataset)):
                metrics.update(evaluator(y_trues[j], y_preds_unscaled[j], xs[j]))
            evaluator.print_metrics()
        return metrics