Source code for pyepal.pal.validate_inputs

# -*- coding: utf-8 -*-
# Copyright 2022 PyePAL authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Methods to validate inputs for the PAL classes"""
import collections
import warnings
from typing import Any, Iterable, List, Sequence, Union

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Names exported by a star-import of this module.
# NOTE(review): ``validate_ranges`` is defined in this module but is not
# listed here -- confirm whether that omission is intentional.
__all__ = [
    "base_validate_models",
    "validate_beta_scale",
    "validate_coef_var",
    "validate_coregionalized_gpy",
    "validate_delta",
    "validate_epsilon",
    "validate_gbdt_models",
    "validate_goals",
    "validate_gpy_model",
    "validate_interquartile_scaler",
    "validate_ndim",
    "validate_njobs",
    "validate_nt_models",
    "validate_number_models",
    "validate_optimizers",
    "validate_positive_integer_list",
    "validate_sklearn_gpr_models",
    "validate_catboost_models",
]


def validate_ndim(ndim: Any) -> int:
    """Make sure that the number of dimensions makes sense.

    Args:
        ndim (Any): number of dimensions

    Raises:
        ValueError: If the number of dimensions is not an integer
        ValueError: If the number of dimensions is not greater than 0

    Returns:
        int: the number of dimensions, unchanged
    """
    if not isinstance(ndim, int):
        raise ValueError("The number of dimensions, ndim, must be a positive integer")
    if ndim <= 0:
        # Name the actual parameter (``ndim``) so the message is actionable.
        raise ValueError("ndim must be greater than 0")
    return ndim
def validate_delta(delta: Any) -> float:
    """Make sure that delta is in a reasonable range.

    Args:
        delta (Any): Delta hyperparameter

    Raises:
        ValueError: If delta is not in [0,1] (this includes NaN)

    Returns:
        float: delta, unchanged
    """
    # Phrased as a positive range test: every comparison with NaN is False,
    # so NaN fails this check instead of slipping through.
    if not 0 <= delta <= 1:
        raise ValueError("The delta values must be in [0,1]")
    return delta
def validate_beta_scale(beta_scale: Any) -> float:
    """Make sure that the scaling factor for beta is not negative.

    Args:
        beta_scale (Any): scaling factor for beta

    Raises:
        ValueError: If beta_scale is smaller than 0 or NaN

    Returns:
        float: scaling factor for beta, unchanged
    """
    # ``not x >= 0`` (rather than ``x < 0``) also rejects NaN.
    # Zero is still accepted, as before.
    if not beta_scale >= 0:
        raise ValueError("The beta_scale values must be positive")
    return beta_scale
def validate_epsilon(epsilon: Any, ndim: int) -> np.ndarray:
    """Validate epsilon and return a np.array.

    Args:
        epsilon (Any): Epsilon hyperparameter. Either a single number or a
            list / np.ndarray with one value per objective.
        ndim (int): Number of dimensions/objectives

    Raises:
        ValueError: If epsilon is a list or an array and its length
            is not equal to ndim
        ValueError: If any epsilon value is not in [0,1] (this includes NaN)

    Returns:
        np.ndarray: Array of one epsilon per objective. An array input is
            returned as-is, a list is converted, and a scalar is repeated
            ndim times.
    """
    if isinstance(epsilon, (list, np.ndarray)):
        container = "a list" if isinstance(epsilon, list) else "an array"
        if len(epsilon) != ndim:
            raise ValueError(
                f"If epsilon is provided as {container}, "
                "there must be one float per dimension"
            )
        for value in epsilon:
            # Positive range test so that NaN entries are rejected as well.
            if not 0 <= value <= 1:
                raise ValueError("The epsilon values must be in [0,1]")
        return epsilon if isinstance(epsilon, np.ndarray) else np.array(epsilon)

    if not 0 <= epsilon <= 1:
        raise ValueError("The epsilon values must be in [0,1]")

    warnings.warn(
        "Only one epsilon value provided, will automatically expand "
        "to use the same value in every dimension",
        UserWarning,
    )
    return np.array([epsilon] * ndim)
def validate_goals(goals: Any, ndim: int) -> np.ndarray:  # pylint:disable=too-many-branches
    """Create a valid array of goals.

    1 for maximization, -1 for objectives that are to be minimized.

    Args:
        goals (Any): None, or a list of goals. Each goal is either a string
            containing 'max' (maximization) or 'min' (minimization),
            compared case-insensitively, or one of the integers 1 and -1.
        ndim (int): number of dimensions

    Raises:
        ValueError: If goals is neither None nor a list
        ValueError: If goals is a list and the length is not equal to ndim
        ValueError: If an element is not a 'min'/'max' string or -1 / 1

    Returns:
        np.ndarray: Array of -1 and 1
    """
    if goals is None:
        warnings.warn(
            "No goals provided, will assume that every dimension should be maximized",
            UserWarning,
        )
        return np.array([1] * ndim)

    if not isinstance(goals, list):
        raise ValueError(
            "Goal can be set to None or must be a list of strings "
            "or -1 and 1 of length equal to ndim"
        )

    if len(goals) != ndim:
        raise ValueError("If goals is a list, the length must be equal to the ndim")

    clean_goals = []
    for goal in goals:
        if isinstance(goal, str):
            lowered = goal.lower()
            # Substring match, so e.g. "maximize" / "Minimise" are accepted.
            if "max" in lowered:
                clean_goals.append(1)
            elif "min" in lowered:
                clean_goals.append(-1)
            else:
                raise ValueError("The strings in the goals list must be min or max")
        elif isinstance(goal, int):
            if goal == 1:
                clean_goals.append(1)
            elif goal == -1:
                clean_goals.append(-1)
            else:
                raise ValueError("The ints in the goals list must be 1 or -1")
        else:
            # Reject unsupported element types explicitly instead of silently
            # dropping them and returning an array of the wrong length.
            raise ValueError(
                "If goals is a list, it must be a list of strings ('min'/'max') "
                "or of the integers 1 and -1"
            )

    return np.array(clean_goals)
def base_validate_models(models: Any) -> list:
    """Return ``models`` unchanged as long as it is truthy.

    No deeper validation happens here; the only requirement is that
    something non-empty was provided.

    Args:
        models (Any): The models to check

    Raises:
        ValueError: If ``models`` is falsy (e.g. None or an empty list)

    Returns:
        list: The models that were passed in
    """
    if not models:
        raise ValueError("You must provide some models to initialize pyepal")
    return models
def validate_number_models(models: Any, ndim: int):
    """Check that ``models`` is a list with exactly one entry per objective.

    Args:
        models (Any): List of models
        ndim (int): Number of objectives

    Raises:
        ValueError: If ``models`` is not a list or if its length differs
            from the number of objectives
    """
    # ``len`` is only evaluated once we know we are dealing with a list.
    one_model_per_objective = isinstance(models, list) and len(models) == ndim
    if not one_model_per_objective:
        raise ValueError("You must provide a list of models. One model per objective")
def validate_gpy_model(models: Any):
    """Check that every element of ``models`` is a ``GPy.models.GPRegression``.

    GPy is imported lazily inside the function.

    Args:
        models (Any): Iterable of models

    Raises:
        ValueError: If any element is not a ``GPy.models.GPRegression``
    """
    import GPy  # pylint:disable=import-outside-toplevel

    all_gp_regressions = all(
        isinstance(candidate, GPy.models.GPRegression) for candidate in models
    )
    if not all_gp_regressions:
        raise ValueError("The models must be an instance of GPy.model")
def validate_coregionalized_gpy(models: Any):
    """Make sure that the model is a coregionalized GPR model.

    Only the first element of the list is inspected.

    Args:
        models (Any): List whose first element is the coregionalized model

    Raises:
        ValueError: If ``models`` is not a list or is an empty list
        ValueError: If the first element is not a
            ``GPCoregionalizedRegression`` instance
    """
    # An empty list would otherwise fail with an IndexError at ``models[0]``.
    if not isinstance(models, list) or not models:
        raise ValueError("You must provide a list of models with one element")

    from ..models.coregionalized import (  # pylint:disable=import-outside-toplevel
        GPCoregionalizedRegression,
    )

    if not isinstance(models[0], GPCoregionalizedRegression):
        raise ValueError("Model must be a GPCoregionalized regression object from this package!")
def validate_njobs(njobs: Any) -> int:
    """Make sure that njobs is an int >= 1.

    Args:
        njobs (Any): Number of jobs

    Raises:
        ValueError: If njobs is not an int
        ValueError: If njobs is smaller than 1

    Returns:
        int: njobs, unchanged
    """
    if isinstance(njobs, int):
        if njobs >= 1:
            return njobs
        raise ValueError("njobs must be a number greater equal 1")
    raise ValueError("njobs musst be of type int")
def validate_coef_var(coef_var: Any) -> float:
    """Make sure that the coef_var makes sense.

    Args:
        coef_var (Any): The value to check

    Raises:
        ValueError: If coef_var is neither a float nor an int
        ValueError: If coef_var is not greater than 0 (this includes NaN)

    Returns:
        float: coef_var, unchanged
    """
    if not isinstance(coef_var, (float, int)):
        raise ValueError("coef_var must be of type float or int")
    # ``not x > 0`` (rather than ``x <= 0``) also rejects NaN.
    if not coef_var > 0:
        raise ValueError("coef_var must be greater 0")
    return coef_var
def _validate_sklearn_gpr_model(model: Any) -> GaussianProcessRegressor:
    """Return the GaussianProcessRegressor behind ``model``.

    A GaussianProcessRegressor is returned as-is. For a RandomizedSearchCV
    or GridSearchCV instance the ``best_estimator_`` is extracted, which
    must itself be a GaussianProcessRegressor.

    Args:
        model (Any): The model or search object to check

    Raises:
        ValueError: If a search instance has no ``best_estimator_``
        ValueError: If the ``best_estimator_`` of a search instance is not a
            GaussianProcessRegressor
        ValueError: If ``model`` is of any other type

    Returns:
        GaussianProcessRegressor: The validated regressor
    """
    if isinstance(model, (RandomizedSearchCV, GridSearchCV)):
        try:
            best_estimator = model.best_estimator_
        except AttributeError as not_fitted_exception:
            raise ValueError(
                "If you provide a grid or random search instance it needs to be fitted."
            ) from not_fitted_exception

        if not isinstance(best_estimator, GaussianProcessRegressor):
            raise ValueError(
                """If you provide a grid or random search instance, it needs to contain a GaussianProcessRegressor instance."""
            )
        return best_estimator

    if isinstance(model, GaussianProcessRegressor):
        return model

    raise ValueError("You need to provide a GaussianProcessRegressor instance.")
def validate_sklearn_gpr_models(models: Any, ndim: int) -> List[GaussianProcessRegressor]:
    """Validate a list of GPR models, one model per objective.

    Args:
        models (Any): List of GaussianProcessRegressor instances or
            grid / random search instances
        ndim (int): Number of objectives

    Raises:
        ValueError: Propagated from ``validate_number_models`` and
            ``_validate_sklearn_gpr_model``

    Returns:
        List[GaussianProcessRegressor]: The per-element results of
            ``_validate_sklearn_gpr_model``, in input order
    """
    validate_number_models(models, ndim)
    return [_validate_sklearn_gpr_model(candidate) for candidate in models]
def _validate_quantile_loss(lightgbmregressor):
    """Check that a regressor is configured for quantile loss.

    Args:
        lightgbmregressor: Object exposing ``alpha`` and ``objective``
            attributes

    Raises:
        ValueError: If ``alpha`` or ``objective`` is missing
        ValueError: If ``objective`` is not ``"quantile"``
        ValueError: If ``alpha`` is not greater than 0
    """
    not_quantile_message = """Make sure that you initialize at least the first and last model with quantile loss. """

    try:
        alpha = lightgbmregressor.alpha
        loss = lightgbmregressor.objective
    except AttributeError as missing_attribute:
        raise ValueError(not_quantile_message) from missing_attribute

    if loss != "quantile":
        raise ValueError(not_quantile_message)

    # Raise explicitly rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable this check.
    if not alpha > 0:
        raise ValueError("The alpha of a quantile loss model must be greater than 0.")
def validate_gbdt_models(models: Any, ndim: int) -> List[Iterable]:
    """Validate the LightGBM model triples, one triple per objective.

    Every element of ``models`` must have length three and contain only
    LGBMRegressor instances. The first and the last regressor of each
    triple are additionally checked with ``_validate_quantile_loss``;
    the middle one is not.

    Args:
        models (Any): List of triples of LGBMRegressor instances
        ndim (int): Number of objectives

    Raises:
        ValueError: If the number of triples does not match ndim, if a
            triple does not have three elements, if an element is not an
            LGBMRegressor, or if the quantile loss check fails

    Returns:
        List[Iterable]: The models, unchanged
    """
    validate_number_models(models, ndim)

    from lightgbm import LGBMRegressor  # pylint:disable=import-outside-toplevel

    malformed_message = """The model list must contain tuples with three LGBMRegressor instances. """

    for triple in models:
        if len(triple) != 3:
            raise ValueError(malformed_message)

        for position, regressor in enumerate(triple):
            if not isinstance(regressor, LGBMRegressor):
                raise ValueError(malformed_message)

            is_middle_model = position == 1
            if not is_middle_model:
                _validate_quantile_loss(regressor)

    return models
def validate_catboost_models(models: Any, ndim: int) -> List[Iterable]:
    """Validate the CatBoost models, one model per objective.

    Every model must be a CatBoostRegressor whose ``loss_function``
    parameter is ``RMSEWithUncertainty`` and whose ``posterior_sampling``
    parameter is truthy.

    Args:
        models (Any): List of CatBoostRegressor instances
        ndim (int): Number of objectives

    Raises:
        ValueError: If the number of models does not match ndim, or if a
            model fails one of the checks described above

    Returns:
        List[Iterable]: The models, unchanged
    """
    validate_number_models(models, ndim)

    from catboost import CatBoostRegressor  # pylint:disable=import-outside-toplevel

    for regressor in models:
        if not isinstance(regressor, CatBoostRegressor):
            raise ValueError("""The models must be an instance of CatBoostRegressor""")

        loss_function = regressor.get_param("loss_function")
        if loss_function != "RMSEWithUncertainty":
            raise ValueError(
                """The models must be an instance of CatBoostRegressor with RMSEWithUncertainty loss function"""
            )

        uses_posterior_sampling = regressor.get_param("posterior_sampling")
        if not uses_posterior_sampling:
            raise ValueError(
                """The models must be an instance of CatBoostRegressor with posterior_sampling set to True"""
            )

    return models
def validate_interquartile_scaler(interquartile_scaler: Any) -> float:
    """Make sure that the interquartile_scaler makes sense.

    Args:
        interquartile_scaler (Any): The value to check

    Raises:
        ValueError: If interquartile_scaler is neither a float nor an int
        ValueError: If interquartile_scaler is negative or NaN

    Returns:
        float: interquartile_scaler, unchanged
    """
    if not isinstance(interquartile_scaler, (float, int)):
        raise ValueError("interquartile_scaler must be a number.")
    # ``not x >= 0`` (rather than ``x < 0``) also rejects NaN.
    # NOTE(review): 0 is accepted although the message says "greater 0" --
    # confirm which of the two is intended.
    if not interquartile_scaler >= 0:
        raise ValueError("interquartile_scaler must be a number greater 0.")
    return interquartile_scaler
def _is_jaxoptimizer(optimizer: Any) -> bool:
    """Tell whether ``optimizer`` is a ``JaxOptimizer`` instance.

    ``JaxOptimizer`` is imported lazily from ``..models.nt``.
    """
    from ..models.nt import JaxOptimizer  # pylint:disable=import-outside-toplevel

    is_jax_optimizer = isinstance(optimizer, JaxOptimizer)
    return is_jax_optimizer
def validate_optimizers(optimizers: Any, ndim: int) -> Sequence:
    """Validate a sequence of JaxOptimizer instances, one per objective.

    Args:
        optimizers (Any): Sequence of optimizers
        ndim (int): Number of objectives

    Raises:
        ValueError: If ``optimizers`` is not a sequence
        ValueError: If its length differs from ndim
        ValueError: If an element is not a JaxOptimizer

    Returns:
        Sequence: The optimizers, unchanged
    """
    if not isinstance(optimizers, Sequence):
        raise ValueError("You must have one optimizer per objective.")

    if len(optimizers) != ndim:
        raise ValueError("If you provide a sequence it must have one optimizer per objective.")

    if not all(_is_jaxoptimizer(candidate) for candidate in optimizers):
        raise ValueError("You need to provide a `pyepal.models.nt.JaxOptimizer` instance")

    return optimizers
def validate_nt_models(models: Any, ndim: int) -> Sequence:
    """Make sure that we can work with a sequence of
    :py:class:`pyepal.models.nt.NTModel`, one model per objective.

    Args:
        models (Any): Sequence of NTModel instances
        ndim (int): Number of objectives

    Raises:
        ValueError: If ``models`` is not a sequence
        ValueError: If its length differs from ndim
        ValueError: If an element is not an NTModel

    Returns:
        Sequence: The models, unchanged
    """
    # ``collections.Sequence`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``.
    from collections.abc import Sequence as AbcSequence  # pylint:disable=import-outside-toplevel

    if not isinstance(models, AbcSequence):
        raise ValueError("You need to provide a sequence of `pyepal.models.nt.NTModel` instances")

    # Checked once, outside the loop, so that an empty sequence is also
    # rejected when ndim is not 0.
    if len(models) != ndim:
        raise ValueError("You need to provide one model per objective.")

    from pyepal.models.nt import NTModel  # pylint:disable=import-outside-toplevel

    for model in models:
        if not isinstance(model, NTModel):
            raise ValueError(
                "You need to provide a sequence of `pyepal.models.nt.NTModel` instances"
            )
    return models
def validate_positive_integer_list(
    seq: Any, ndim: int, parameter_name: str = "Parameter"
) -> Sequence[int]:
    """Validate and standardize a per-objective positive integer setting.

    Can be used, e.g., to validate and standardize the ensemble size and
    epochs input.

    Args:
        seq (Any): Either a single positive integer or a sequence with one
            positive integer per objective
        ndim (int): Number of objectives
        parameter_name (str): Name used in the error messages.
            Defaults to "Parameter".

    Raises:
        ValueError: If a single value is given that is not an int >= 1
        ValueError: If a sequence is given whose length differs from ndim
        ValueError: If a sequence contains an element that is not an
            int >= 1

    Returns:
        Sequence[int]: ``[seq] * ndim`` for a single value, otherwise the
            sequence itself, unchanged
    """
    # ``collections.Sequence`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``.
    from collections.abc import Sequence as AbcSequence  # pylint:disable=import-outside-toplevel

    def _is_positive_int(value: Any) -> bool:
        """Tell whether ``value`` is an int that is at least 1."""
        return isinstance(value, int) and value >= 1

    if not isinstance(seq, AbcSequence):
        if not _is_positive_int(seq):
            raise ValueError(f"{parameter_name} must be a positive integer")
        return [seq] * ndim

    if len(seq) != ndim:
        raise ValueError(
            f"If you provide a sequence for {parameter_name} its length must match "
            "the number of objectives"
        )

    if not all(_is_positive_int(elem) for elem in seq):
        raise ValueError(f"{parameter_name} must be a positive integer")

    return seq
def validate_ranges(ranges: Any, ndim: int) -> Union[None, np.ndarray]:
    """Check the number of ranges and that every range is positive.

    Args:
        ranges (Any): List or array with one range per objective. Any other
            type is treated as "no ranges given".
        ndim (int): Number of objectives

    Raises:
        ValueError: If the number of elements differs from ndim
        ValueError: If an element is not greater than 0

    Returns:
        Union[None, np.ndarray]: None if ``ranges`` is neither a list nor an
            array, otherwise the ranges as an array
    """
    if isinstance(ranges, (np.ndarray, list)):
        if len(ranges) != ndim:
            raise ValueError(
                "The number of elements in ranges must match the number of objectives."
            )
        if any(not elem > 0 for elem in ranges):
            raise ValueError("Ranges must be positive.")
        return np.array(ranges)

    return None