Source code for pyepal.plotting

# -*- coding: utf-8 -*-
# Copyright 2022 PyePAL authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Plotting utilities"""

from collections import defaultdict
from typing import List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold

from .. import PALBase

plt.rcParams["font.family"] = "sans-serif"


__all__ = [
    "plot_bar_iterations",
    "plot_pareto_front_2d",
    "plot_residuals",
    "plot_histogram",
    "plot_jointplot",
]


[docs]def plot_bar_iterations( # pylint:disable=invalid-name pareto_optimal: np.ndarray, non_pareto_points: np.ndarray, unclassified_points: np.ndarray, ax=None, ): """Plot stacked barplots for every step of the iteration. Args: pareto_optimal (np.ndarray): Number of pareto optimal points for every iteration. non_pareto_points (np.ndarray): Number of discarded points for every iteration unclassified_points (np.ndarray): Number of unclassified points for every iteration Returns: axis: matplotlib axis (the same that was provided as input or one from a new figure if no axis was provided) """ assert ( len(pareto_optimal) == len(non_pareto_points) == len(unclassified_points) ), "Make sure that the arrays have the same length" # We need numpy arrays as we assume that we can add the arrays # ToDo: We could potentially cast any iterable assert isinstance(pareto_optimal, np.ndarray), "The arguments must be numpy arrays" assert isinstance(non_pareto_points, np.ndarray), "The arguments must be numpy arrays" assert isinstance(unclassified_points, np.ndarray), "The arguments must be numpy arrays" if ax is None: _, ax = plt.subplots(1, 1) ax.bar(range(len(pareto_optimal)), unclassified_points, label="unclassified") ax.bar( range(len(pareto_optimal)), non_pareto_points, bottom=unclassified_points, label="discarded", ) ax.bar( range(len(pareto_optimal)), pareto_optimal, bottom=np.array(non_pareto_points) + np.array(unclassified_points), label="Pareto optimal", ) ax.set_xlabel("iteration (after initialization)") ax.set_ylabel("number of design points") return ax
[docs]def plot_pareto_front_2d( # pylint:disable=too-many-arguments, invalid-name y_0: np.ndarray, y_1: np.ndarray, std_0: np.ndarray, std_1: np.ndarray, palinstance: PALBase, ax=None, ): """Plot a 2D pareto front, with the different categories indicated in color. Args: y_0 (np.ndarray): objective 0 y_1 (np.ndarray): objective 1 std_0 (np.ndarray): standard deviation objective 0 std_1 (np.ndarray): standard deviation objective 0 palinstance (PALBase): PAL instance ax (axix, optional): Matplotlib figure axis. Defaults to None. Returns: ax: matplotlib axis (the same that was provided as input or one from a new figure if no axis was provided) """ # Here, we need numpy arrays for the indexing for array in [y_0, y_1, std_0, std_1]: assert isinstance(array, np.ndarray), "array must be a numpy array" assert ( len(y_0) == len(y_1) == len(std_0) == len(std_1) == palinstance.number_design_points ), "Make sure that the arrays have the same length" if ax is None: _, ax = plt.subplots(1, 1) ax.errorbar( y_0, y_1, xerr=std_0, yerr=std_1, c="gray", alpha=0.3, label="all design points", fmt=".", capsize=5, zorder=0, ) ax.scatter( y_0[palinstance.sampled_indices], y_1[palinstance.sampled_indices], c="blue", label="sampled", s=20, zorder=5, ) ax.scatter( y_0[palinstance.discarded], y_1[palinstance.discarded], c="red", label="discarded", s=15, alpha=1, zorder=10, ) ax.scatter( y_0[palinstance.pareto_optimal], y_1[palinstance.pareto_optimal], c="green", label="Pareto optimal", s=10, alpha=0.8, zorder=15, ) return ax
[docs]def plot_histogram(y: np.ndarray, palinstance: PALBase, ax=None): # pylint:disable=invalid-name """Plot histograms, with maxima scaled to one and different categories indicated in color for one objective Args: y (np.ndarray): objective (measurement) palinstance (PALBase): instance of a PAL class ax (ax): Matplotlib figure axis Returns: ax: matplotlib axis (the same that was provided as input or one from a new figure if no axis was provided)ƒ """ assert isinstance(y, np.ndarray), "Input array y must be a numpy array" assert y.ndim == 1, "Input array y must be one dimensional" assert ( len(y) == palinstance.number_design_points ), "Length of y must equal the size of the design space" if ax is None: _, ax = plt.subplots(1, 1) heights, bins = np.histogram(y) bin_width = bins[1] - bins[0] ax.bar( bins[:-1], heights / heights.max(), bin_width, label="all design points", color="gray", alpha=0.6, ) heights, bins = np.histogram(y[palinstance.sampled_indices]) bin_width = bins[1] - bins[0] ax.bar( bins[:-1], heights / heights.max(), bin_width, label="sampled", color="blue", alpha=0.6, ) heights, bins = np.histogram(y[palinstance.pareto_optimal]) bin_width = bins[1] - bins[0] ax.bar( bins[:-1], heights / heights.max(), bin_width, label="Pareto optimal", color="green", alpha=0.6, ) return ax
[docs]def plot_residuals( # pylint:disable=invalid-name y: np.array, palinstance: PALBase, labels: Union[List[str], None] = None, figsize: tuple = (6.0, 4.0), ): """Plot signed residual (on y axis) vs fitted (on x axis) plot of sampled points. Will create suplots for y.ndim > 1. Args: y (np.array): Two-dimensional array with the objectives (measurements) palinstance (PALBase): "trained" PAL instance labels (Union[List[str], None], optional): Labels for each objective. Defaults to "objective [index]". figsize (tuple, optional): Figure size for each individual residual vs fitted objective plot. Defaults to (6.0, 4.0). Returns: fig: matplotlib Figure object """ assert isinstance(y, np.ndarray), "Input array y must be a numpy array" assert ( len(y) == palinstance.number_design_points ), "Length of y must equal the size of the design space" assert y.ndim == 2, "y must be a two-dimensional numpy array" assert ( y.shape[1] == palinstance.ndim ), "y needs to be a two-dimensional array which \ column number equals the number of targets" if palinstance.means is None: raise ValueError( "Predicted means is None. Execute run_one_step() \ to obtain predicted means for each model." ) num_targets = palinstance.ndim fig, ax = plt.subplots( # pylint:disable=invalid-name num_targets, 1, figsize=(figsize[0], num_targets * figsize[1]), tight_layout=True, ) for index in range(num_targets): sampled = palinstance.sampled[:, index] fitted = palinstance._means[sampled, index] # pylint:disable=protected-access residuals = y[sampled, index] - fitted ax[index].scatter(fitted, residuals) ax[index].spines["top"].set_color("none") ax[index].spines["right"].set_color("none") if labels is None: labels = [f"objective {i}" for i in range(num_targets)] else: assert ( len(labels) == num_targets ), "If labels are provided the length of the labels list \ must equal the number of targets" for index in range(num_targets): ax[index].set_title(labels[index]) return fig
[docs]def plot_jointplot( # pylint:disable=invalid-name y: np.array, palinstance: PALBase, labels: Union[List[str], None] = None, figsize: tuple = (8.0, 6.0), ): """Plot a jointplot of the objective space with histograms on the diagonal and 2D-Pareto plots on the off-diagonal. Args: y (np.array): Two-dimensional array with the objectives (measurements) palinstance (PALBase): "trained" PAL instance labels (Union[List[str], None], optional): Labels for each objective. Defaults to "objective [index]". figsize (tuple, optional): Figure size for joint plot. Defaults to (8.0, 6.0). Returns: fig: matplotlib Figure object. """ assert isinstance(y, np.ndarray), "Input array y must be a numpy array" assert ( len(y) == palinstance.number_design_points ), "Length of y must equal the size of the design space" assert y.ndim == 2, "y must be a two-dimensional numpy array" assert ( y.shape[1] == palinstance.ndim ), "y needs to be a two-dimensional array which column number \ equals the number of targets" if (palinstance.means is None) or (palinstance.std is None) or (palinstance.beta is None): raise ValueError( "Predicted means is None. Execute run_one_step() \ to obtain predicted means for each model." ) num_targets = y.shape[1] fig, ax = plt.subplots( # pylint:disable=invalid-name num_targets, num_targets, figsize=figsize, tight_layout=True ) for row in range(num_targets): for column in range(num_targets): if row == column: plot_histogram(y[:, row], palinstance, ax[row, column]) else: plot_pareto_front_2d( y[:, row], y[:, column], palinstance.std[:, row] * np.sqrt(palinstance.beta), palinstance.std[:, column] * np.sqrt(palinstance.beta), palinstance, ax=ax[row, column], ) ax[row, column].spines["top"].set_color("none") ax[row, column].spines["right"].set_color("none") if labels is None: labels = [f"objective {i}" for i in range(num_targets)] else: assert len(labels) == num_targets for index in range(num_targets): ax[index, 0].set_ylabel(labels[index]) ax[num_targets - 1, index].set_xlabel(labels[index]) ax[0, num_targets - 1].legend() return fig
def plot_learning_curve( # pylint:disable=dangerous-default-value, too-many-arguments, too-many-locals palinstance, observations: np.ndarray, indices: np.ndarray = None, num_steps: int = 5, figsize: Tuple[float, float] = (8.0, 6.0), metrics: List[Tuple[str, callable]] = [ ("MAE", mean_absolute_error), ("MSE", mean_squared_error), ("r2", r2_score), ], k_fold: object = KFold(n_splits=5, shuffle=True), ): """Plot learning curve Args: palinstance (pyepal.PALBase): A palinstance object (must have implemented _train and _predict methods) observations (np.ndarray): y array, measurements indices (np.ndarray, optional): Indices in the design space to wish the observations belong to. If none, it will default to the first ones. Defaults to None. num_steps (int, optional): Number of points on the learning curve. The maximum point will be the number of observation. Defaults to 5. figsize (Tuple[(float, float)], optional): Figure size. Defaults to (8.0, 6.0). metrics (List[Tuple[(str, callable)], optional): List of tuples (name, function) where the function is a metric function that takes an array of true values and predictions and returns a float. Defaults to [ ("MAE", mean_absolute_error), ("MSE", mean_squared_error), ("r2", r2_score), ]. k_fold (object): Object with split method that takes an array and returns train/test indicies like the sklearn.model_selection.KFold object. Defaults to KFold(n_splits=5, shuffle=True). Returns: fig, dict: figure and dictionary with the learning curve results """ if indices is None: indices = np.arange(0, len(observations)) assert len(indices) == len(observations), "The number of indices and observations must be equal" assert len(indices) > 5, "You need to use at least five points" grid = np.linspace(5, len(observations), num_steps, dtype=np.int8) grid = np.unique(grid) true_grid = [] metrics_dict = defaultdict(list) for num_points in grid: sampled_indices = np.arange(0, num_points) sampled_indices = indices[sampled_indices] test_errors = defaultdict(list) num_training_points = None for train_idx, test_idx in k_fold.split(sampled_indices): train_idx = indices[train_idx] test_idx = [i for i in indices if i not in train_idx] num_training_points = len(train_idx) palinstance._reset() # pylint: disable=protected-access palinstance.update_train_set(train_idx, observations[train_idx]) palinstance._set_data() # pylint: disable=protected-access palinstance._set_hyperparameters() # pylint: disable=protected-access palinstance._train() # pylint: disable=protected-access palinstance._predict() # pylint: disable=protected-access for metric_name, metric_function in metrics: test_errors[metric_name].append( metric_function( observations[test_idx], palinstance._means[test_idx], # pylint:disable=protected-access ) ) for metric_name, results in test_errors.items(): metrics_dict[metric_name].append(np.mean(results)) true_grid.append(num_training_points) fig, ax = plt.subplots( # pylint:disable=invalid-name len(metrics), 1, figsize=figsize, tight_layout=True, ) true_grid = list(true_grid) for i, items in enumerate(metrics_dict.items()): metric_name, metric_errors = items ax[i].plot(true_grid, metric_errors, label=metric_name) ax[i].set_ylabel(metric_name) ax[-1].set_xlabel("number of training points") metrics_dict["grid"] = grid return fig, metrics_dict