----- LICENSE -----

MIT License

Copyright (c) 2019 Luminovo GmbH

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

----- metriculous/__init__.py -----

from metriculous import evaluators
from metriculous import utilities
from metriculous._comparison import Comparator
from metriculous._comparison import Comparison
from metriculous._evaluation import Evaluation
from metriculous._evaluation import Evaluator
from metriculous._evaluation import Quantity

__all__ = [
    "Comparator",
    "Comparison",
    "Evaluator",
    "Evaluation",
    "Quantity",
    "evaluators",
    "utilities",
]

----- metriculous/__init___import_star_test.py -----

from metriculous import *  # noqa


def test_import_star():
    _ = Quantity("q", 42.0)  # noqa
    e = Evaluator()  # noqa
    _ = Evaluation([], "MyModel")  # noqa
    _ = Comparator(evaluator=e)  # noqa
    _ = Comparison([])  # noqa
    _ = evaluators.ClassificationEvaluator()  # noqa

----- metriculous/__init___test.py -----

import pytest
from assertpy import assert_that


def test_exposed_entities():
    expected = [
        "Comparator",
        "Comparison",
        "Evaluator",
        "Evaluation",
        "Quantity",
        "evaluators",
        "utilities",
    ]
    import metriculous

    assert_that(metriculous.__all__).is_equal_to(expected)


def test_imports_from_style():
    from metriculous import Comparator
    from metriculous import Comparison
    from metriculous import Evaluation
    from metriculous import Evaluator
    from metriculous import Quantity

    num_classes = 42

    _ = Quantity("q", 42.0)
    e = Evaluator()
    _ = Evaluation("MyModel", [], [])
    _ = Comparator(evaluator=e)
    _ = Comparison([])

    with pytest.raises(ImportError):
        # noinspection PyUnresolvedReferences,PyProtectedMember
        from metriculous import ClassificationEvaluator

        _ = ClassificationEvaluator()

    from metriculous.evaluators import ClassificationEvaluator

    _ = ClassificationEvaluator()

    with pytest.raises(ImportError):
        # noinspection PyUnresolvedReferences,PyProtectedMember
        from metriculous import SegmentationEvaluator

        _ = SegmentationEvaluator(num_classes)

    from metriculous.evaluators import SegmentationEvaluator

    _ = SegmentationEvaluator(num_classes)


def test_imports_prefix_style():
    import metriculous as met

    num_classes = 42

    _ = met.Quantity("q", 42.0)
    e = met.Evaluator()
    _ = met.Evaluation("MyModel", [], [])
    _ = met.Comparator(evaluator=e)
    _ = met.Comparison([])
    _ = met.evaluators.ClassificationEvaluator()
    _ = met.evaluators.SegmentationEvaluator(num_classes)
    _ = met.utilities.sample_weights_simulating_class_distribution(
        [0, 1, 2, 2], [0.8, 0.2, 0.0]
    )

    with pytest.raises(AttributeError):
        # noinspection PyUnresolvedReferences
        _ = met.ClassificationEvaluator()
        _ = met.SegmentationEvaluator(num_classes)
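The package-level tests above exercise the entire public surface of metriculous. As a quick orientation, the following is a usage sketch (not part of the package) showing how those pieces typically fit together on a small synthetic three-class problem; the toy data and all variable names are illustrative.

import numpy as np

import metriculous as met

rng = np.random.RandomState(42)
class_indices = rng.randint(0, 3, size=100)
targets_one_hot = np.eye(3)[class_indices]  # one ground-truth distribution per row

# Two toy "models": a noisy copy of the target and a uniform baseline.
prediction_a = targets_one_hot + 0.1 * rng.rand(100, 3)
prediction_a /= prediction_a.sum(axis=1, keepdims=True)  # rows must sum to 1
prediction_b = np.full((100, 3), 1.0 / 3.0)

comparator = met.Comparator(evaluator=met.evaluators.ClassificationEvaluator())
comparison = comparator.compare(
    ground_truth=targets_one_hot,
    model_predictions=[prediction_a, prediction_b],
    model_names=["NoisyModel", "UniformBaseline"],
)
# comparison.display()  # renders the tables and bokeh figures in a Jupyter notebook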
PK!L,Q$$$$metriculous/_comparison.pyimport os from dataclasses import dataclass from typing import Any, Optional from typing import List from typing import Sequence import bokeh.layouts import numpy as np import pandas as pd from assertpy import assert_that from bokeh import plotting from bokeh.models import Spacer from IPython.display import display from IPython.display import HTML from IPython.display import Markdown from metriculous._evaluation import Evaluation from metriculous._evaluation import Evaluator @dataclass(frozen=True) class Comparison: evaluations: List[Evaluation] def __post_init__(self): _check_consistency(self.evaluations) def display(self, include_spacer=False): _display_comparison_table(self.evaluations, include_spacer) # noinspection PyBroadException try: os.system('say "Model comparison is ready."') except Exception: pass class Comparator: """Can generate model comparisons after initialization with an Evaluator.""" def __init__(self, evaluator: Evaluator): self.evaluator = evaluator def compare( self, ground_truth: Any, model_predictions: Sequence[Any], model_names=None, sample_weights: Optional[Sequence[float]] = None, ) -> Comparison: """Generates a Comparison from a list of predictions and the ground truth. Args: model_predictions: List with one prediction object per model to be compared. ground_truth: A single ground truth object. model_names: Optional list of model names. If `None` generic names will be generated. sample_weights: Optional sequence of floats to modify the influence of individual samples on the statistics that will be measured. Returns: A Comparison object with one Evaluation per prediction. """ if model_names is None: model_names = [f"Model_{i}" for i in range(len(model_predictions))] else: assert_that(model_names).is_length(len(model_predictions)) model_evaluations = [ self.evaluator.evaluate( ground_truth, model_prediction=pred, model_name=model_name, sample_weights=sample_weights, ) for pred, model_name in zip(model_predictions, model_names) ] return Comparison(model_evaluations) def _get_and_supplement_model_names(model_evaluations: List[Evaluation]): return [ evaluation.model_name if evaluation.model_name is not None else f"model_{i_model}" for i_model, evaluation in enumerate(model_evaluations) ] def _model_evaluations_to_data_frame( model_evaluations: List[Evaluation] ) -> pd.DataFrame: quantity_names = [q.name for q in model_evaluations[0].quantities] # create one row per quantity data = [] for i_q, quantity_name in enumerate(quantity_names): row = [quantity_name] for evaluation in model_evaluations: quantity = evaluation.quantities[i_q] assert_that(quantity.name).is_equal_to(quantity_name) row.append(quantity.value) data.append(row) model_names = _get_and_supplement_model_names(model_evaluations) return pd.DataFrame(data, columns=["Quantity"] + model_names) def _check_consistency(model_evaluations: List[Evaluation]): if len(model_evaluations) == 0: return first = model_evaluations[0] for evaluation in model_evaluations: assert_that(evaluation.primary_metric).is_equal_to(first.primary_metric) assert_that(len(evaluation.quantities)).is_equal_to(len(first.quantities)) for q, q_first in zip(evaluation.quantities, first.quantities): # check that everything except the value is equal assert_that(q.name).is_equal_to(q_first.name) assert_that(q.higher_is_better).is_equal_to(q_first.higher_is_better) assert_that(q.description).is_equal_to(q_first.description) not_none_model_names = [ ms.model_name for ms in model_evaluations if ms.model_name is 
not None ] assert_that(not_none_model_names).does_not_contain_duplicates() good_color = "#b2ffb2" def _highlight_max(data): """Highlights the maximum in a Series or DataFrame. Checkout http://pandas.pydata.org/pandas-docs/stable/style.html for cool stuff. """ attr = "background-color: {}".format(good_color) if data.ndim == 1: # Series from .apply(axis=0) or axis=1 is_max = data == data.max() # noinspection PyTypeChecker return [attr if v else "" for v in is_max] else: # from .apply(axis=None) is_max = data == data.max().max() return pd.DataFrame( np.where(is_max, attr, ""), index=data.index, columns=data.columns ) def _highlight_min(data): """Highlights the minimum in a Series or DataFrame. Checkout http://pandas.pydata.org/pandas-docs/stable/style.html for cool stuff. """ attr = "background-color: {}".format(good_color) if data.ndim == 1: # Series from .apply(axis=0) or axis=1 is_min = data == data.min() # noinspection PyTypeChecker return [attr if v else "" for v in is_min] else: # from .apply(axis=None) is_min = data == data.min().min() return pd.DataFrame( np.where(is_min, attr, ""), index=data.index, columns=data.columns ) def _display_comparison_table( model_evaluations: List[Evaluation], include_spacer: bool ): _check_consistency(model_evaluations) primary_metric = model_evaluations[0].primary_metric n_models = len(model_evaluations) scores_data_frame = _model_evaluations_to_data_frame( [ evaluation.filtered(keep_higher_is_better=True) for evaluation in model_evaluations ] ) losses_data_frame = _model_evaluations_to_data_frame( [ evaluation.filtered(keep_lower_is_better=True) for evaluation in model_evaluations ] ) neutral_data_frame = _model_evaluations_to_data_frame( [ evaluation.filtered(keep_neutral_quantities=True) for evaluation in model_evaluations ] ) def is_primary_metric(a_metric: str): return a_metric.lower() == primary_metric def highlight_primary_metric(data): attr = "font-weight: bold; font-size: 120%;" if data.ndim == 1: metric = data[0].lower() if is_primary_metric(metric): return [attr for v in data] else: return ["" for v in data] else: # from .apply(axis=None) good_things = np.ones_like(data).astype(bool) return pd.DataFrame( np.where(good_things, "", ""), index=data.index, columns=data.columns ) def display_stylish_table(df: pd.DataFrame, highlight_fn=None): df_styled = df.style.set_properties(width="400px").format(_format_numbers) df_styled = df_styled.apply(highlight_primary_metric, axis=1) if highlight_fn is None: display(df_styled) else: display(df_styled.apply(highlight_fn, axis=1, subset=df.columns[1:])) # increase usable Jupyter notebook width when comparing many models if n_models > 3: # noinspection PyTypeChecker display(HTML("")) if len(scores_data_frame): # noinspection PyTypeChecker display(Markdown("## Scores (higher is better)")) display_stylish_table( scores_data_frame, _highlight_max if n_models > 1 else None ) if len(losses_data_frame): # noinspection PyTypeChecker display(Markdown("## Losses (lower is better)")) display_stylish_table( losses_data_frame, _highlight_min if n_models > 1 else None ) if len(neutral_data_frame): # noinspection PyTypeChecker display(Markdown("## Other Quantities")) display_stylish_table(neutral_data_frame) # hide DataFrame indices # noinspection PyTypeChecker display( HTML( """ """ ) ) # TODO check figure consistency # tell bokeh to output to notebook plotting.output_notebook() # show rows of figures for i_showable, _ in enumerate(model_evaluations[0].figures): row_of_figures = [ evaluation.figures[i_showable] for 
            i_model, evaluation in enumerate(model_evaluations)
        ]
        if include_spacer:
            row_of_figures = [Spacer()] + row_of_figures
        plotting.show(bokeh.layouts.row(row_of_figures, sizing_mode="scale_width"))


def _format_numbers(entry):
    try:
        flt = float(entry)
        return "{:.3f}".format(flt)
    except ValueError:
        return entry

----- metriculous/_evaluation.py -----

"""
This module contains data types and interfaces that are used throughout the library.
Here we do not make any assumptions about the structure of ground truth and predictions.
"""
from dataclasses import dataclass
from dataclasses import field
from dataclasses import replace
from typing import Any, Sequence
from typing import List
from typing import Optional
from typing import Union

from bokeh.plotting import Figure


@dataclass(frozen=True)
class Quantity:
    name: str
    value: Union[float, str]
    higher_is_better: Optional[bool] = None
    description: Optional[str] = None


@dataclass(frozen=True)
class Evaluation:
    model_name: str
    quantities: List[Quantity] = field(default_factory=list)
    figures: List[Figure] = field(default_factory=list)
    primary_metric: Optional[str] = None

    def get_by_name(self, quantity_name) -> Quantity:
        # Number of quantities is usually small,
        # so do not bother with internal dict for lookup
        for q in self.quantities:
            if quantity_name == q.name:
                return q
        raise ValueError(f"Could not find quantity named {quantity_name}")

    def get_primary(self) -> Optional[Quantity]:
        if self.primary_metric is None:
            return None
        return self.get_by_name(self.primary_metric)

    def filtered(
        self,
        keep_higher_is_better=False,
        keep_lower_is_better=False,
        keep_neutral_quantities=False,
    ):
        return replace(
            self,
            quantities=[
                q
                for q in self.quantities
                if any(
                    [
                        (q.higher_is_better is True and keep_higher_is_better),
                        (q.higher_is_better is False and keep_lower_is_better),
                        (q.higher_is_better is None and keep_neutral_quantities),
                    ]
                )
            ],
        )


class Evaluator:
    """
    Interface to be implemented by the user to compute quantities and charts
    that are relevant and applicable to the problem at hand.
    """

    def evaluate(
        self,
        ground_truth: Any,
        model_prediction: Any,
        model_name: str,
        sample_weights: Optional[Sequence[float]] = None,
    ) -> Evaluation:
        """Generates an Evaluation from ground truth and a model prediction."""
        raise NotImplementedError

----- metriculous/_evaluation_test.py -----

import pytest

from metriculous import Quantity


@pytest.mark.parametrize("name", ["", "accuracy", "What Ever"])
@pytest.mark.parametrize("value", [-0.5, 0.0, 1e15])
@pytest.mark.parametrize("higher_is_better", [True, False])
@pytest.mark.parametrize("description", [None, "", "Quantifies the whateverness"])
def test_quantity(name, value, higher_is_better, description):
    quantity = Quantity(name, value, higher_is_better, description)
    quantity_ = Quantity(name, value, higher_is_better, description)
    assert quantity == quantity_

----- metriculous/evaluators/__init__.py -----

"""
This module provides various default Evaluator implementations that are useful
for the most common machine learning problems, such as classification and regression.
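# The Evaluator interface defined in metriculous/_evaluation.py above is the
# library's extension point. The following is a minimal sketch (not part of the
# package) of a user-defined evaluator for a regression setting; the class name,
# the choice of metric, and the absence of figures are illustrative assumptions.
import numpy as np

from metriculous import Evaluation, Evaluator, Quantity


class MeanAbsoluteErrorEvaluator(Evaluator):
    """Toy regression evaluator producing a single Quantity and no figures."""

    def evaluate(self, ground_truth, model_prediction, model_name, sample_weights=None):
        errors = np.abs(np.asarray(ground_truth) - np.asarray(model_prediction))
        mae = float(np.average(errors, weights=sample_weights))
        return Evaluation(
            model_name=model_name,
            quantities=[Quantity("MAE", mae, higher_is_better=False)],
            figures=[],
            primary_metric="MAE",
        )

# Such an evaluator can then be plugged into a Comparator, e.g.
# Comparator(evaluator=MeanAbsoluteErrorEvaluator()).compare(y_true, [pred_a, pred_b])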
""" from metriculous.evaluators._classification_evaluator import ClassificationEvaluator from metriculous.evaluators._segmentation_evaluator import SegmentationEvaluator __all__ = ["ClassificationEvaluator", "SegmentationEvaluator"] PK!y[R[R3metriculous/evaluators/_classification_evaluator.pyfrom typing import Callable from typing import Optional from typing import Sequence import numpy as np from assertpy import assert_that from scipy.stats import entropy from sklearn import metrics as sklmetrics from ._classification_utils import check_normalization from .._evaluation import Evaluation from .._evaluation import Evaluator from .._evaluation import Quantity from ..evaluators._classification_figures_bokeh import _bokeh_automation_rate_analysis from ..evaluators._classification_figures_bokeh import _bokeh_confusion_matrix from ..evaluators._classification_figures_bokeh import _bokeh_confusion_scatter from ..evaluators._classification_figures_bokeh import _bokeh_output_histogram from ..evaluators._classification_figures_bokeh import _bokeh_precision_recall_curve from ..evaluators._classification_figures_bokeh import _bokeh_roc_curve from ..metrics import top_n_accuracy from ..utilities import sample_weights_simulating_class_distribution class ClassificationEvaluator(Evaluator): """ Default Evaluator implementation that serves well for most classification problems. """ def __init__( self, class_names: Optional[Sequence[str]] = None, one_vs_all_quantities=True, one_vs_all_figures=False, top_n_accuracies: Sequence[int] = (), filter_quantities: Optional[Callable[[str], bool]] = None, filter_figures: Optional[Callable[[str], bool]] = None, primary_metric: Optional[str] = None, simulated_class_distribution: Optional[Sequence[float]] = None, class_label_rotation_x="horizontal", class_label_rotation_y="vertical", ): """ Initializes the evaluator with the option to overwrite the default settings. Args: class_names: Optional, names of the classes. one_vs_all_quantities: If `True` show quantities like "ROC AUC Class_i vs Rest" for all i. one_vs_all_figures: If `True` show figures like "ROC Curve Class_i vs Rest" for all i. top_n_accuracies: A sequence of positive integers to specify which top-N accuracy metrics should be computed. Example: `top_n_accuracies=[2, 3, 5, 10]` filter_quantities: Callable that receives a quantity name and returns `False` if the quantity should be excluded. Example: `filter_quantities=lambda name: "vs Rest" not in name` filter_figures: Callable that receives a figure title and returns `False` if the figure should be excluded. Example: `filter_figures=lambda name: "ROC" in name` primary_metric: Optional string to specify the most important metric that should be used for model selection. simulated_class_distribution: Optional sequence of floats that indicates a hypothetical class distribution on which models should be evaluated. If not `None`, sample weights will be computed and used to simulate the desired class distribution. class_label_rotation_x: Rotation of x-axis tick labels for figures with class name tick labels. class_label_rotation_y: Rotation of y-axis tick labels for figures with class name tick labels. 
""" self.class_names = class_names self.one_vs_all_quantities = one_vs_all_quantities self.one_vs_all_figures = one_vs_all_figures self.top_n_accuracies = top_n_accuracies assert all(isinstance(val, int) for val in self.top_n_accuracies) assert all(val >= 1 for val in self.top_n_accuracies) self.filter_quantities = ( (lambda name: True) if filter_quantities is None else filter_quantities ) self.filter_figures = ( (lambda name: True) if filter_figures is None else filter_figures ) self.primary_metric = primary_metric if simulated_class_distribution is not None: check_normalization(simulated_class_distribution, axis=0) np.testing.assert_equal( np.asarray(simulated_class_distribution) > 0.0, True ) self.simulated_class_distribution = simulated_class_distribution self.class_label_rotation_x = class_label_rotation_x self.class_label_rotation_y = class_label_rotation_y def evaluate( self, ground_truth: np.ndarray, model_prediction: np.ndarray, model_name: str, sample_weights: Optional[Sequence[float]] = None, ) -> Evaluation: """ Computes Quantities and generates Figures that are useful for most classification problems. Args: model_prediction: Sequence of 2d arrays where each array corresponds to a model and each row is a probability distribution. ground_truth: 2d array with each row being a probability distribution. model_name: Name of the model that is being evaluated. sample_weights: Sequence of floats to modify the influence of individual samples on the statistics that will be measured. Returns: An Evaluation object containing Quantities and Figures that are useful for most classification problems. """ # === Preparations ============================================================= # give variables more specific names y_pred_proba = model_prediction y_true_proba = ground_truth # and delete interface parameter names to avoid confusion del model_prediction del ground_truth n_classes = y_true_proba.shape[1] if self.class_names is None: self.class_names = ["class_{}".format(i) for i in range(n_classes)] assert len(self.class_names) == y_true_proba.shape[1] # check shapes assert np.ndim(y_true_proba) == 2 assert ( y_true_proba.shape == y_pred_proba.shape ), f"{y_true_proba.shape} != {y_pred_proba.shape}" # check normalization check_normalization(y_true_proba, axis=1) check_normalization(y_pred_proba, axis=1) np.testing.assert_equal(y_true_proba >= 0.0, True) np.testing.assert_equal(y_pred_proba >= 0.0, True) # compute non-probabilistic class decisions y_true = np.argmax(y_true_proba, axis=1) y_pred = np.argmax(y_pred_proba, axis=1) # make one-hot arrays, which are required for some sklearn metrics y_true_one_hot: np.ndarray = np.eye(n_classes)[y_true] y_pred_one_hot: np.ndarray = np.eye(n_classes)[y_pred] # process sample_weights parameter and self.simulated_class_distribution if sample_weights is not None: assert self.simulated_class_distribution is None, ( "Cannot use `sample_weights` with ClassificationEvaluator that was " "initialized with `simulated_class_distribution`." 
) sample_weights = np.asarray(sample_weights) assert_that(sample_weights.ndim).is_equal_to(1) assert_that(sample_weights.shape).is_equal_to((len(y_pred),)) np.testing.assert_array_equal(sample_weights >= 0.0, True) elif self.simulated_class_distribution is not None: assert_that(np.shape(self.simulated_class_distribution)).is_equal_to( (n_classes,) ) sample_weights = sample_weights_simulating_class_distribution( y_true=y_true, hypothetical_class_distribution=self.simulated_class_distribution, ) # === Quantities =============================================================== # Note: Optimization potential here for problems with many classes. # We are currently computing all quantities and then throwing away some of them, # rather than only computing those that are requested by self.filter_quantities quantities = [ q for q in self._quantities( y_pred, y_pred_one_hot, y_pred_proba, y_true, y_true_one_hot, y_true_proba, maybe_sample_weights=sample_weights, ) if self.filter_quantities(q.name) ] # === Figures ================================================================== figures = self._figures( model_name, y_pred=y_pred, y_pred_one_hot=y_pred_one_hot, y_pred_proba=y_pred_proba, y_true=y_true, y_true_one_hot=y_true_one_hot, y_true_proba=y_true_proba, maybe_sample_weights=sample_weights, ) return Evaluation( quantities=quantities, figures=figures, model_name=model_name, primary_metric=self.primary_metric, ) def _figures( self, model_name: str, y_pred: np.ndarray, y_pred_one_hot: np.ndarray, y_pred_proba: np.ndarray, y_true: np.ndarray, y_true_one_hot: np.ndarray, y_true_proba: np.ndarray, maybe_sample_weights: Optional[np.ndarray], ): figures = [] # --- Histogram of predicted and ground truth classes --- if maybe_sample_weights is None: figure_name = "Class Distribution" if self.filter_figures(figure_name): figures.append( _bokeh_output_histogram( y_true=y_true, y_pred=y_pred, class_names=self.class_names, title_rows=[model_name, figure_name], sample_weights=None, x_label_rotation=self.class_label_rotation_x, ) ) else: figure_name = "Unweighted Class Distribution" if self.filter_figures(figure_name): figures.append( _bokeh_output_histogram( y_true=y_true, y_pred=y_pred, class_names=self.class_names, title_rows=[model_name, figure_name], sample_weights=None, x_label_rotation=self.class_label_rotation_x, ) ) figure_name = "Weighted Class Distribution" if self.filter_figures(figure_name): figures.append( _bokeh_output_histogram( y_true=y_true, y_pred=y_pred, class_names=self.class_names, title_rows=[model_name, figure_name], sample_weights=maybe_sample_weights, x_label_rotation=self.class_label_rotation_x, ) ) # --- Confusion Scatter Plot --- figure_name = "Confusion Scatter Plot" if maybe_sample_weights is None and self.filter_figures(figure_name): figures.append( _bokeh_confusion_scatter( y_true=y_true, y_pred=y_pred, class_names=self.class_names, title_rows=[model_name, figure_name], x_label_rotation=self.class_label_rotation_x, y_label_rotation=self.class_label_rotation_y, ) ) # --- Confusion Matrix --- figure_name = "Confusion Matrix" if maybe_sample_weights is None and self.filter_figures(figure_name): figures.append( _bokeh_confusion_matrix( y_true=y_true, y_pred=y_pred, class_names=self.class_names, title_rows=[model_name, figure_name], x_label_rotation=self.class_label_rotation_x, y_label_rotation=self.class_label_rotation_y, ) ) # --- Automation Rate Analysis --- figure_name = "Automation Rate Analysis" if self.filter_figures(figure_name): figures.append( 
_bokeh_automation_rate_analysis( y_target_one_hot=y_true_one_hot, y_pred_proba=y_pred_proba, title_rows=[model_name, figure_name], sample_weights=maybe_sample_weights, ) ) # --- ROC curves --- if self.one_vs_all_figures: for class_index, class_name in enumerate(self.class_names): figure_name = f"ROC {class_name} vs Rest" if self.filter_figures(figure_name): figures.append( _bokeh_roc_curve( y_true_binary=(y_true == class_index), y_pred_score=y_pred_proba[:, class_index], title_rows=[model_name, figure_name], sample_weights=maybe_sample_weights, ) ) # --- PR curves --- if self.one_vs_all_figures: for class_index, class_name in enumerate(self.class_names): figure_name = f"PR Curve {class_name} vs Rest" if self.filter_figures(figure_name): figures.append( _bokeh_precision_recall_curve( y_true_binary=(y_true == class_index), y_pred_score=y_pred_proba[:, class_index], title_rows=[model_name, figure_name], sample_weights=maybe_sample_weights, ) ) return figures def _quantities( self, y_pred: np.ndarray, y_pred_one_hot: np.ndarray, y_pred_proba: np.ndarray, y_true: np.ndarray, y_true_one_hot: np.ndarray, y_true_proba: np.ndarray, maybe_sample_weights: Optional[np.ndarray], ): quantities = [] quantities.append( Quantity( "Accuracy", sklmetrics.accuracy_score( y_true, y_pred, sample_weight=maybe_sample_weights ), higher_is_better=True, ) ) quantities.append( Quantity( "ROC AUC Macro Average", sklmetrics.roc_auc_score( y_true_one_hot, y_pred_proba, average="macro", sample_weight=maybe_sample_weights, ), higher_is_better=True, ) ) quantities.append( Quantity( "ROC AUC Micro Average", sklmetrics.roc_auc_score( y_true_one_hot, y_pred_proba, average="micro", sample_weight=maybe_sample_weights, ), higher_is_better=True, ) ) quantities.append( Quantity( "F1-Score Macro Average", sklmetrics.f1_score( y_true_one_hot, y_pred_one_hot, average="macro", sample_weight=maybe_sample_weights, ), higher_is_better=True, ) ) quantities.append( Quantity( "F1-Score Micro Average", sklmetrics.f1_score( y_true_one_hot, y_pred_one_hot, average="micro", sample_weight=maybe_sample_weights, ), higher_is_better=True, ) ) # --- Top-N accuracies --- for n in self.top_n_accuracies: quantities.append( Quantity( f"Top-{n} Accuracy", value=top_n_accuracy( y_true, y_pred_proba, n=n, sample_weights=maybe_sample_weights ), higher_is_better=True, ) ) # --- One-vs-rest ROC AUC scores --- if self.one_vs_all_quantities: # noinspection PyTypeChecker roc_auc_scores: Sequence[float] = sklmetrics.roc_auc_score( y_true_one_hot, y_pred_proba, average=None, sample_weight=maybe_sample_weights, ) for class_index, class_name in enumerate(self.class_names): quantities.append( Quantity( f"ROC AUC {class_name} vs Rest", value=roc_auc_scores[class_index], higher_is_better=True, ) ) # --- One-vs-rest average precision scores --- if self.one_vs_all_quantities: # noinspection PyTypeChecker ap_scores: Sequence[float] = sklmetrics.average_precision_score( y_true_one_hot, y_pred_proba, average=None, sample_weight=maybe_sample_weights, ) for class_index, class_name in enumerate(self.class_names): quantities.append( Quantity( f"Average Precision {class_name} vs Rest", value=ap_scores[class_index], higher_is_better=True, ) ) # --- One-vs-rest F1-scores --- if self.one_vs_all_quantities: f1_scores = sklmetrics.f1_score( y_true_one_hot, y_pred_one_hot, average=None, sample_weight=maybe_sample_weights, ) for class_index, class_name in enumerate(self.class_names): quantities.append( Quantity( f"F1-Score {class_name} vs Rest", value=f1_scores[class_index], 
higher_is_better=True, ) ) # --- KL-divergence --- # keep in mind entropy(p, q) != entropy(q, p) kl_divergences = np.array( [ entropy(pk=true_dist, qk=pred_dist) for true_dist, pred_dist in zip(y_true_proba, y_pred_proba) ] ) quantities.append( Quantity( "Mean KLD(P=target||Q=prediction)", np.average(kl_divergences, weights=maybe_sample_weights), higher_is_better=False, ) ) # --- Log loss --- quantities.append( Quantity( "Log Loss", sklmetrics.log_loss( y_true_one_hot, y_pred_one_hot, sample_weight=maybe_sample_weights ), higher_is_better=False, ) ) # --- Brier score loss --- # Be careful with sklmetrics.brier_score_loss, it deviates from Brier's # definition for multi-class problems. # See https://stats.stackexchange.com/questions # /403544/how-to-compute-the-brier-score-for-more-than-two-classes # and Wikipedia # noinspection PyTypeChecker quantities.append( Quantity( "Brier Score Loss", np.mean((y_pred_proba - y_true_one_hot) ** 2), higher_is_better=False, ) ) # noinspection PyTypeChecker quantities.append( Quantity( "Brier Score Loss (Soft Targets)", np.mean((y_pred_proba - y_true_proba) ** 2), higher_is_better=False, ) ) # --- entropy of prediction probability distributions --- entropies_pred = np.array([entropy(proba_dist) for proba_dist in y_pred_proba]) quantities.append(Quantity("Max Entropy", entropies_pred.max())) quantities.append( Quantity( "Mean Entropy", np.average(entropies_pred, weights=maybe_sample_weights) ) ) quantities.append(Quantity("Min Entropy", entropies_pred.min())) quantities.append(Quantity("Max Probability", y_pred_proba.max())) quantities.append(Quantity("Min Probability", y_pred_proba.min())) return quantities PK!1~//8metriculous/evaluators/_classification_evaluator_test.pyfrom dataclasses import replace from typing import Optional, List import numpy as np import pytest from .._evaluation import Evaluation from .._evaluation import Quantity from ..evaluators import ClassificationEvaluator from ..test_resources import noisy_prediction def random_targets_one_hot(num_classes: int, num_samples: int) -> np.ndarray: target_class_indices = np.random.randint(0, high=num_classes, size=num_samples) return np.eye(num_classes)[target_class_indices] @pytest.mark.parametrize("noise_factor, num_samples", [(0.1, 100), (10.0, 200)]) @pytest.mark.parametrize( "classes, simulated_class_distribution", [ (None, None), (["Cat", "Dog", "Lion"], [0.01, 0.02, 0.97]), (["Cat", "Dog", "Lion"], None), (["Spam", "Ham"], [0.2, 0.8]), ], ) @pytest.mark.parametrize( argnames=( "one_vs_all_quantities," "one_vs_all_figures," "top_n_accuracies," "filter_quantities," "primary_metric," ), argvalues=zip( [None, False, True], [True, None, False], [(4,), [], [2, 3, 42]], [None, lambda name: "a" in name, lambda name: False], ["Accuracy", None, None], ), ) @pytest.mark.parametrize("use_sample_weights", [False, True]) def test_ClassificationEvaluator( noise_factor: float, simulated_class_distribution: bool, num_samples: int, classes: Optional[List[str]], one_vs_all_quantities: Optional[bool], one_vs_all_figures: Optional[bool], top_n_accuracies: Optional[List[int]], filter_quantities: callable, primary_metric: Optional[str], use_sample_weights: bool, ): """Basic smoke test making sure we don't crash with valid input.""" np.random.seed(42) targets_one_hot = random_targets_one_hot( num_classes=len(classes) if classes is not None else 3, num_samples=num_samples ) prediction = noisy_prediction(targets_one_hot, noise_factor=noise_factor) ce = ClassificationEvaluator( class_names=classes, 
one_vs_all_quantities=one_vs_all_quantities, one_vs_all_figures=one_vs_all_figures, top_n_accuracies=top_n_accuracies, filter_quantities=filter_quantities, primary_metric=primary_metric, simulated_class_distribution=( None if use_sample_weights else simulated_class_distribution ), ) evaluation = ce.evaluate( ground_truth=targets_one_hot, model_prediction=prediction, model_name="MockModel", sample_weights=( 42.0 * np.random.random(size=num_samples) if use_sample_weights is True else None ), ) assert isinstance(evaluation, Evaluation) assert evaluation.model_name == "MockModel" @pytest.mark.parametrize("num_samples", [100, 200, 999]) @pytest.mark.parametrize( "use_sample_weights, simulated_class_distribution", [(False, None), (False, [0.3, 0.5, 0.2]), (True, None)], ) def test_ClassificationEvaluator_perfect_prediction( num_samples, use_sample_weights: bool, simulated_class_distribution: List[float] ): np.random.seed(42) targets_one_hot = random_targets_one_hot(num_classes=3, num_samples=num_samples) prediction = noisy_prediction(targets_one_hot, noise_factor=0.0) ce = ClassificationEvaluator( simulated_class_distribution=simulated_class_distribution ) evaluation = ce.evaluate( ground_truth=targets_one_hot, model_prediction=prediction, model_name="MockModel", sample_weights=( 42.0 * np.random.random(size=num_samples) if use_sample_weights is True else None ), ) assert isinstance(evaluation, Evaluation) assert evaluation.model_name == "MockModel" expected_quantities = [ Quantity(name="Accuracy", value=1.0, higher_is_better=True, description=None), Quantity( name="ROC AUC Macro Average", value=1.0, higher_is_better=True, description=None, ), Quantity( name="ROC AUC Micro Average", value=1.0, higher_is_better=True, description=None, ), Quantity( name="F1-Score Macro Average", value=1.0, higher_is_better=True, description=None, ), Quantity( name="F1-Score Micro Average", value=1.0, higher_is_better=True, description=None, ), Quantity( name="ROC AUC class_0 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="ROC AUC class_1 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="ROC AUC class_2 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="Average Precision class_0 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="Average Precision class_1 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="Average Precision class_2 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="F1-Score class_0 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="F1-Score class_1 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="F1-Score class_2 vs Rest", value=1.0, higher_is_better=True, description=None, ), Quantity( name="Mean KLD(P=target||Q=prediction)", value=0.0, higher_is_better=False, description=None, ), Quantity( name="Log Loss", value=2.1094237467877998e-15, higher_is_better=False, description=None, ), Quantity( name="Brier Score Loss", value=0.0, higher_is_better=False, description=None ), Quantity( name="Brier Score Loss (Soft Targets)", value=0.0, higher_is_better=False, description=None, ), Quantity( name="Max Entropy", value=0.0, higher_is_better=None, description=None ), Quantity( name="Mean Entropy", value=0.0, higher_is_better=None, description=None ), Quantity( name="Min Entropy", value=0.0, higher_is_better=None, description=None ), Quantity( name="Max Probability", 
value=1.0, higher_is_better=None, description=None ), Quantity( name="Min Probability", value=0.0, higher_is_better=None, description=None ), ] assert len(evaluation.quantities) == len(expected_quantities) for actual, expected in zip(evaluation.quantities, expected_quantities): # check that everything except value is equal assert replace(actual, value=42) == replace(expected, value=42) # check that values are approximately equal if isinstance(expected.value, str): assert isinstance(actual, str) assert actual.value == expected.value else: assert isinstance(expected.value, float) assert isinstance(actual.value, float) np.testing.assert_allclose(actual.value, expected.value) @pytest.mark.parametrize("num_samples", [100, 200]) @pytest.mark.parametrize( "quantity_filter", [ lambda name: False, lambda name: True, lambda name: "F1" not in name, lambda name: "vs Rest" not in name, ], ) def test_ClassificationEvaluator_filter_quantities(num_samples, quantity_filter): np.random.seed(42) targets_one_hot = random_targets_one_hot(num_classes=3, num_samples=num_samples) prediction = noisy_prediction(targets_one_hot, noise_factor=0.0) ce_all = ClassificationEvaluator() ce_filtering = ClassificationEvaluator(filter_quantities=quantity_filter) evaluation_all = ce_all.evaluate( ground_truth=targets_one_hot, model_prediction=prediction, model_name="MockModel", ) evaluation_filtered = ce_filtering.evaluate( ground_truth=targets_one_hot, model_prediction=prediction, model_name="MockModel", ) # assert all equal except quantities # (ignore figures as they do not support equality in the way we need it) assert replace(evaluation_all, quantities=[], figures=[]) == replace( evaluation_filtered, quantities=[], figures=[] ) for quantity in evaluation_all.quantities: if quantity_filter(quantity.name): same_quantity = evaluation_filtered.get_by_name(quantity.name) assert same_quantity == quantity else: with pytest.raises(ValueError): evaluation_filtered.get_by_name(quantity.name) for filtered_quantity in evaluation_filtered.quantities: same_quantity = evaluation_all.get_by_name(filtered_quantity.name) assert same_quantity == filtered_quantity @pytest.mark.parametrize("num_samples", [100, 200]) @pytest.mark.parametrize( "desired_number_of_figures, figure_filter", [ (0, lambda name: False), (10, None), (10, lambda name: True), (9, lambda name: "Distribution" not in name), (4, lambda name: "vs Rest" not in name), ], ) def test_ClassificationEvaluator_filter_figures( num_samples: int, desired_number_of_figures: int, figure_filter: callable ): np.random.seed(42) targets_one_hot = random_targets_one_hot(num_classes=3, num_samples=num_samples) prediction = noisy_prediction(targets_one_hot, noise_factor=0.0) ce_all = ClassificationEvaluator(one_vs_all_figures=True) ce_filtering = ClassificationEvaluator( one_vs_all_figures=True, filter_figures=figure_filter ) evaluation_all = ce_all.evaluate( ground_truth=targets_one_hot, model_prediction=prediction, model_name="MockModel", ) evaluation_filtered = ce_filtering.evaluate( ground_truth=targets_one_hot, model_prediction=prediction, model_name="MockModel", ) # assert all equal except figures assert replace(evaluation_all, figures=[]) == replace( evaluation_filtered, figures=[] ) # check number of figures assert len(evaluation_filtered.figures) == desired_number_of_figures @pytest.mark.parametrize("num_samples", [100, 200]) def test_ClassificationEvaluator_exception_when_passing_distribution_and_weights( num_samples: int ): """ Checks that an exception is raised when `sample_weights` 
are passed to an evaluator that has been initialized with `simulated_class_distribution`. """ np.random.seed(42) targets_one_hot = random_targets_one_hot(num_classes=3, num_samples=num_samples) prediction = noisy_prediction(targets_one_hot, noise_factor=0.0) ce = ClassificationEvaluator( one_vs_all_figures=True, simulated_class_distribution=[0.3, 0.1, 0.6] ) _ = ce.evaluate( ground_truth=targets_one_hot, model_prediction=prediction, model_name="MockModel", ) with pytest.raises(AssertionError) as exception_info: _ = ce.evaluate( ground_truth=targets_one_hot, model_prediction=prediction, model_name="MockModel", sample_weights=np.random.random(size=len(targets_one_hot)), ) assert str(exception_info.value) == ( "Cannot use `sample_weights` with ClassificationEvaluator that" " was initialized with `simulated_class_distribution`." ) PK!W'EE7metriculous/evaluators/_classification_figures_bokeh.pyfrom typing import Sequence, Optional import numpy as np from assertpy import assert_that from bokeh import plotting from bokeh.models import ( ColumnDataSource, LinearColorMapper, ColorBar, BasicTicker, PrintfTickFormatter, ) from bokeh.models import HoverTool from bokeh.models import Title from bokeh.plotting import Figure from sklearn import metrics as sklmetrics from sklearn.metrics import accuracy_score from metriculous.evaluators._classification_utils import check_normalization TOOLS = "pan,box_zoom,reset" TOOLBAR_LOCATION = "right" FONT_SIZE = "8pt" def _bokeh_output_histogram( y_true: np.ndarray, y_pred: np.ndarray, class_names: Sequence[str], title_rows: Sequence[str], sample_weights: Optional[np.ndarray] = None, x_label_rotation="horizontal", ) -> Figure: """ Creates a scatter plot that contains the same information as a confusion matrix. Args: y_true: 1d integer array indicating the reference labels. y_pred: 1d integer array indicating the predictions. class_names: Sequence of strings corresponding to the classes. title_rows: Sequence of strings to be used for the chart title. sample_weights: Sequence of floats to modify the influence of individual samples. x_label_rotation: Rotation of the class name labels. 
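# Standalone illustration (not library code) of the binning used in the body
# below: np.histogram over integer class labels yields the per-class bar heights,
# and density=True turns weighted counts into fractions. The arrays are made up.
import numpy as np

y_pred = np.array([0, 0, 1, 2, 2, 2])
bins = np.arange(0, 3 + 1)                   # one unit-width bin per class
counts, _ = np.histogram(y_pred, bins=bins)  # -> array([2, 1, 3])

weights = np.array([0.5, 0.5, 1.0, 1.0, 1.0, 1.0])
fractions, _ = np.histogram(y_pred, bins=bins, weights=weights, density=True)
# fractions -> array([0.2, 0.2, 0.6]); with unit-width bins these sum to 1,
# matching the "Fraction of Instances" axis label used when weights are present.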
Returns: A bokeh figure """ n = len(class_names) assert_that(np.shape(y_true)).is_equal_to(np.shape(y_pred)) if sample_weights is None: sample_weights = np.ones_like(y_true) assert_that(np.shape(y_true)).is_equal_to(np.shape(sample_weights)) p = plotting.figure( x_range=class_names, plot_height=350, plot_width=350, tools=TOOLS, toolbar_location=TOOLBAR_LOCATION, ) bins = np.arange(0, n + 1, 1) normalize = not np.allclose(sample_weights, 1.0) # class distribution in prediction p.vbar( x=class_names, top=np.histogram(y_pred, bins=bins, weights=sample_weights, density=normalize)[ 0 ], width=0.85, alpha=0.6, legend="Prediction", ) # class distribution in ground truth p.vbar( x=class_names, top=np.histogram(y_true, bins=bins, weights=sample_weights, density=normalize)[ 0 ], width=0.85, alpha=0.6, legend="Ground Truth", fill_color=None, line_color="black", line_width=2.5, ) _add_title_rows(p, title_rows) _apply_default_style(p) p.yaxis.axis_label = "Fraction of Instances" if normalize else "Number of Instances" p.xaxis.major_label_orientation = x_label_rotation p.xgrid.grid_line_color = None # prevent panning to empty regions p.x_range.bounds = (-0.5, 0.5 + len(class_names)) return p def _bokeh_confusion_matrix( y_true: np.ndarray, y_pred: np.ndarray, class_names: Sequence[str], title_rows: Sequence[str], x_label_rotation="horizontal", y_label_rotation="vertical", ) -> Figure: """ Creates a confusion matrix heatmap. Args: y_true: 1d integer array indicating the reference labels. y_pred: 1d integer array indicating the predictions. class_names: Sequence of strings corresponding to the classes. title_rows: Sequence of strings to be used for the chart title. x_label_rotation: Rotation of the x-axis class name labels. y_label_rotation: Rotation of the y-axis class name labels. 
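# Standalone illustration (toy labels, not library code) of the three
# confusion-matrix normalizations computed in the body below.
import numpy as np
from sklearn import metrics as sklmetrics

y_true = np.array([0, 0, 1, 1, 1, 2])
y_pred = np.array([0, 1, 1, 1, 2, 2])
cm = sklmetrics.confusion_matrix(y_true, y_pred)  # rows: ground truth, columns: prediction
by_total = cm / cm.sum()                          # cell count / all samples
by_pred = cm / cm.sum(axis=0, keepdims=True)      # column-normalized (precision-like view)
by_true = cm / cm.sum(axis=1, keepdims=True)      # row-normalized (recall-like view)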
Returns: A bokeh figure """ cm = sklmetrics.confusion_matrix(y_true, y_pred) cm_normalized = cm.astype("float") / cm.sum() cm_normalized_by_pred = cm.astype("float") / cm.sum(axis=0, keepdims=True) cm_normalized_by_true = cm.astype("float") / cm.sum(axis=1, keepdims=True) predicted = list() actual = list() count = list() normalized = list() normalized_by_pred = list() normalized_by_true = list() for i, i_class in enumerate(class_names): for j, j_class in enumerate(class_names): predicted.append(j_class) actual.append(i_class) count.append(cm[i, j]) normalized.append(cm_normalized[i, j]) normalized_by_pred.append(cm_normalized_by_pred[i, j]) normalized_by_true.append(cm_normalized_by_true[i, j]) source = ColumnDataSource( data={ "predicted": predicted, "actual": actual, "count": count, "normalized": normalized, "normalized_by_true": normalized_by_true, "normalized_by_pred": normalized_by_pred, } ) p = plotting.figure(tools=TOOLS, x_range=class_names, y_range=class_names) mapper = LinearColorMapper(palette="Viridis256", low=0.0, high=1.0) p.rect( x="actual", y="predicted", width=0.95, height=0.95, source=source, fill_color={"field": "normalized_by_true", "transform": mapper}, line_width=0, line_color="black", ) p.xaxis.axis_label = "Ground Truth" p.yaxis.axis_label = "Prediction" p.xaxis.major_label_orientation = x_label_rotation p.yaxis.major_label_orientation = y_label_rotation p.add_tools( HoverTool( tooltips=[ ("Predicted", "@predicted"), ("Ground truth", "@actual"), ("Count", "@count"), ("Normalized", "@normalized"), ("Normalized by prediction", "@normalized_by_pred"), ("Normalize by ground truth", "@normalized_by_true"), ] ) ) color_bar = ColorBar( color_mapper=mapper, major_label_text_font_size=FONT_SIZE, ticker=BasicTicker(desired_num_ticks=10), formatter=PrintfTickFormatter(format="%.1f"), label_standoff=5, border_line_color=None, location=(0, 0), ) p.add_layout(color_bar, "right") _add_title_rows(p, title_rows) _apply_default_style(p) return p def _bokeh_confusion_scatter( y_true: np.ndarray, y_pred: np.ndarray, class_names: Sequence[str], title_rows: Sequence[str], x_label_rotation="horizontal", y_label_rotation="vertical", ) -> Figure: """ Creates a scatter plot that contains the same information as a confusion matrix. Args: y_true: 1d integer array indicating the reference labels. y_pred: 1d integer array indicating the predictions. class_names: Sequence of strings corresponding to the classes. title_rows: Sequence of strings to be used for the chart title. x_label_rotation: Rotation of the x-axis class name labels. y_label_rotation: Rotation of the y-axis class name labels. 
Returns: A bokeh figure """ if len(y_true) != len(y_pred): raise ValueError("y_true and y_pred must have the same length!") p = plotting.figure( x_range=(-0.5, -0.5 + len(class_names)), y_range=(-0.5, -0.5 + len(class_names)), plot_height=350, plot_width=350, tools=TOOLS, toolbar_location=TOOLBAR_LOCATION, match_aspect=True, ) def noise(): return (np.random.beta(1, 1, size=len(y_true)) - 0.5) * 0.6 p.scatter(x=y_true + noise(), y=y_pred + noise(), alpha=0.6) _add_title_rows(p, title_rows) _apply_default_style(p) p.xaxis.axis_label = "Ground Truth" p.yaxis.axis_label = "Prediction" arange = np.arange(len(class_names)) p.xaxis.ticker = arange p.yaxis.ticker = arange p.xaxis.major_label_overrides = {i: name for i, name in enumerate(class_names)} p.yaxis.major_label_overrides = {i: name for i, name in enumerate(class_names)} p.xaxis.major_label_orientation = x_label_rotation p.yaxis.major_label_orientation = y_label_rotation # grid between classes, not at classes p.xgrid.ticker = arange[0:-1] + 0.5 p.ygrid.ticker = arange[0:-1] + 0.5 p.xgrid.grid_line_width = 3 p.ygrid.grid_line_width = 3 # prevent panning to empty regions p.x_range.bounds = (-0.5, -0.5 + len(class_names)) p.y_range.bounds = (-0.5, -0.5 + len(class_names)) return p def _bokeh_roc_curve( y_true_binary: np.ndarray, y_pred_score: np.ndarray, title_rows: Sequence[str], sample_weights: Optional[np.ndarray], ) -> Figure: """Plots an interactive receiver operator characteristic (ROC) curve. Args: y_true_binary: An array of zeros and ones. y_pred_score: A continuous value, such as a probability estimate for the positive class. title_rows: Sequence of strings to be used for the chart title. sample_weights: Sequence of floats to modify the influence of individual samples. Returns: A bokeh figure """ assert y_true_binary.shape == y_pred_score.shape assert set(y_true_binary).issubset({0, 1}) or set(y_true_binary).issubset( {False, True} ) assert np.ndim(y_true_binary) == 1 fpr, tpr, thresholds = sklmetrics.roc_curve( y_true=y_true_binary, y_score=y_pred_score, sample_weight=sample_weights ) source = ColumnDataSource( data={"FPR": fpr, "TPR": tpr, "threshold": thresholds, "specificity": 1.0 - fpr} ) p = plotting.figure( plot_height=400, plot_width=350, tools=TOOLS, toolbar_location=TOOLBAR_LOCATION, # toolbar_location=None, # hides entire toolbar match_aspect=True, ) p.background_fill_color = "#f5f5f5" p.grid.grid_line_color = "white" p.xaxis.axis_label = "FPR" p.yaxis.axis_label = "TPR" _add_title_rows(p, title_rows) _apply_default_style(p) curve = p.line(x="FPR", y="TPR", line_width=2, color="#326496", source=source) p.line( x=[0.0, 1.0], y=[0.0, 1.0], line_alpha=0.75, color="grey", line_dash="dotted" ) p.add_tools( HoverTool( # make sure there is no tool tip for the diagonal baseline renderers=[curve], tooltips=[ ("TPR", "@TPR"), ("FPR", "@FPR"), ("Sensitivity", "@TPR"), ("Specificity", "@specificity"), ("Threshold", "@threshold"), ], # display a tooltip whenever the cursor is vertically in line with a glyph mode="vline", ) ) return p def _bokeh_precision_recall_curve( y_true_binary: np.ndarray, y_pred_score: np.ndarray, title_rows: Sequence[str], sample_weights=Optional[np.ndarray], ) -> Figure: """ Plots an interactive precision recall curve. Args: y_true_binary: An array of zeros and ones. y_pred_score: A continuous value, such as a probability estimate for the positive class. title_rows: Sequence of strings to be used for the chart title. sample_weights: Sequence of floats to modify the influence of individual samples. 
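# Standalone check (not library code) of the sklearn convention the body below
# relies on: precision_recall_curve returns one more precision/recall pair than
# thresholds, so the last pair is dropped to align the arrays for the hover tool.
import numpy as np
from sklearn import metrics as sklmetrics

y_true_binary = np.array([0, 0, 1, 1])
y_pred_score = np.array([0.1, 0.4, 0.35, 0.8])
precision, recall, thresholds = sklmetrics.precision_recall_curve(
    y_true_binary, y_pred_score
)
assert len(precision) == len(recall) == len(thresholds) + 1
precision, recall = precision[:-1], recall[:-1]  # now aligned with thresholds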
Returns: A bokeh figure """ assert y_true_binary.shape == y_pred_score.shape assert set(y_true_binary).issubset({0, 1}) or set(y_true_binary).issubset( {False, True} ) assert np.ndim(y_true_binary) == 1 # Note: len(thresholds) == len(precision) - 1 # The last precision recall pair does not have a corresponding threshold. precision, recall, thresholds = sklmetrics.precision_recall_curve( y_true=y_true_binary, probas_pred=y_pred_score, sample_weight=sample_weights ) precision = precision[:-1] recall = recall[:-1] p = plotting.figure( plot_height=400, plot_width=350, x_range=(-0.05, 1.05), y_range=(-0.05, 1.05), tools=TOOLS, toolbar_location=TOOLBAR_LOCATION, # match_aspect=True, ) source = ColumnDataSource( data={"precision": precision, "recall": recall, "threshold": thresholds} ) # reminder: tpr == recall == sensitivity p.line(x="recall", y="precision", line_width=2, source=source) _add_title_rows(p, title_rows) _apply_default_style(p) p.xaxis.axis_label = "Recall" p.yaxis.axis_label = "Precision" p.add_tools( HoverTool( tooltips=[ ("Precision", "@precision"), ("Recall", "@recall"), ("Threshold", "@threshold"), ], # display a tooltip whenever the cursor is vertically in line with a glyph mode="vline", ) ) return p def _bokeh_automation_rate_analysis( y_target_one_hot: np.ndarray, y_pred_proba: np.ndarray, title_rows: Sequence[str], sample_weights: Optional[np.ndarray], ) -> Figure: """ Plots various quantities over automation rate, where a single probability threshold is used for all classes to decide if we are confident enough to automate the classification. Args: y_target_one_hot: Array with one-hot encoded ground truth, shape(n_samples, n_classes). y_pred_proba: Array with estimated probability distributions, shape(n_samples, n_classes). title_rows: Sequence of strings to be used for the chart title. sample_weights: Sequence of floats to modify the influence of individual samples. 
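# To make the plotted quantities concrete, a small standalone computation (toy
# numbers, not library code) of the automation rate and the accuracy on the
# automated subset for a single confidence threshold:
import numpy as np

y_pred_proba = np.array([[0.9, 0.1], [0.6, 0.4], [0.55, 0.45], [0.2, 0.8]])
y_true = np.array([0, 1, 0, 1])

maxes = y_pred_proba.max(axis=1)        # model confidence per sample
argmaxes = y_pred_proba.argmax(axis=1)  # predicted class per sample

threshold = 0.6
automated = maxes >= threshold          # [True, True, False, True]
automation_rate = automated.mean()      # 0.75
accuracy_on_automated = (argmaxes[automated] == y_true[automated]).mean()  # 2/3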
Returns: A bokeh figure """ # ----- check input ----- assert y_target_one_hot.ndim == 2 assert y_pred_proba.ndim == 2 assert ( y_target_one_hot.shape == y_pred_proba.shape ), f"{y_target_one_hot.shape} != {y_pred_proba.shape}" check_normalization(y_target_one_hot, axis=1) check_normalization(y_pred_proba, axis=1) assert set(y_target_one_hot.ravel()) == {0, 1}, set(y_target_one_hot.ravel()) if sample_weights is None: sample_weights = np.ones(len(y_target_one_hot)) assert_that(sample_weights.shape).is_equal_to((len(y_target_one_hot),)) # ----- compute chart data ----- y_target = y_target_one_hot.argmax(axis=1) argmaxes = y_pred_proba.argmax(axis=1) maxes = y_pred_proba.max(axis=1) assert isinstance(maxes, np.ndarray) # making IntelliJ's type checker happy chart_data = {"automation_rate": [], "threshold": [], "accuracy": []} for threshold in sorted(maxes): automated = maxes >= threshold chart_data["automation_rate"].append( np.average(automated, weights=sample_weights) ) chart_data["threshold"].append(threshold) chart_data["accuracy"].append( accuracy_score( y_true=y_target[automated], y_pred=argmaxes[automated], sample_weight=sample_weights[automated], ) ) # ----- bokeh plot ----- p = plotting.figure( plot_height=400, plot_width=350, x_range=(-0.05, 1.05), y_range=(-0.05, 1.05), tools=TOOLS, toolbar_location=TOOLBAR_LOCATION, # match_aspect=True, ) source = ColumnDataSource( data={key: np.array(lst) for key, lst in chart_data.items()} ) accuracy_line = p.line( x="automation_rate", y="accuracy", line_width=2, source=source, legend="Accuracy", ) p.line( x="automation_rate", y="threshold", line_width=2, color="grey", source=source, legend="Threshold", ) # make sure something is visible if lines consist of just a single point p.scatter( x=source.data["automation_rate"][[0, -1]], y=source.data["accuracy"][[0, -1]] ) p.scatter( x=source.data["automation_rate"][[0, -1]], y=source.data["threshold"][[0, -1]], color="grey", ) _add_title_rows(p, title_rows) _apply_default_style(p) p.xaxis.axis_label = "Automation Rate" p.legend.location = "bottom_left" p.add_tools( HoverTool( renderers=[accuracy_line], tooltips=[ ("Accuracy", "@accuracy"), ("Threshold", "@threshold"), ("Automation Rate", "@automation_rate"), ], # display a tooltip whenever the cursor is vertically in line with a glyph mode="vline", ) ) return p def _add_title_rows(p: Figure, title_rows: Sequence[str]): for title_row in reversed(title_rows): p.add_layout( Title(text=title_row, text_font_size=FONT_SIZE, align="center"), place="above", ) def _apply_default_style(p: Figure): p.background_fill_color = "#f5f5f5" p.grid.grid_line_color = "white" p.toolbar.logo = None p.xaxis.axis_label_text_font_size = FONT_SIZE p.yaxis.axis_label_text_font_size = FONT_SIZE p.axis.axis_line_color = None p.xaxis.major_tick_line_color = None # turn off x-axis major ticks p.xaxis.minor_tick_line_color = None # turn off x-axis minor ticks p.yaxis.major_tick_line_color = None # turn off y-axis major ticks p.yaxis.minor_tick_line_color = None # turn off y-axis minor ticks p.axis.major_label_standoff = 0 if p.legend: p.legend.label_text_font_size = FONT_SIZE p.legend.background_fill_alpha = 0.85 PK!2/metriculous/evaluators/_classification_utils.pyfrom typing import Union, Sequence import numpy as np NORMALIZATION_ABS_TOLERANCE = 1e-5 NORMALIZATION_REL_TOLERANCE = 1e-5 def check_normalization(probabilities: Union[np.ndarray, Sequence[float]], axis: int): np.testing.assert_allclose( np.sum(probabilities, axis=axis), desired=1.0, rtol=NORMALIZATION_REL_TOLERANCE, 
atol=NORMALIZATION_ABS_TOLERANCE, ) PK!&߉1metriculous/evaluators/_segmentation_evaluator.pyfrom typing import Callable from typing import Optional from typing import Sequence from typing import Iterable import numpy as np from sklearn import metrics as sklmetrics from .._evaluation import Evaluation from .._evaluation import Evaluator from .._evaluation import Quantity from ..evaluators._classification_figures_bokeh import _bokeh_output_histogram from ..evaluators._segmentation_figures_bokeh import _bokeh_heatmap class SegmentationEvaluator(Evaluator): """ Implementation of the Segmentation Evaluator which should work well for most image segmentation problems. """ def __init__( self, num_classes: int, class_names: Optional[Sequence[str]] = None, class_weights: Optional[Sequence[float]] = None, filter_quantities: Optional[Callable[[str], bool]] = None, filter_figures: Optional[Callable[[str], bool]] = None, primary_metric: Optional[str] = None, ): """ Initializes the segmentation evaluator Args: num_classes: The number of classes class_names: Optional, names of classes class_weights: Optional, weights of classes in the same order as class_names. These weights don't necessarily need to add up to 1.0 as the weights are normalized but their ratios should reflect the weight distribution desired. filter_quantities: Callable that receives a quantity name and returns `False` if the quantity should be excluded. Examples: `filter_quantities=lambda name: "vs Rest" not in name` `filter_quantities=lambda name: "ROC" in name` filter_figures: Callable that receives a figure title and returns `False` if the figure should be excluded. Examples: `filter_figures=lambda name: "vs Rest" not in name` `filter_figures=lambda name: "ROC" in name` primary_metric: Optional string to specify the most important metric that should be used for model selection. """ self.num_classes = num_classes if class_names is None: self.class_names = ["class_{}".format(i) for i in range(num_classes)] else: self.class_names = class_names if class_weights is None: self.class_weights = [1.0 / num_classes] * num_classes else: total = sum(class_weights) self.class_weights = [weight / total for weight in class_weights] self.filter_quantities = ( (lambda name: True) if filter_quantities is None else filter_quantities ) self.filter_figures = ( (lambda name: True) if filter_figures is None else filter_figures ) self.primary_metric = primary_metric # Check for shape consistency if len(self.class_names) != self.num_classes: raise ValueError( "The number of classes don't match the length of the class_names" ) if len(self.class_weights) != self.num_classes: raise ValueError( "The number of classes don't match the length of the class_weights" ) def evaluate( self, ground_truth: np.ndarray, model_prediction: np.ndarray, model_name: str, sample_weights: Optional[Iterable[float]] = None, ) -> Evaluation: """ Args: ground_truth: A 3D array of the shape - (Num_Samples, Height, Width) model_prediction: A 3D array with the same shape as ground_truth with each channel being the prediction of the model for the corresponding image. model_name: Name of the model to be evaluated sample_weights: Sequence of floats to modify the influence of individual samples on the statistics that will be measured. Returns: An Evaluation object containing Quantities and Figures that are useful for most segmentation problems. 
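# A usage sketch for the evaluator defined above; the class names, mask shapes,
# and random data are illustrative and not part of the package.
import numpy as np

from metriculous.evaluators import SegmentationEvaluator

num_classes = 3
ground_truth = np.random.randint(0, num_classes, size=(2, 64, 64))  # (images, H, W)
prediction = np.random.randint(0, num_classes, size=(2, 64, 64))

evaluator = SegmentationEvaluator(num_classes, class_names=["sky", "road", "car"])
evaluation = evaluator.evaluate(ground_truth, prediction, model_name="MockModel")
# evaluation.quantities holds one "<class> mIoU" per class plus the weighted mean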
""" if sample_weights is not None: raise NotImplementedError( "SegmentationEvaluator currently doesn't support sample weights" ) if ground_truth.shape != model_prediction.shape: raise ValueError( ( f"The shape of the ground truth and the model predictions should be" f"the same. Got ground_truth_shape: {ground_truth.shape}, " f"model_predictions.shape: {model_prediction.shape}" ) ) if ground_truth.ndim != 3: raise ValueError( f"Ground Truth must be a 3D array. Got an {ground_truth.ndim}-d array" ) if model_prediction.ndim != 3: raise ValueError( ( f"Model prediction must be a 3D array. " f"Got a {model_prediction.ndim}-d array" ) ) quantities = [ q for q in self._quantities(model_prediction, ground_truth) if self.filter_quantities(q.name) ] figures = self._figures(model_name, model_prediction, ground_truth) return Evaluation( quantities=quantities, figures=figures, model_name=model_name, primary_metric=self.primary_metric, ) def _figures(self, model_name: str, y_pred: np.ndarray, y_true: np.ndarray): figures = [] figure_name = "Class Distribution" if self.filter_figures(figure_name): figure = _bokeh_output_histogram( y_true=y_true, y_pred=y_pred, class_names=self.class_names, title_rows=[model_name, figure_name], sample_weights=None, ) figure.yaxis.axis_label = "Number of Pixels" figures.append(figure) for class_label, class_name in enumerate(self.class_names): figure_name = f"Heatmap for {class_name}" if self.filter_figures(figure_name): figure = _bokeh_heatmap( y_true=y_true, y_pred=y_pred, class_label=class_label, class_name=class_name, ) figures.append(figure) return figures def _quantities(self, y_pred: np.ndarray, y_true: np.ndarray): # Flattened them as jaccard_score requires it in this way y_true_flattened, y_pred_flattened = y_true.flatten(), y_pred.flatten() quantities = list() weighted_miou = 0.0 class_specific_miou = sklmetrics.jaccard_score( y_true_flattened, y_pred_flattened, average=None ) if len(class_specific_miou) != self.num_classes: raise ValueError( ( f"The number of classes specified ({self.num_classes}) doesn't " f"match with the number of classes actually present in the " f"ground_truth/predictions ({len(class_specific_miou)}). 
Update the" f" num_classes, class_names & class_weights parameters accordingly" ) ) for class_name, class_weight, value in zip( self.class_names, self.class_weights, class_specific_miou ): quantities.append( Quantity(f"{class_name} mIoU", value, higher_is_better=True) ) weighted_miou += class_weight * value quantities.append( Quantity("Class weighted Mean mIoU", weighted_miou, higher_is_better=True) ) return quantities PK!ɔ  6metriculous/evaluators/_segmentation_evaluator_test.pyfrom typing import List import numpy as np import pytest import copy from dataclasses import replace from .._evaluation import Evaluation from .._evaluation import Quantity from ..evaluators import SegmentationEvaluator def get_random_prediction_and_mask(image_size, num_classes): return ( np.random.randint(0, num_classes, image_size), np.random.randint(0, num_classes, image_size), ) @pytest.mark.parametrize("classes", (["dog", "cat", "snake"], ["dog", "cat"])) def test_SegmentationEvaluator(classes: List[str]): np.random.seed(42) num_classes = len(classes) prediction, mask = get_random_prediction_and_mask((2, 256, 256), num_classes) se = SegmentationEvaluator(num_classes, class_names=classes) evaluation = se.evaluate( ground_truth=mask, model_prediction=prediction, model_name="MockModel" ) assert isinstance(evaluation, Evaluation) assert evaluation.model_name == "MockModel" @pytest.mark.parametrize("classes", (["dog", "cat", "snake"], ["dog", "cat"])) def test_SegmentationEvaluator_perfect_prediction(classes: List[str]): np.random.seed(42) num_classes = len(classes) predictions, _ = get_random_prediction_and_mask((2, 256, 256), num_classes) mask = copy.deepcopy(predictions) se = SegmentationEvaluator(num_classes, class_names=classes) evaluation = se.evaluate( ground_truth=mask, model_prediction=predictions, model_name="MockModel" ) expected_quantities = [] for class_name in classes: expected_quantities.append( Quantity(name=f"{class_name} mIoU", value=1.0, higher_is_better=True) ) expected_quantities.append( Quantity(name="Class weighted Mean mIoU", value=1.0, higher_is_better=True) ) assert len(evaluation.quantities) == len(expected_quantities) for actual, expected in zip(evaluation.quantities, expected_quantities): # check that everything except value is equal assert replace(actual, value=42) == replace(expected, value=42) # check that values are approximately equal if isinstance(expected.value, str): assert isinstance(actual, str) assert actual.value == expected.value else: assert isinstance(expected.value, float) assert isinstance(actual.value, float) np.testing.assert_allclose(actual.value, expected.value) @pytest.mark.parametrize( "num_classes, class_names", [(1, ["dog", "cat"]), (2, ["dog"])] ) def test_SegmentationEvaluator_inconsistent_class_names( num_classes: int, class_names: List[str] ): """ Tests if the __init__ method of SegmentationEvaluator raises an error if the length of the class_names list is not equal to num_classes """ with pytest.raises(ValueError): _ = SegmentationEvaluator(num_classes, class_names=class_names) @pytest.mark.parametrize("num_classes, class_weights", [(1, [0.2, 0.3]), (2, [0.2])]) def test_SegmentationEvaluator_inconsistent_class_weights( num_classes: int, class_weights: List[float] ): """ Tests if the __init__ method of SegmentationEvaluator raises an error if the length of the class_weights list is not equal to num_classes """ with pytest.raises(ValueError): _ = SegmentationEvaluator(num_classes, class_weights=class_weights) @pytest.mark.parametrize( "num_classes, 
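# The "Class weighted Mean mIoU" quantity produced by _quantities above is the
# dot product of the normalized class weights with the per-class Jaccard
# scores. A minimal sketch of that arithmetic (the IoU values are made up):
import numpy as np
class_weights = np.array([1.0, 2.0, 2.0]) / 5.0   # normalized -> [0.2, 0.4, 0.4]
per_class_iou = np.array([0.9, 0.5, 0.7])         # e.g. jaccard_score(..., average=None)
weighted_miou = float(np.dot(class_weights, per_class_iou))
assert np.isclose(weighted_miou, 0.2 * 0.9 + 0.4 * 0.5 + 0.4 * 0.7)  # 0.66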
ground_truth, model_prediction", [ (3, *get_random_prediction_and_mask((2, 256, 256), 2)), (2, *get_random_prediction_and_mask((2, 256, 256), 3)), ], ) def test_SegmentationEvaluator_inconsistent_num_classes( num_classes, ground_truth, model_prediction ): """ Tests if the evaluate method of SegmentationEvaluator raises an error if the actual number of classes present in the ground_truth/prediction is not equal to num_classes. """ se = SegmentationEvaluator(num_classes) with pytest.raises(ValueError): se.evaluate(ground_truth, model_prediction, model_name="MockModel") @pytest.mark.parametrize( "num_classes, ground_truth, model_prediction", [ ( 3, np.random.randint(0, 3, (1, 256, 256)), np.random.randint(0, 3, (2, 256, 256)), ) ], ) def test_SegmentationEvaluator_inconsistent_shapes( num_classes, ground_truth, model_prediction ): """ Tests if the evaluate method of SegmentationEvaluator raises an error if the shapes of the ground_truth and model_prediction aren't the same """ se = SegmentationEvaluator(num_classes) with pytest.raises(ValueError): se.evaluate(ground_truth, model_prediction, model_name="MockModel") @pytest.mark.parametrize( "num_classes, ground_truth, model_prediction", [ ( 3, np.random.randint(0, 3, (256, 256)), np.random.randint(0, 3, (2, 256, 256)), ), ( 3, np.random.randint(0, 3, (2, 256, 256)), np.random.randint(0, 3, (256, 256)), ), ], ) def test_SegmentationEvaluator_not_a_3D_array( num_classes, ground_truth, model_prediction ): """ Tests if the evaluate method of SegmentationEvaluator raises an error if the ground_truth or model_prediction isn't a 3D array """ se = SegmentationEvaluator(num_classes) with pytest.raises(ValueError): se.evaluate(ground_truth, model_prediction, model_name="MockModel") @pytest.mark.parametrize("num_classes", [2, 3]) @pytest.mark.parametrize( "quantity_filter", [ lambda name: False, lambda name: True, lambda name: "Weighted" not in name, lambda name: "mIoU" not in name, ], ) def test_SegmentationEvaluator_filter_quantities( num_classes: int, quantity_filter: callable ): np.random.seed(42) predictions, mask = get_random_prediction_and_mask((2, 256, 256), num_classes) se_all = SegmentationEvaluator(num_classes) se_filtering = SegmentationEvaluator(num_classes, filter_quantities=quantity_filter) evaluation_all = se_all.evaluate( ground_truth=mask, model_prediction=predictions, model_name="MockModel" ) evaluation_filtered = se_filtering.evaluate( ground_truth=mask, model_prediction=predictions, model_name="MockModel" ) assert replace(evaluation_all, quantities=[], figures=[]) == replace( evaluation_filtered, quantities=[], figures=[] ) for quantity in evaluation_all.quantities: if quantity_filter(quantity.name): same_quantity = evaluation_filtered.get_by_name(quantity.name) assert same_quantity == quantity else: with pytest.raises(ValueError): evaluation_filtered.get_by_name(quantity.name) for filtered_quantity in evaluation_filtered.quantities: same_quantity = evaluation_all.get_by_name(filtered_quantity.name) assert same_quantity == filtered_quantity @pytest.mark.parametrize( "num_classes, desired_number_of_figures, figure_filter", [ (3, 0, lambda name: False), (3, 4, lambda name: True), (3, 1, lambda name: "Heatmap" not in name), (3, 3, lambda name: "Class" not in name), (2, 2, lambda name: "Class" not in name), (2, 3, lambda name: True), ], ) def test_SegmentationEvaluator_filter_figures( num_classes: int, desired_number_of_figures: int, figure_filter: callable ): np.random.seed(42) predictions, mask = get_random_prediction_and_mask((2, 256, 
256), num_classes) se_all = SegmentationEvaluator(num_classes) se_filtering = SegmentationEvaluator(num_classes, filter_figures=figure_filter) evaluation_all = se_all.evaluate( ground_truth=mask, model_prediction=predictions, model_name="MockModel" ) evaluation_filtered = se_filtering.evaluate( ground_truth=mask, model_prediction=predictions, model_name="MockModel" ) assert replace(evaluation_all, figures=[]) == replace( evaluation_filtered, figures=[] ) assert len(evaluation_filtered.figures) == desired_number_of_figures PK!ld  5metriculous/evaluators/_segmentation_figures_bokeh.pyfrom typing import Optional import numpy as np from bokeh import plotting from bokeh.plotting import Figure from bokeh.models import Title from bokeh.layouts import column TOOLS = "pan,box_zoom,reset" TOOLBAR_LOCATION = "right" def _bokeh_heatmap( y_true: np.ndarray, y_pred: np.ndarray, class_label: int, class_name: Optional[str] = None, ) -> Figure: """ Creates heatmaps of the predictions and ground_truth corresponding to the class_label Args: y_true: 3d integer array indicating the ground_truth masks. Shape: (Num_Samples, Height, Width) y_pred: 3d integer array indicating the predictions of the model as the same shape as y_true class_label: An integer corresponding to the class for which the heatmap is desired class_name: Class Name corresponding to the class_label Returns: A bokeh figure """ if y_pred.shape != y_true.shape: raise ValueError( ( "The shapes of y_pred and y_true must be the same. " f"Got y_pred shape: {y_pred.shape}, y_true shape: {y_true.shape}" ) ) if class_label not in np.unique(y_true): raise ValueError("Incorrect class_label provided, doesn't exist in y_true") if class_name is None: class_name = f"Class {class_label}" padding = 5 mean_activation_predictions = np.average( (y_pred == class_label).astype(np.uint8), axis=0 ) mean_activation_ground_truth = np.average( (y_true == class_label).astype(np.uint8), axis=0 ) p1 = plotting.figure( tools=TOOLS, toolbar_location=TOOLBAR_LOCATION, width=y_true.shape[2], height=y_true.shape[1], ) p1.x_range.range_padding = p1.y_range.range_padding = 0 p1.toolbar.logo = None p1.image( image=[mean_activation_predictions], x=0, y=0, dw=y_true.shape[2], dh=y_true.shape[1], ) p1.add_layout(Title(text="Ground Truth", align="center"), "below") p1.add_layout( Title(text=f"Heatmap for {class_name}", align="center"), place="above" ) p1.axis.visible = False p2 = plotting.figure( tools=TOOLS, toolbar_location=TOOLBAR_LOCATION, width=y_true.shape[2], height=y_true.shape[1], x_range=p1.x_range, ) p2.x_range.range_padding = p2.y_range.range_padding = 0 p2.toolbar.logo = None p2.image( image=[mean_activation_ground_truth], x=0, y=y_true.shape[1] + padding, dw=y_true.shape[2], dh=y_true.shape[1], ) p2.add_layout(Title(text="Prediction", align="center"), "below") p2.axis.visible = False return column(p1, p2) PK!2Ůmetriculous/metrics.py"""Module defining generic metric functions.""" from typing import List from typing import Optional from typing import Tuple import numpy as np from assertpy import assert_that from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_curve def normalized(matrix: np.ndarray) -> np.ndarray: """Returns normalized array where each row sums up to 1.0.""" assert np.ndim(matrix) == 2 sums = np.sum(matrix, axis=1, keepdims=True) # avoid crash on zeros matrix = matrix + (sums == 0.0) * 1e-15 return matrix / np.sum(matrix, axis=1, keepdims=True) def cross_entropy( target_probas: np.ndarray, pred_probas: np.ndarray, epsilon=1e-15 ) -> 
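# The two panels produced by _bokeh_heatmap above are per-pixel class
# frequencies: the boolean mask for one class label is averaged over the
# sample axis, giving an image with values in [0, 1]. A self-contained sketch
# of that step, using random masks and a hypothetical class label:
import numpy as np
rng = np.random.RandomState(0)
y_true = rng.randint(0, 3, size=(8, 32, 32))   # (num_samples, height, width)
y_pred = rng.randint(0, 3, size=(8, 32, 32))
class_label = 1
mean_activation_pred = np.average((y_pred == class_label).astype(np.uint8), axis=0)
mean_activation_true = np.average((y_true == class_label).astype(np.uint8), axis=0)
assert mean_activation_pred.shape == (32, 32)  # one value per pixel, in [0, 1]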
float: """Returns the cross-entropy for probabilistic ground truth labels. Args: target_probas: 2D array with rows being target probability distributions. pred_probas: 2D array with rows being estimated probability distributions. epsilon: Clipping offset to avoid numerical blowup (NaNs, inf, etc). """ # check normalization before clipping assert np.allclose( np.sum(target_probas, axis=1), 1.0, atol=1e-3 ), "Target probability distributions not normalized!" assert np.allclose( np.sum(pred_probas, axis=1), 1.0, atol=1e-3 ), "Predicted probability distributions not normalized!" # clip predicted probabilities pred_probas = np.clip(pred_probas, a_min=epsilon, a_max=1.0 - epsilon) # normalize pred_probas = normalized(pred_probas) # compute cross entropy values = -np.sum(target_probas * np.log(pred_probas), axis=1) # noinspection PyTypeChecker ce: float = np.mean(values) return ce def a_vs_b_auroc( target_ints: np.ndarray, predicted_probas: np.ndarray, class_a: int, class_b: int ) -> Optional[float]: """ Keeps only targets of class A or B, then computes the ROC AUC for the binary problem. Args: target_ints: 1d array of target class integers. predicted_probas: 2d array of predicted probabilities, one row per data point. class_a: Integer specifying the positive class. class_b: Integer specifying the negative class. Returns: A float or None if the result could not be computed. """ # only consider instances with targets of class A or B filter_mask = np.logical_or(target_ints == class_a, target_ints == class_b) target_ints = target_ints[filter_mask] predicted_probas = predicted_probas[filter_mask] # return None if not both classes represented if len(np.unique(target_ints)) != 2: return None # consider only probability columns for class A and B and renormalize binary_probas = normalized(predicted_probas[:, (class_a, class_b)]) # use class A as the positive class scores = binary_probas[:, 0] return roc_auc_score(y_true=target_ints == class_a, y_score=scores) def one_vs_all_auroc_values( target_ints: np.ndarray, predicted_probas: np.ndarray ) -> List[Optional[float]]: """Returns one AUROC (area under ROC curve, aka ROC AUC) score per class. Args: target_ints: 1d array of target class integers. predicted_probas: 2d array of predicted probabilities, one row per data point. Returns: A list with one AUROC value per class. """ assert len(predicted_probas) == len(target_ints) n_classes = predicted_probas.shape[1] auroc_values = [] for positive_class in range(n_classes): scores = predicted_probas[:, positive_class] is_positive_class = target_ints == positive_class if any(is_positive_class) and not all(is_positive_class): auroc_values.append(roc_auc_score(y_true=is_positive_class, y_score=scores)) else: auroc_values.append(None) return auroc_values def sensitivity_at_x_specificity( target_ints: np.ndarray, positive_probas: np.ndarray, at_specificity: float ) -> Tuple[Optional[float], Optional[float]]: """Compute sensitivity (recall) at a given specificity. 
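# Worked example for cross_entropy above (an illustrative sketch with
# hand-picked numbers, not part of the original test suite): for a one-hot
# target and predicted probabilities [0.8, 0.2] the result is -ln(0.8).
import numpy as np
from metriculous.metrics import cross_entropy
target_probas = np.array([[1.0, 0.0]])
pred_probas = np.array([[0.8, 0.2]])
ce = cross_entropy(target_probas, pred_probas)
assert np.isclose(ce, -np.log(0.8))  # ~= 0.2231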
Sensitivity = true positive rate = true positives / positives = recall = P(prediction positive | class positive) Specificity = true negative rate = true negatives / negatives = 1 - false positive rate = P(prediction negative | class negative) Args: target_ints: 1d array of binary class labels, zeros and ones positive_probas: 1d array of probabilities of class 1 at_specificity: specificity at which to compute sensitivity Returns: (float): sensitivity at returned specificity (float): specificity closest to input specificity """ assert 0 < at_specificity < 1 if len(set(target_ints)) < 2: return None, None fprs, sensitivities, _ = roc_curve(target_ints, positive_probas) specificities = 1.0 - fprs # last and first entries are not interesting (0 or 1) if len(specificities) > 2: specificities = specificities[1:-1] sensitivities = sensitivities[1:-1] # find point on curve that is closest to desired at_specificity index = np.argmin(np.abs(specificities - at_specificity)) return sensitivities[index], specificities[index] def specificity_at_x_sensitivity( target_ints: np.ndarray, positive_probas: np.ndarray, at_sensitivity: float ) -> Tuple[Optional[float], Optional[float]]: """Compute specificity at a given sensitivity (recall). Sensitivity = true positive rate = true positives / positives = recall = P(prediction positive | class positive) Specificity = true negative rate = true negatives / negatives = 1 - false positive rate = P(prediction negative | class negative) Args: target_ints: 1d array of binary class labels, zeros and ones positive_probas: 1d array of probabilities of class 1 at_sensitivity: sensitivity at which to compute specificity Returns: (float): specificity at returned sensitivity (float): sensitivity closest to input sensitivity """ assert 0 < at_sensitivity < 1 if len(set(target_ints)) < 2: return None, None fprs, sensitivities, _ = roc_curve(target_ints, positive_probas) specificities = 1.0 - fprs # last and first entries are not interesting if len(specificities) > 2: specificities = specificities[1:-1] sensitivities = sensitivities[1:-1] # find point on curve that is closest to desired sensitivity index = np.argmin(np.abs(sensitivities - at_sensitivity)) return specificities[index], sensitivities[index] def top_n_accuracy( target_ints: np.ndarray, predicted_probas: np.ndarray, n: int, sample_weights: Optional[np.ndarray] = None, ) -> float: """Fraction of test cases where the true target is among the top n predictions.""" assert len(target_ints) == len(predicted_probas) assert np.ndim(target_ints) == 1 assert np.ndim(predicted_probas) == 2 if sample_weights is None: sample_weights = np.ones_like(target_ints) assert_that(sample_weights.shape).is_equal_to(target_ints.shape) np.testing.assert_array_equal(sample_weights >= 0.0, True) # sort predicted class indices by probability (ascending) classes_by_probability = predicted_probas.argsort(axis=1) # take last n columns, because we sorted ascending top_n_predictions = classes_by_probability[:, -n:] # check if target is included is_target_in_top_n_predictions = [ target in top_n for target, top_n in zip(target_ints, top_n_predictions) ] top_n_acc = np.average(is_target_in_top_n_predictions, weights=sample_weights) return top_n_acc PK!ϩ*v*v*metriculous/metrics_test.pyimport numpy as np import pytest import sklearn.metrics as sklmetrics from scipy.stats import entropy import metriculous.metrics as metrics from metriculous.metrics import normalized from metriculous.metrics import sensitivity_at_x_specificity from metriculous.metrics 
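# Usage sketch for sensitivity_at_x_specificity above (made-up scores, not a
# library test): the function returns the sensitivity together with the
# specificity of the closest operating point on the ROC curve, or (None, None)
# if only one class is present in the targets.
import numpy as np
from metriculous.metrics import sensitivity_at_x_specificity
target_ints = np.array([0, 0, 0, 0, 1, 1, 1, 1])
positive_probas = np.array([0.1, 0.2, 0.3, 0.6, 0.4, 0.7, 0.8, 0.9])
sens, spec = sensitivity_at_x_specificity(target_ints, positive_probas, at_specificity=0.75)
# `spec` is the ROC-curve specificity closest to 0.75; `sens` is the recall at that point.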
import specificity_at_x_sensitivity from metriculous.metrics import top_n_accuracy # --- normalized ----------------------------------------------------------------------- def test_normalized(): # fmt: off result = metrics.normalized(np.array([ [.0, .0], [.1, .1], [.2, .3], [.5, .5], [.6, .4], [1., 4.], [0., 1.], [0., 1e-20], ])) expected = np.array([ [.5, .5], [.5, .5], [.4, .6], [.5, .5], [.6, .4], [.2, .8], [0., 1.], [0., 1.], ]) assert np.allclose(result, expected, atol=0.0) # fmt: on # --- cross-entropy -------------------------------------------------------------------- def test_cross_entropy_zero(): ce = metrics.cross_entropy( target_probas=np.array([[1.0, 0.0], [1.0, 0.0]]), pred_probas=np.array([[1.0, 0.0], [1.0, 0.0]]), epsilon=1e-15, ) np.testing.assert_allclose(ce, 0.0, atol=1e-15) def test_cross_entropy_certainty_in_targets(): target_probas = np.array([[1.0, 0.0], [1.0, 0.0]]) pred_probas = np.array([[0.6, 0.4], [0.1, 0.9]]) eps = 1e-15 ce = metrics.cross_entropy(target_probas, pred_probas, eps) ll = sklmetrics.log_loss(target_probas, pred_probas, eps) np.testing.assert_allclose(ce, ll) def test_cross_entropy_general_fuzz_test(): rng = np.random.RandomState(42) for _ in range(10): probas = normalized(rng.rand(100, 2)) ce = metrics.cross_entropy(probas, probas) scipy_entropy = np.sum(entropy(probas.T)) / len(probas) np.testing.assert_allclose(ce, scipy_entropy) # --- A vs B AUROC --------------------------------------------------------------------- def test_a_vs_b_auroc(): value = metrics.a_vs_b_auroc( target_ints=np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), predicted_probas=np.array( [ [0.8, 0.0, 0.2], # correct [0.9, 0.1, 0.0], # correct [0.7, 0.1, 0.2], # correct [0.1, 0.8, 0.1], # correct [0.1, 0.8, 0.1], # correct [0.1, 0.6, 0.3], # correct [0.9, 0.0, 0.1], # wrong [0.1, 0.9, 0.0], # wrong [0.1, 0.0, 0.9], # wrong ] ), class_a=0, class_b=1, ) assert value == 1.0 def test_a_vs_b_auroc_symmetry(): """Check that result is the same when classes are swapped.""" rng = np.random.RandomState(42) for _ in range(50): probas = normalized(rng.rand(100, 4)) target_ints = rng.randint(0, 4, size=len(probas)) a1b2 = metrics.a_vs_b_auroc( target_ints=target_ints, predicted_probas=probas, class_a=1, class_b=2 ) a2b1 = metrics.a_vs_b_auroc( target_ints=target_ints, predicted_probas=probas, class_a=2, class_b=1 ) np.testing.assert_allclose(a1b2, a2b1, atol=1e-15) def test_a_vs_b_auroc_zeros(): """Check case with zeros in all interesting columns.""" value = metrics.a_vs_b_auroc( target_ints=np.array([0, 1]), predicted_probas=np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]]), class_a=0, class_b=1, ) # we just want to make sure this did not crash assert 0.0 <= value <= 1.0 def test_a_vs_b_auroc_none(): """Check case where it should return None.""" rng = np.random.RandomState(42) for _ in range(50): probas = normalized(rng.rand(100, 4)) target_ints = rng.randint(0, 1, size=len(probas)) value = metrics.a_vs_b_auroc( target_ints=target_ints, predicted_probas=probas, class_a=1, class_b=2 ) assert value is None # --- sensitivity at specificity ------------------------------------------------------- def test_sensitivity_at_x_specificity(): """Test AUC 0.5 prediction.""" n = 500 labels = np.concatenate((np.zeros(n), np.ones(n))) randoms = np.random.random(n) positive_probas = np.concatenate((randoms, randoms + 1e-9)) for at in np.linspace(0.1, 0.9, num=9): sens, spec = sensitivity_at_x_specificity( target_ints=labels, positive_probas=positive_probas, at_specificity=at ) np.testing.assert_allclose(spec, at, 
atol=0.003) np.testing.assert_allclose(sens, 1.0 - spec, atol=0.003) # --- specificity at sensitivity ------------------------------------------------------- def test_specificity_at_x_sensitivity(): """Test AUC 0.5 prediction.""" n = 500 labels = np.concatenate((np.zeros(n), np.ones(n))) randoms = np.random.random(n) positive_probas = np.concatenate((randoms, randoms + 1e-9)) for at in np.linspace(0.1, 0.9, num=9): spec, sens = specificity_at_x_sensitivity( target_ints=labels, positive_probas=positive_probas, at_sensitivity=at ) np.testing.assert_allclose(sens, at, atol=0.003) np.testing.assert_allclose(spec, 1.0 - sens, atol=0.003) # --- top N accuracy ------------------------------------------------------------------- def test_top_n_accuracy_all_correct(): np.random.seed(42) n_classes = 30 for i in range(5): target_ints = np.random.randint(0, n_classes, size=100) pred_probas = np.eye(n_classes)[target_ints] + np.random.rand( len(target_ints), n_classes ) for n in [1, 2, 3, 40, 100]: assert top_n_accuracy(target_ints, pred_probas, n) == 1.0 def test_top_n_accuracy(): target_ints = np.array([3, 1, 4]) # fmt:off pred_probas = np.array([ [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 ]) # fmt:on assert 1 / 3 == top_n_accuracy(target_ints, pred_probas, n=1) assert 2 / 3 == top_n_accuracy(target_ints, pred_probas, n=2) assert 2 / 3 == top_n_accuracy(target_ints, pred_probas, n=3) assert 3 / 3 == top_n_accuracy(target_ints, pred_probas, n=4) assert 3 / 3 == top_n_accuracy(target_ints, pred_probas, n=5) assert 3 / 3 == top_n_accuracy(target_ints, pred_probas, n=999) def test_top_n_accuracy__sample_weights_default(): """ Checks that passing in a uniform sample_weights vector does the same as passing `None` or using the default. """ target_ints = np.array([3, 1, 4]) # fmt:off pred_probas = np.array([ [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 ]) # fmt:on assert top_n_accuracy(target_ints, pred_probas, n=1) == top_n_accuracy( target_ints, pred_probas, n=1, sample_weights=np.ones_like(target_ints) ) assert top_n_accuracy( target_ints, pred_probas, n=1, sample_weights=None ) == top_n_accuracy( target_ints, pred_probas, n=1, sample_weights=np.ones_like(target_ints) ) def test_top_n_accuracy__sample_weights(): """ Same test as above, with additional zero-weighted samples, should get same output. 
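# How top_n_accuracy (from metriculous/metrics.py above) selects the top-n
# classes, shown on the same toy probabilities these tests use; an
# illustrative sketch of the argsort step, not an additional test:
import numpy as np
pred_probas = np.array([
    [.3, .4, .2, .1, .0],
    [.4, .3, .2, .1, .0],
    [.0, .1, .2, .3, .4],
])
top_2 = pred_probas.argsort(axis=1)[:, -2:]  # argsort is ascending, so take the last n columns
assert top_2.tolist() == [[0, 1], [1, 0], [3, 4]]
# With targets [3, 1, 4] only the last two rows contain their target -> top-2 accuracy 2/3.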
""" target_ints = np.array([3, 1, 4, 1, 1, 1]) sample_weights = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) # fmt:off pred_probas = np.array([ [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 ]) # fmt:on assert 1 / 3 == top_n_accuracy( target_ints, pred_probas, n=1, sample_weights=sample_weights ) assert 2 / 3 == top_n_accuracy( target_ints, pred_probas, n=2, sample_weights=sample_weights ) assert 2 / 3 == top_n_accuracy( target_ints, pred_probas, n=3, sample_weights=sample_weights ) assert 3 / 3 == top_n_accuracy( target_ints, pred_probas, n=4, sample_weights=sample_weights ) assert 3 / 3 == top_n_accuracy( target_ints, pred_probas, n=5, sample_weights=sample_weights ) assert 3 / 3 == top_n_accuracy( target_ints, pred_probas, n=999, sample_weights=sample_weights ) def test_top_n_accuracy__sample_weights_scaled(): """ Checks that scaling the weight vector does not change the results. """ target_ints = np.array([3, 1, 4, 1, 1, 1]) sample_weights = np.array([2.4, 0.5, 2.1, 0.01, 0.9, 35.7]) # fmt:off pred_probas = np.array([ [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 ]) # fmt:on assert top_n_accuracy( target_ints, pred_probas, n=1, sample_weights=sample_weights ) == top_n_accuracy( target_ints, pred_probas, n=1, sample_weights=42.0 * sample_weights ) def test_top_n_accuracy__sample_weights_all_zeros(): """ Checks that passing in zero vector `sample_weights` raises `ZeroDivisionError`. """ target_ints = np.array([3, 1, 4, 1, 1, 1]) sample_weights = np.zeros_like(target_ints) # fmt:off pred_probas = np.array([ [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 ]) # fmt:on with pytest.raises(ZeroDivisionError): _ = top_n_accuracy(target_ints, pred_probas, n=1, sample_weights=sample_weights) def test_top_n_accuracy__sample_weights_negative(): """ Checks that an exception is raised if at least one of the sample weights is negative. 
""" target_ints = np.array([3, 1, 4, 1, 1, 1]) sample_weights = np.array([1.0, 1.0, -1.0, 1.0, 1.0, 1.0]) # fmt:off pred_probas = np.array([ [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 [.3, .4, .2, .1, .0], # target in top 4 [.4, .3, .2, .1, .0], # target in top 2 [.0, .1, .2, .3, .4], # target in top 1 ]) # fmt:on with pytest.raises(AssertionError): _ = top_n_accuracy(target_ints, pred_probas, n=1, sample_weights=sample_weights) PK!|metriculous/test_resources.pyfrom typing import List from typing import Tuple import numpy as np def noisy_prediction(targets_one_hot: np.array, noise_factor: float): """Simulates a classifier prediction on the dataset.""" assert targets_one_hot.ndim == 2 # Add some noise to the predictions to simulate a classifier noisy_target = targets_one_hot + noise_factor * np.random.random( size=targets_one_hot.shape ) # Normalize the rows, making sure they are valid probability distributions probability_distributions = noisy_target / noisy_target.sum(axis=1, keepdims=True) return probability_distributions def generate_input( num_classes: int, num_samples: int, num_models: int ) -> Tuple[np.ndarray, List[np.ndarray]]: target_class_indices = np.random.randint(0, high=num_classes, size=num_samples) targets_one_hot = np.eye(num_classes)[target_class_indices] # For each model that goes into the comparison, let's generate a prediction. # Note that we pick a random noise factor to make sure some models have more noise # than others. predicted_probabilities = [ noisy_prediction(targets_one_hot, noise_factor=3 * np.random.random()) for i_model in range(num_models) ] return targets_one_hot, predicted_probabilities PK!}Kk k metriculous/utilities.pyfrom typing import Sequence import numpy as np from assertpy import assert_that def sample_weights_simulating_class_distribution( y_true: Sequence[int], hypothetical_class_distribution: Sequence[float] ) -> np.ndarray: """ Computes a 1D array of sample weights that results in the requested `hypothetical_class_distribution` if applied to the dataset. This is useful when you know that the class distribution in your dataset deviates from the distribution you expect to encounter in the environment where your machine learning model is going to be deployed. Example: You have a data set with 40% spam 60% ham emails. However, you expect that only 4% of the emails in the deployment environment will be spam, and you would like to measure various performance characteristics on a dataset with 4% spam and 96% ham. This function will return an array with * sample weights 4% / 40% = 0.1 for all of the spam examples * sample weights 96% / 60% = 1.6 for all of the ham examples if called with: >>> weights = sample_weights_simulating_class_distribution( ... y_true=[0, 1, 1, 0, 1, 0, 1, 1, 0, 1], # zeros for spam ... hypothetical_class_distribution=[0.04, 0.96] ... ) >>> print(weights) array([0.1 , 1.6]) Args: y_true: 1D array of integers with class indices of the dataset. There must be at least one sample for each class. hypothetical_class_distribution: Sequence of floats describing the distribution you assume to encounter in your deployment environment. Returns: 1D numpy array with sample weights, same length as `y_true`. 
""" # --- check input --- assert_that(set(y_true)).is_equal_to( set(range(len(hypothetical_class_distribution))) ) assert_that(len(set(y_true))).is_equal_to(len(hypothetical_class_distribution)) y_true = np.asarray(y_true) hypothetical_class_distribution = np.asarray(hypothetical_class_distribution) np.testing.assert_allclose(hypothetical_class_distribution.sum(), 1.0) assert_that(y_true.ndim).is_equal_to(1) assert_that(hypothetical_class_distribution.ndim).is_equal_to(1) # --- compute output --- class_distribution = np.bincount(y_true) / len(y_true) np.testing.assert_equal(class_distribution > 0.0, True) np.testing.assert_allclose(class_distribution.sum(), 1.0) weights = [ hypothetical_class_distribution[y] / class_distribution[y] for y in y_true ] return np.array(weights) PK! metriculous/utilities_test.pyimport numpy as np import pytest from sklearn import metrics as sklmetrics from . import utilities def test_sample_weights(): y_true = np.array([0, 0, 0, 0, 1, 1, 2, 2, 2, 2]) weights = utilities.sample_weights_simulating_class_distribution( y_true=y_true, # distribution: [0.4, 0.2, 0.4] hypothetical_class_distribution=[0.90, 0.08, 0.02], ) expected_weights = np.array( [ 0.90 / 0.4, 0.90 / 0.4, 0.90 / 0.4, 0.90 / 0.4, 0.08 / 0.2, 0.08 / 0.2, 0.02 / 0.4, 0.02 / 0.4, 0.02 / 0.4, 0.02 / 0.4, ] ) assert np.shape(weights) == np.shape(y_true) np.testing.assert_allclose(weights, expected_weights) # Now use the sample weights and see if they have the desired effect: # Use predictions where first four entries, # which correspond to true class 0, are correct. some_prediction = np.array([0, 0, 0, 0, 2, 2, 1, 1, 1, 1]) accuracy_with_weights = sklmetrics.accuracy_score( y_true=y_true, y_pred=some_prediction, sample_weight=weights ) accuracy_without_weights = 0.4 assert accuracy_with_weights == pytest.approx( accuracy_without_weights * 0.90 / 0.4, abs=1e-9 ) def test_sample_weights__distribution_not_normalized(): """ Checks that an exception is raised if the hypothetical class distribution is not normalized. """ not_normalized = [0.4, 0.3, 0.1] with pytest.raises(AssertionError): _ = utilities.sample_weights_simulating_class_distribution( y_true=[0, 1, 2, 0, 1, 2], hypothetical_class_distribution=not_normalized ) @pytest.mark.parametrize( "y_true, hypothetical_class_weights", [ ([0, 0, 1, 3], [0.5, 0.3, 0.1, 0.1]), ([0, 0, 1, 3], [0.5, 0.3, 0.2]), ([0, 0, 1, 3], [0.5, 0.1, 0.2, 0.1, 0.1]), ([0, 1, 2, 3], [0.5, 0.1, 0.2, 0.1, 0.1]), ([3], [0.5, 0.3, 0.2]), ([0], [0.5, 0.5]), ([1], [1.0]), ], ) def test_sample_weights__class_not_represented(y_true, hypothetical_class_weights): """ Checks that an exception is raised if at least one class is not represented in the input. 
""" np.testing.assert_allclose(sum(hypothetical_class_weights), 1.0) with pytest.raises(AssertionError): _ = utilities.sample_weights_simulating_class_distribution( y_true=y_true, hypothetical_class_distribution=hypothetical_class_weights ) PK!;]..#metriculous-0.1.0.dist-info/LICENSEMIT License Copyright (c) 2019 Luminovo GmbH Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!HڽTU!metriculous-0.1.0.dist-info/WHEEL A н#Z;/"d&F[xzw@Zpy3Fv]\fi4WZ^EgM_-]#0(q7PK!Ho$metriculous-0.1.0.dist-info/METADATAUQo6~篸%k ,Y`I%^Z"YR)9i~%wwwG_"Z`ʏ' Vm[1?bmπ{h"jYZ\+/Zz&FZ TttKe.j0Vy%񢰑֎Ĉ7nxntSfB.eѹ%m Sd:D v#?d!_6 p! ~mE\#'og{Dz4<ܒ" @v,"SCuXw*ksҊ *^ U$^YPߋ;'g@<J-?:|;Ǔj^j$hӷCW.k?T} nh'{]n4* ,{COifօ~P Pdhl.xjɺ4E[eSawF@?l$օR(JLs:(K(~JbOcp{Vk:k=-7јr%*D(h >о!=ц||i|ͳK\^eC 4-zԈ5*`GT!Hug#=>l{ZRQmxGUk$uC{@=H|L\=[VBCH;w2[1aU(XU*'8 U,!';LeAdEX7pXF|DnE[VlHkKlAꝴPѓVF-vYOE$h*p6ü%J "0ɉQ֑(T)i<8qEbM04JGM!  !:K L?$ .OoQ:$YD X~ E ft߾n8-cy9:sC\Y8dpC6Ċbſ}:Y/PK!H-?p"metriculous-0.1.0.dist-info/RECORDIJ-Ёb^da0 f7GxU6宅!vGu%tmPBTgT65:୽M[yR3&Gl.jhcܞw8OaVMI ?Y6 x31x=nFn^hgV R/6Q<c S\Mm[ZK6EZ4P$'n0`B =O}^tNuX,!N3ߚ[kb#\N1{\ZCɑy҃ &Ѵ3WvsN!?@g:{rFfT=,+eͼ=jQSjyw_j8 {F"1淪Mzh%s0W #< 5 ՓT*"}OE;Ⴊ گd;wk(Bw׶ 1lQ)!d#_iu/N /4UfCpi8}$#UθEX/w*M/ R$I4f$6QT%0TWyRVe}^'U\&4bx5I:'vM/]AVU:p+ #6i;$ݦCyןI:.i:G !0p4s A_PK!;]..LICENSEPK!MzSmetriculous/__init__.pyPK!a%%(>metriculous/__init___import_star_test.pyPK!@imetriculous/__init___test.pyPK!L,Q$$$$metriculous/_comparison.pyPK!|{ { 3metriculous/_evaluation.pyPK!_Y-=metriculous/_evaluation_test.pyPK!jr"@metriculous/evaluators/__init__.pyPK!y[R[R3Ametriculous/evaluators/_classification_evaluator.pyPK!1~//8metriculous/evaluators/_classification_evaluator_test.pyPK!W'EE7metriculous/evaluators/_classification_figures_bokeh.pyPK!2/ metriculous/evaluators/_classification_utils.pyPK!&߉1 metriculous/evaluators/_segmentation_evaluator.pyPK!ɔ  6+metriculous/evaluators/_segmentation_evaluator_test.pyPK!ld  5Lmetriculous/evaluators/_segmentation_figures_bokeh.pyPK!2ŮfWmetriculous/metrics.pyPK!ϩ*v*v*Gvmetriculous/metrics_test.pyPK!|metriculous/test_resources.pyPK!}Kk k 4metriculous/utilities.pyPK! հmetriculous/utilities_test.pyPK!;]..#metriculous-0.1.0.dist-info/LICENSEPK!HڽTU!nmetriculous-0.1.0.dist-info/WHEELPK!Ho$metriculous-0.1.0.dist-info/METADATAPK!H-?p"Wmetriculous-0.1.0.dist-info/RECORDPK