LICENSE
MIT License
Copyright (c) 2019 Luminovo GmbH
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# metriculous/__init__.py
from metriculous import evaluators
from metriculous import utilities
from metriculous._comparison import Comparator
from metriculous._comparison import Comparison
from metriculous._evaluation import Evaluation
from metriculous._evaluation import Evaluator
from metriculous._evaluation import Quantity
__all__ = [
"Comparator",
"Comparison",
"Evaluator",
"Evaluation",
"Quantity",
"evaluators",
"utilities",
]
# metriculous/__init___import_star_test.py
from metriculous import *  # noqa
def test_import_star():
_ = Quantity("q", 42.0) # noqa
e = Evaluator() # noqa
    _ = Evaluation("MyModel", [])  # noqa
_ = Comparator(evaluator=e) # noqa
_ = Comparison([]) # noqa
_ = evaluators.ClassificationEvaluator() # noqa
# metriculous/__init___test.py
import pytest
from assertpy import assert_that
def test_exposed_entities():
expected = [
"Comparator",
"Comparison",
"Evaluator",
"Evaluation",
"Quantity",
"evaluators",
"utilities",
]
import metriculous
assert_that(metriculous.__all__).is_equal_to(expected)
def test_imports_from_style():
from metriculous import Comparator
from metriculous import Comparison
from metriculous import Evaluation
from metriculous import Evaluator
from metriculous import Quantity
num_classes = 42
_ = Quantity("q", 42.0)
e = Evaluator()
_ = Evaluation("MyModel", [], [])
_ = Comparator(evaluator=e)
_ = Comparison([])
with pytest.raises(ImportError):
# noinspection PyUnresolvedReferences,PyProtectedMember
from metriculous import ClassificationEvaluator
_ = ClassificationEvaluator()
from metriculous.evaluators import ClassificationEvaluator
_ = ClassificationEvaluator()
with pytest.raises(ImportError):
# noinspection PyUnresolvedReferences,PyProtectedMember
from metriculous import SegmentationEvaluator
_ = SegmentationEvaluator(num_classes)
from metriculous.evaluators import SegmentationEvaluator
_ = SegmentationEvaluator(num_classes)
def test_imports_prefix_style():
import metriculous as met
num_classes = 42
_ = met.Quantity("q", 42.0)
e = met.Evaluator()
_ = met.Evaluation("MyModel", [], [])
_ = met.Comparator(evaluator=e)
_ = met.Comparison([])
_ = met.evaluators.ClassificationEvaluator()
_ = met.evaluators.SegmentationEvaluator(num_classes)
_ = met.utilities.sample_weights_simulating_class_distribution(
[0, 1, 2, 2], [0.8, 0.2, 0.0]
)
with pytest.raises(AttributeError):
# noinspection PyUnresolvedReferences
_ = met.ClassificationEvaluator()
_ = met.SegmentationEvaluator(num_classes)
# metriculous/_comparison.py
import os
from dataclasses import dataclass
from typing import Any, Optional
from typing import List
from typing import Sequence
import bokeh.layouts
import numpy as np
import pandas as pd
from assertpy import assert_that
from bokeh import plotting
from bokeh.models import Spacer
from IPython.display import display
from IPython.display import HTML
from IPython.display import Markdown
from metriculous._evaluation import Evaluation
from metriculous._evaluation import Evaluator
@dataclass(frozen=True)
class Comparison:
evaluations: List[Evaluation]
def __post_init__(self):
_check_consistency(self.evaluations)
def display(self, include_spacer=False):
_display_comparison_table(self.evaluations, include_spacer)
# noinspection PyBroadException
try:
os.system('say "Model comparison is ready."')
except Exception:
pass
class Comparator:
"""Can generate model comparisons after initialization with an Evaluator."""
def __init__(self, evaluator: Evaluator):
self.evaluator = evaluator
def compare(
self,
ground_truth: Any,
model_predictions: Sequence[Any],
model_names=None,
sample_weights: Optional[Sequence[float]] = None,
) -> Comparison:
"""Generates a Comparison from a list of predictions and the ground truth.
Args:
model_predictions:
List with one prediction object per model to be compared.
ground_truth:
A single ground truth object.
model_names:
Optional list of model names. If `None` generic names will be generated.
sample_weights:
Optional sequence of floats to modify the influence of individual
samples on the statistics that will be measured.
Returns:
A Comparison object with one Evaluation per prediction.
"""
if model_names is None:
model_names = [f"Model_{i}" for i in range(len(model_predictions))]
else:
assert_that(model_names).is_length(len(model_predictions))
model_evaluations = [
self.evaluator.evaluate(
ground_truth,
model_prediction=pred,
model_name=model_name,
sample_weights=sample_weights,
)
for pred, model_name in zip(model_predictions, model_names)
]
return Comparison(model_evaluations)
def _get_and_supplement_model_names(model_evaluations: List[Evaluation]):
return [
evaluation.model_name
if evaluation.model_name is not None
else f"model_{i_model}"
for i_model, evaluation in enumerate(model_evaluations)
]
def _model_evaluations_to_data_frame(
model_evaluations: List[Evaluation]
) -> pd.DataFrame:
quantity_names = [q.name for q in model_evaluations[0].quantities]
# create one row per quantity
data = []
for i_q, quantity_name in enumerate(quantity_names):
row = [quantity_name]
for evaluation in model_evaluations:
quantity = evaluation.quantities[i_q]
assert_that(quantity.name).is_equal_to(quantity_name)
row.append(quantity.value)
data.append(row)
model_names = _get_and_supplement_model_names(model_evaluations)
return pd.DataFrame(data, columns=["Quantity"] + model_names)
def _check_consistency(model_evaluations: List[Evaluation]):
if len(model_evaluations) == 0:
return
first = model_evaluations[0]
for evaluation in model_evaluations:
assert_that(evaluation.primary_metric).is_equal_to(first.primary_metric)
assert_that(len(evaluation.quantities)).is_equal_to(len(first.quantities))
for q, q_first in zip(evaluation.quantities, first.quantities):
# check that everything except the value is equal
assert_that(q.name).is_equal_to(q_first.name)
assert_that(q.higher_is_better).is_equal_to(q_first.higher_is_better)
assert_that(q.description).is_equal_to(q_first.description)
not_none_model_names = [
ms.model_name for ms in model_evaluations if ms.model_name is not None
]
assert_that(not_none_model_names).does_not_contain_duplicates()
good_color = "#b2ffb2"
def _highlight_max(data):
"""Highlights the maximum in a Series or DataFrame.
    Check out http://pandas.pydata.org/pandas-docs/stable/style.html for cool stuff.
"""
attr = "background-color: {}".format(good_color)
if data.ndim == 1: # Series from .apply(axis=0) or axis=1
is_max = data == data.max()
# noinspection PyTypeChecker
return [attr if v else "" for v in is_max]
else: # from .apply(axis=None)
is_max = data == data.max().max()
return pd.DataFrame(
np.where(is_max, attr, ""), index=data.index, columns=data.columns
)
def _highlight_min(data):
"""Highlights the minimum in a Series or DataFrame.
    Check out http://pandas.pydata.org/pandas-docs/stable/style.html for cool stuff.
"""
attr = "background-color: {}".format(good_color)
if data.ndim == 1: # Series from .apply(axis=0) or axis=1
is_min = data == data.min()
# noinspection PyTypeChecker
return [attr if v else "" for v in is_min]
else: # from .apply(axis=None)
is_min = data == data.min().min()
return pd.DataFrame(
np.where(is_min, attr, ""), index=data.index, columns=data.columns
)
def _display_comparison_table(
model_evaluations: List[Evaluation], include_spacer: bool
):
_check_consistency(model_evaluations)
primary_metric = model_evaluations[0].primary_metric
n_models = len(model_evaluations)
scores_data_frame = _model_evaluations_to_data_frame(
[
evaluation.filtered(keep_higher_is_better=True)
for evaluation in model_evaluations
]
)
losses_data_frame = _model_evaluations_to_data_frame(
[
evaluation.filtered(keep_lower_is_better=True)
for evaluation in model_evaluations
]
)
neutral_data_frame = _model_evaluations_to_data_frame(
[
evaluation.filtered(keep_neutral_quantities=True)
for evaluation in model_evaluations
]
)
    def is_primary_metric(a_metric: str):
        if primary_metric is None:
            return False
        return a_metric.lower() == primary_metric.lower()
def highlight_primary_metric(data):
attr = "font-weight: bold; font-size: 120%;"
if data.ndim == 1:
metric = data[0].lower()
if is_primary_metric(metric):
return [attr for v in data]
else:
return ["" for v in data]
else: # from .apply(axis=None)
good_things = np.ones_like(data).astype(bool)
return pd.DataFrame(
np.where(good_things, "", ""), index=data.index, columns=data.columns
)
def display_stylish_table(df: pd.DataFrame, highlight_fn=None):
df_styled = df.style.set_properties(width="400px").format(_format_numbers)
df_styled = df_styled.apply(highlight_primary_metric, axis=1)
if highlight_fn is None:
display(df_styled)
else:
display(df_styled.apply(highlight_fn, axis=1, subset=df.columns[1:]))
# increase usable Jupyter notebook width when comparing many models
if n_models > 3:
# noinspection PyTypeChecker
display(HTML(""))
if len(scores_data_frame):
# noinspection PyTypeChecker
display(Markdown("## Scores (higher is better)"))
display_stylish_table(
scores_data_frame, _highlight_max if n_models > 1 else None
)
if len(losses_data_frame):
# noinspection PyTypeChecker
display(Markdown("## Losses (lower is better)"))
display_stylish_table(
losses_data_frame, _highlight_min if n_models > 1 else None
)
if len(neutral_data_frame):
# noinspection PyTypeChecker
display(Markdown("## Other Quantities"))
display_stylish_table(neutral_data_frame)
# hide DataFrame indices
# noinspection PyTypeChecker
display(
HTML(
"""
"""
)
)
# TODO check figure consistency
# tell bokeh to output to notebook
plotting.output_notebook()
# show rows of figures
for i_showable, _ in enumerate(model_evaluations[0].figures):
row_of_figures = [
evaluation.figures[i_showable]
for i_model, evaluation in enumerate(model_evaluations)
]
if include_spacer:
row_of_figures = [Spacer()] + row_of_figures
plotting.show(bokeh.layouts.row(row_of_figures, sizing_mode="scale_width"))
def _format_numbers(entry):
try:
flt = float(entry)
return "{:.3f}".format(flt)
except ValueError:
return entry
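# ---------------------------------------------------------------------------
# Usage sketch, not part of the library: `_ToyAccuracyEvaluator` is a
# hypothetical Evaluator defined only for illustration. It shows how a
# Comparator turns several model predictions into a single Comparison.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from metriculous._evaluation import Quantity

    class _ToyAccuracyEvaluator(Evaluator):
        def evaluate(self, ground_truth, model_prediction, model_name, sample_weights=None):
            # fraction of matching entries as a stand-in for a real metric suite
            accuracy = float(np.mean(np.asarray(ground_truth) == np.asarray(model_prediction)))
            return Evaluation(
                model_name=model_name,
                quantities=[Quantity("Accuracy", accuracy, higher_is_better=True)],
            )

    comparison = Comparator(evaluator=_ToyAccuracyEvaluator()).compare(
        ground_truth=[0, 1, 1, 0],
        model_predictions=[[0, 1, 1, 1], [0, 0, 1, 0]],
        model_names=["model_a", "model_b"],
    )
    # print the tabular view instead of calling display(), which needs a notebook
    print(_model_evaluations_to_data_frame(comparison.evaluations))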
# metriculous/_evaluation.py
"""
This module contains data types and interfaces that are used throughout the library.
Here we do not make any assumptions about the structure of ground truth and predictions.
"""
from dataclasses import dataclass
from dataclasses import field
from dataclasses import replace
from typing import Any, Sequence
from typing import List
from typing import Optional
from typing import Union
from bokeh.plotting import Figure
@dataclass(frozen=True)
class Quantity:
name: str
value: Union[float, str]
higher_is_better: Optional[bool] = None
description: Optional[str] = None
@dataclass(frozen=True)
class Evaluation:
model_name: str
quantities: List[Quantity] = field(default_factory=list)
figures: List[Figure] = field(default_factory=list)
primary_metric: Optional[str] = None
def get_by_name(self, quantity_name) -> Quantity:
# Number of quantities is usually small,
# so do not bother with internal dict for lookup
for q in self.quantities:
if quantity_name == q.name:
return q
raise ValueError(f"Could not find quantity named {quantity_name}")
def get_primary(self) -> Optional[Quantity]:
if self.primary_metric is None:
return None
return self.get_by_name(self.primary_metric)
def filtered(
self,
keep_higher_is_better=False,
keep_lower_is_better=False,
keep_neutral_quantities=False,
):
return replace(
self,
quantities=[
q
for q in self.quantities
if any(
[
(q.higher_is_better is True and keep_higher_is_better),
(q.higher_is_better is False and keep_lower_is_better),
(q.higher_is_better is None and keep_neutral_quantities),
]
)
],
)
class Evaluator:
"""
Interface to be implemented by the user to compute quantities and charts that are
relevant and applicable to the problem at hand.
"""
def evaluate(
self,
ground_truth: Any,
model_prediction: Any,
model_name: str,
sample_weights: Optional[Sequence[float]] = None,
) -> Evaluation:
"""Generates an Evaluation from ground truth and a model prediction."""
raise NotImplementedError
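# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the library: a minimal custom Evaluator for
# a regression setting. The class name and the choice of mean absolute error
# are assumptions made purely for demonstration of the interface above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np

    class MeanAbsoluteErrorEvaluator(Evaluator):
        def evaluate(self, ground_truth, model_prediction, model_name, sample_weights=None):
            errors = np.abs(np.asarray(ground_truth) - np.asarray(model_prediction))
            mae = float(np.average(errors, weights=sample_weights))
            return Evaluation(
                model_name=model_name,
                quantities=[Quantity("MAE", mae, higher_is_better=False)],
                primary_metric="MAE",
            )

    example_evaluation = MeanAbsoluteErrorEvaluator().evaluate(
        ground_truth=[1.0, 2.0, 3.0],
        model_prediction=[1.1, 1.9, 3.4],
        model_name="ToyRegressor",
    )
    print(example_evaluation.get_primary())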
# metriculous/_evaluation_test.py
import pytest
from metriculous import Quantity
@pytest.mark.parametrize("name", ["", "accuracy", "What Ever"])
@pytest.mark.parametrize("value", [-0.5, 0.0, 1e15])
@pytest.mark.parametrize("higher_is_better", [True, False])
@pytest.mark.parametrize("description", [None, "", "Quantifies the whateverness"])
def test_quantity(name, value, higher_is_better, description):
quantity = Quantity(name, value, higher_is_better, description)
quantity_ = Quantity(name, value, higher_is_better, description)
assert quantity == quantity_
# metriculous/evaluators/__init__.py
"""
This module provides various default Evaluator implementations that are useful for the
most common machine learning problems, such as classification and regression.
"""
from metriculous.evaluators._classification_evaluator import ClassificationEvaluator
from metriculous.evaluators._segmentation_evaluator import SegmentationEvaluator
__all__ = ["ClassificationEvaluator", "SegmentationEvaluator"]
# metriculous/evaluators/_classification_evaluator.py
from typing import Callable
from typing import Optional
from typing import Sequence
import numpy as np
from assertpy import assert_that
from scipy.stats import entropy
from sklearn import metrics as sklmetrics
from ._classification_utils import check_normalization
from .._evaluation import Evaluation
from .._evaluation import Evaluator
from .._evaluation import Quantity
from ..evaluators._classification_figures_bokeh import _bokeh_automation_rate_analysis
from ..evaluators._classification_figures_bokeh import _bokeh_confusion_matrix
from ..evaluators._classification_figures_bokeh import _bokeh_confusion_scatter
from ..evaluators._classification_figures_bokeh import _bokeh_output_histogram
from ..evaluators._classification_figures_bokeh import _bokeh_precision_recall_curve
from ..evaluators._classification_figures_bokeh import _bokeh_roc_curve
from ..metrics import top_n_accuracy
from ..utilities import sample_weights_simulating_class_distribution
class ClassificationEvaluator(Evaluator):
"""
Default Evaluator implementation that serves well for most classification problems.
"""
def __init__(
self,
class_names: Optional[Sequence[str]] = None,
one_vs_all_quantities=True,
one_vs_all_figures=False,
top_n_accuracies: Sequence[int] = (),
filter_quantities: Optional[Callable[[str], bool]] = None,
filter_figures: Optional[Callable[[str], bool]] = None,
primary_metric: Optional[str] = None,
simulated_class_distribution: Optional[Sequence[float]] = None,
class_label_rotation_x="horizontal",
class_label_rotation_y="vertical",
):
"""
Initializes the evaluator with the option to overwrite the default settings.
Args:
class_names:
Optional, names of the classes.
one_vs_all_quantities:
If `True` show quantities like "ROC AUC Class_i vs Rest" for all i.
one_vs_all_figures:
If `True` show figures like "ROC Curve Class_i vs Rest" for all i.
top_n_accuracies:
A sequence of positive integers to specify which top-N accuracy metrics
should be computed.
Example: `top_n_accuracies=[2, 3, 5, 10]`
filter_quantities:
Callable that receives a quantity name and returns `False` if the
quantity should be excluded.
Example: `filter_quantities=lambda name: "vs Rest" not in name`
filter_figures:
Callable that receives a figure title and returns `False` if the figure
should be excluded.
Example: `filter_figures=lambda name: "ROC" in name`
primary_metric:
Optional string to specify the most important metric that should be used
for model selection.
simulated_class_distribution:
Optional sequence of floats that indicates a hypothetical class
distribution on which models should be evaluated. If not `None`, sample
weights will be computed and used to simulate the desired class
distribution.
class_label_rotation_x:
Rotation of x-axis tick labels for figures with class name tick labels.
class_label_rotation_y:
Rotation of y-axis tick labels for figures with class name tick labels.
"""
self.class_names = class_names
self.one_vs_all_quantities = one_vs_all_quantities
self.one_vs_all_figures = one_vs_all_figures
self.top_n_accuracies = top_n_accuracies
assert all(isinstance(val, int) for val in self.top_n_accuracies)
assert all(val >= 1 for val in self.top_n_accuracies)
self.filter_quantities = (
(lambda name: True) if filter_quantities is None else filter_quantities
)
self.filter_figures = (
(lambda name: True) if filter_figures is None else filter_figures
)
self.primary_metric = primary_metric
if simulated_class_distribution is not None:
check_normalization(simulated_class_distribution, axis=0)
np.testing.assert_equal(
np.asarray(simulated_class_distribution) > 0.0, True
)
self.simulated_class_distribution = simulated_class_distribution
self.class_label_rotation_x = class_label_rotation_x
self.class_label_rotation_y = class_label_rotation_y
def evaluate(
self,
ground_truth: np.ndarray,
model_prediction: np.ndarray,
model_name: str,
sample_weights: Optional[Sequence[float]] = None,
) -> Evaluation:
"""
Computes Quantities and generates Figures that are useful for most
classification problems.
Args:
            model_prediction:
                2d array of predicted probabilities with one row per sample, where
                each row is a probability distribution over the classes.
ground_truth:
2d array with each row being a probability distribution.
model_name:
Name of the model that is being evaluated.
sample_weights:
Sequence of floats to modify the influence of individual samples on the
statistics that will be measured.
Returns:
An Evaluation object containing Quantities and Figures that are useful for
most classification problems.
"""
# === Preparations =============================================================
# give variables more specific names
y_pred_proba = model_prediction
y_true_proba = ground_truth
# and delete interface parameter names to avoid confusion
del model_prediction
del ground_truth
n_classes = y_true_proba.shape[1]
if self.class_names is None:
self.class_names = ["class_{}".format(i) for i in range(n_classes)]
assert len(self.class_names) == y_true_proba.shape[1]
# check shapes
assert np.ndim(y_true_proba) == 2
assert (
y_true_proba.shape == y_pred_proba.shape
), f"{y_true_proba.shape} != {y_pred_proba.shape}"
# check normalization
check_normalization(y_true_proba, axis=1)
check_normalization(y_pred_proba, axis=1)
np.testing.assert_equal(y_true_proba >= 0.0, True)
np.testing.assert_equal(y_pred_proba >= 0.0, True)
# compute non-probabilistic class decisions
y_true = np.argmax(y_true_proba, axis=1)
y_pred = np.argmax(y_pred_proba, axis=1)
# make one-hot arrays, which are required for some sklearn metrics
y_true_one_hot: np.ndarray = np.eye(n_classes)[y_true]
y_pred_one_hot: np.ndarray = np.eye(n_classes)[y_pred]
# process sample_weights parameter and self.simulated_class_distribution
if sample_weights is not None:
assert self.simulated_class_distribution is None, (
"Cannot use `sample_weights` with ClassificationEvaluator that was "
"initialized with `simulated_class_distribution`."
)
sample_weights = np.asarray(sample_weights)
assert_that(sample_weights.ndim).is_equal_to(1)
assert_that(sample_weights.shape).is_equal_to((len(y_pred),))
np.testing.assert_array_equal(sample_weights >= 0.0, True)
elif self.simulated_class_distribution is not None:
assert_that(np.shape(self.simulated_class_distribution)).is_equal_to(
(n_classes,)
)
sample_weights = sample_weights_simulating_class_distribution(
y_true=y_true,
hypothetical_class_distribution=self.simulated_class_distribution,
)
# === Quantities ===============================================================
# Note: Optimization potential here for problems with many classes.
# We are currently computing all quantities and then throwing away some of them,
# rather than only computing those that are requested by self.filter_quantities
quantities = [
q
for q in self._quantities(
y_pred,
y_pred_one_hot,
y_pred_proba,
y_true,
y_true_one_hot,
y_true_proba,
maybe_sample_weights=sample_weights,
)
if self.filter_quantities(q.name)
]
# === Figures ==================================================================
figures = self._figures(
model_name,
y_pred=y_pred,
y_pred_one_hot=y_pred_one_hot,
y_pred_proba=y_pred_proba,
y_true=y_true,
y_true_one_hot=y_true_one_hot,
y_true_proba=y_true_proba,
maybe_sample_weights=sample_weights,
)
return Evaluation(
quantities=quantities,
figures=figures,
model_name=model_name,
primary_metric=self.primary_metric,
)
def _figures(
self,
model_name: str,
y_pred: np.ndarray,
y_pred_one_hot: np.ndarray,
y_pred_proba: np.ndarray,
y_true: np.ndarray,
y_true_one_hot: np.ndarray,
y_true_proba: np.ndarray,
maybe_sample_weights: Optional[np.ndarray],
):
figures = []
# --- Histogram of predicted and ground truth classes ---
if maybe_sample_weights is None:
figure_name = "Class Distribution"
if self.filter_figures(figure_name):
figures.append(
_bokeh_output_histogram(
y_true=y_true,
y_pred=y_pred,
class_names=self.class_names,
title_rows=[model_name, figure_name],
sample_weights=None,
x_label_rotation=self.class_label_rotation_x,
)
)
else:
figure_name = "Unweighted Class Distribution"
if self.filter_figures(figure_name):
figures.append(
_bokeh_output_histogram(
y_true=y_true,
y_pred=y_pred,
class_names=self.class_names,
title_rows=[model_name, figure_name],
sample_weights=None,
x_label_rotation=self.class_label_rotation_x,
)
)
figure_name = "Weighted Class Distribution"
if self.filter_figures(figure_name):
figures.append(
_bokeh_output_histogram(
y_true=y_true,
y_pred=y_pred,
class_names=self.class_names,
title_rows=[model_name, figure_name],
sample_weights=maybe_sample_weights,
x_label_rotation=self.class_label_rotation_x,
)
)
# --- Confusion Scatter Plot ---
figure_name = "Confusion Scatter Plot"
if maybe_sample_weights is None and self.filter_figures(figure_name):
figures.append(
_bokeh_confusion_scatter(
y_true=y_true,
y_pred=y_pred,
class_names=self.class_names,
title_rows=[model_name, figure_name],
x_label_rotation=self.class_label_rotation_x,
y_label_rotation=self.class_label_rotation_y,
)
)
# --- Confusion Matrix ---
figure_name = "Confusion Matrix"
if maybe_sample_weights is None and self.filter_figures(figure_name):
figures.append(
_bokeh_confusion_matrix(
y_true=y_true,
y_pred=y_pred,
class_names=self.class_names,
title_rows=[model_name, figure_name],
x_label_rotation=self.class_label_rotation_x,
y_label_rotation=self.class_label_rotation_y,
)
)
# --- Automation Rate Analysis ---
figure_name = "Automation Rate Analysis"
if self.filter_figures(figure_name):
figures.append(
_bokeh_automation_rate_analysis(
y_target_one_hot=y_true_one_hot,
y_pred_proba=y_pred_proba,
title_rows=[model_name, figure_name],
sample_weights=maybe_sample_weights,
)
)
# --- ROC curves ---
if self.one_vs_all_figures:
for class_index, class_name in enumerate(self.class_names):
figure_name = f"ROC {class_name} vs Rest"
if self.filter_figures(figure_name):
figures.append(
_bokeh_roc_curve(
y_true_binary=(y_true == class_index),
y_pred_score=y_pred_proba[:, class_index],
title_rows=[model_name, figure_name],
sample_weights=maybe_sample_weights,
)
)
# --- PR curves ---
if self.one_vs_all_figures:
for class_index, class_name in enumerate(self.class_names):
figure_name = f"PR Curve {class_name} vs Rest"
if self.filter_figures(figure_name):
figures.append(
_bokeh_precision_recall_curve(
y_true_binary=(y_true == class_index),
y_pred_score=y_pred_proba[:, class_index],
title_rows=[model_name, figure_name],
sample_weights=maybe_sample_weights,
)
)
return figures
def _quantities(
self,
y_pred: np.ndarray,
y_pred_one_hot: np.ndarray,
y_pred_proba: np.ndarray,
y_true: np.ndarray,
y_true_one_hot: np.ndarray,
y_true_proba: np.ndarray,
maybe_sample_weights: Optional[np.ndarray],
):
quantities = []
quantities.append(
Quantity(
"Accuracy",
sklmetrics.accuracy_score(
y_true, y_pred, sample_weight=maybe_sample_weights
),
higher_is_better=True,
)
)
quantities.append(
Quantity(
"ROC AUC Macro Average",
sklmetrics.roc_auc_score(
y_true_one_hot,
y_pred_proba,
average="macro",
sample_weight=maybe_sample_weights,
),
higher_is_better=True,
)
)
quantities.append(
Quantity(
"ROC AUC Micro Average",
sklmetrics.roc_auc_score(
y_true_one_hot,
y_pred_proba,
average="micro",
sample_weight=maybe_sample_weights,
),
higher_is_better=True,
)
)
quantities.append(
Quantity(
"F1-Score Macro Average",
sklmetrics.f1_score(
y_true_one_hot,
y_pred_one_hot,
average="macro",
sample_weight=maybe_sample_weights,
),
higher_is_better=True,
)
)
quantities.append(
Quantity(
"F1-Score Micro Average",
sklmetrics.f1_score(
y_true_one_hot,
y_pred_one_hot,
average="micro",
sample_weight=maybe_sample_weights,
),
higher_is_better=True,
)
)
# --- Top-N accuracies ---
for n in self.top_n_accuracies:
quantities.append(
Quantity(
f"Top-{n} Accuracy",
value=top_n_accuracy(
y_true, y_pred_proba, n=n, sample_weights=maybe_sample_weights
),
higher_is_better=True,
)
)
# --- One-vs-rest ROC AUC scores ---
if self.one_vs_all_quantities:
# noinspection PyTypeChecker
roc_auc_scores: Sequence[float] = sklmetrics.roc_auc_score(
y_true_one_hot,
y_pred_proba,
average=None,
sample_weight=maybe_sample_weights,
)
for class_index, class_name in enumerate(self.class_names):
quantities.append(
Quantity(
f"ROC AUC {class_name} vs Rest",
value=roc_auc_scores[class_index],
higher_is_better=True,
)
)
# --- One-vs-rest average precision scores ---
if self.one_vs_all_quantities:
# noinspection PyTypeChecker
ap_scores: Sequence[float] = sklmetrics.average_precision_score(
y_true_one_hot,
y_pred_proba,
average=None,
sample_weight=maybe_sample_weights,
)
for class_index, class_name in enumerate(self.class_names):
quantities.append(
Quantity(
f"Average Precision {class_name} vs Rest",
value=ap_scores[class_index],
higher_is_better=True,
)
)
# --- One-vs-rest F1-scores ---
if self.one_vs_all_quantities:
f1_scores = sklmetrics.f1_score(
y_true_one_hot,
y_pred_one_hot,
average=None,
sample_weight=maybe_sample_weights,
)
for class_index, class_name in enumerate(self.class_names):
quantities.append(
Quantity(
f"F1-Score {class_name} vs Rest",
value=f1_scores[class_index],
higher_is_better=True,
)
)
# --- KL-divergence ---
# keep in mind entropy(p, q) != entropy(q, p)
kl_divergences = np.array(
[
entropy(pk=true_dist, qk=pred_dist)
for true_dist, pred_dist in zip(y_true_proba, y_pred_proba)
]
)
quantities.append(
Quantity(
"Mean KLD(P=target||Q=prediction)",
np.average(kl_divergences, weights=maybe_sample_weights),
higher_is_better=False,
)
)
# --- Log loss ---
quantities.append(
Quantity(
"Log Loss",
sklmetrics.log_loss(
y_true_one_hot, y_pred_one_hot, sample_weight=maybe_sample_weights
),
higher_is_better=False,
)
)
# --- Brier score loss ---
# Be careful with sklmetrics.brier_score_loss, it deviates from Brier's
# definition for multi-class problems.
# See https://stats.stackexchange.com/questions
# /403544/how-to-compute-the-brier-score-for-more-than-two-classes
# and Wikipedia
# noinspection PyTypeChecker
quantities.append(
Quantity(
"Brier Score Loss",
np.mean((y_pred_proba - y_true_one_hot) ** 2),
higher_is_better=False,
)
)
# noinspection PyTypeChecker
quantities.append(
Quantity(
"Brier Score Loss (Soft Targets)",
np.mean((y_pred_proba - y_true_proba) ** 2),
higher_is_better=False,
)
)
# --- entropy of prediction probability distributions ---
entropies_pred = np.array([entropy(proba_dist) for proba_dist in y_pred_proba])
quantities.append(Quantity("Max Entropy", entropies_pred.max()))
quantities.append(
Quantity(
"Mean Entropy", np.average(entropies_pred, weights=maybe_sample_weights)
)
)
quantities.append(Quantity("Min Entropy", entropies_pred.min()))
quantities.append(Quantity("Max Probability", y_pred_proba.max()))
quantities.append(Quantity("Min Probability", y_pred_proba.min()))
return quantities
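# ---------------------------------------------------------------------------
# Usage sketch, not part of the library: evaluates a random, row-normalized
# prediction against random one-hot targets. All data below is made up for
# illustration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.RandomState(seed=0)
    targets_one_hot = np.eye(3)[rng.randint(0, 3, size=100)]
    raw_scores = rng.random_sample(size=(100, 3))
    predicted_proba = raw_scores / raw_scores.sum(axis=1, keepdims=True)

    example_evaluation = ClassificationEvaluator(top_n_accuracies=[2]).evaluate(
        ground_truth=targets_one_hot,
        model_prediction=predicted_proba,
        model_name="RandomModel",
    )
    for quantity in example_evaluation.quantities:
        print(f"{quantity.name}: {quantity.value}")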
# metriculous/evaluators/_classification_evaluator_test.py
from dataclasses import replace
from typing import Optional, List
import numpy as np
import pytest
from .._evaluation import Evaluation
from .._evaluation import Quantity
from ..evaluators import ClassificationEvaluator
from ..test_resources import noisy_prediction
def random_targets_one_hot(num_classes: int, num_samples: int) -> np.ndarray:
target_class_indices = np.random.randint(0, high=num_classes, size=num_samples)
return np.eye(num_classes)[target_class_indices]
@pytest.mark.parametrize("noise_factor, num_samples", [(0.1, 100), (10.0, 200)])
@pytest.mark.parametrize(
"classes, simulated_class_distribution",
[
(None, None),
(["Cat", "Dog", "Lion"], [0.01, 0.02, 0.97]),
(["Cat", "Dog", "Lion"], None),
(["Spam", "Ham"], [0.2, 0.8]),
],
)
@pytest.mark.parametrize(
argnames=(
"one_vs_all_quantities,"
"one_vs_all_figures,"
"top_n_accuracies,"
"filter_quantities,"
"primary_metric,"
),
argvalues=zip(
[None, False, True],
[True, None, False],
[(4,), [], [2, 3, 42]],
[None, lambda name: "a" in name, lambda name: False],
["Accuracy", None, None],
),
)
@pytest.mark.parametrize("use_sample_weights", [False, True])
def test_ClassificationEvaluator(
noise_factor: float,
    simulated_class_distribution: Optional[List[float]],
num_samples: int,
classes: Optional[List[str]],
one_vs_all_quantities: Optional[bool],
one_vs_all_figures: Optional[bool],
top_n_accuracies: Optional[List[int]],
filter_quantities: callable,
primary_metric: Optional[str],
use_sample_weights: bool,
):
"""Basic smoke test making sure we don't crash with valid input."""
np.random.seed(42)
targets_one_hot = random_targets_one_hot(
num_classes=len(classes) if classes is not None else 3, num_samples=num_samples
)
prediction = noisy_prediction(targets_one_hot, noise_factor=noise_factor)
ce = ClassificationEvaluator(
class_names=classes,
one_vs_all_quantities=one_vs_all_quantities,
one_vs_all_figures=one_vs_all_figures,
top_n_accuracies=top_n_accuracies,
filter_quantities=filter_quantities,
primary_metric=primary_metric,
simulated_class_distribution=(
None if use_sample_weights else simulated_class_distribution
),
)
evaluation = ce.evaluate(
ground_truth=targets_one_hot,
model_prediction=prediction,
model_name="MockModel",
sample_weights=(
42.0 * np.random.random(size=num_samples)
if use_sample_weights is True
else None
),
)
assert isinstance(evaluation, Evaluation)
assert evaluation.model_name == "MockModel"
@pytest.mark.parametrize("num_samples", [100, 200, 999])
@pytest.mark.parametrize(
"use_sample_weights, simulated_class_distribution",
[(False, None), (False, [0.3, 0.5, 0.2]), (True, None)],
)
def test_ClassificationEvaluator_perfect_prediction(
num_samples, use_sample_weights: bool, simulated_class_distribution: List[float]
):
np.random.seed(42)
targets_one_hot = random_targets_one_hot(num_classes=3, num_samples=num_samples)
prediction = noisy_prediction(targets_one_hot, noise_factor=0.0)
ce = ClassificationEvaluator(
simulated_class_distribution=simulated_class_distribution
)
evaluation = ce.evaluate(
ground_truth=targets_one_hot,
model_prediction=prediction,
model_name="MockModel",
sample_weights=(
42.0 * np.random.random(size=num_samples)
if use_sample_weights is True
else None
),
)
assert isinstance(evaluation, Evaluation)
assert evaluation.model_name == "MockModel"
expected_quantities = [
Quantity(name="Accuracy", value=1.0, higher_is_better=True, description=None),
Quantity(
name="ROC AUC Macro Average",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="ROC AUC Micro Average",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="F1-Score Macro Average",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="F1-Score Micro Average",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="ROC AUC class_0 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="ROC AUC class_1 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="ROC AUC class_2 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="Average Precision class_0 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="Average Precision class_1 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="Average Precision class_2 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="F1-Score class_0 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="F1-Score class_1 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="F1-Score class_2 vs Rest",
value=1.0,
higher_is_better=True,
description=None,
),
Quantity(
name="Mean KLD(P=target||Q=prediction)",
value=0.0,
higher_is_better=False,
description=None,
),
Quantity(
name="Log Loss",
value=2.1094237467877998e-15,
higher_is_better=False,
description=None,
),
Quantity(
name="Brier Score Loss", value=0.0, higher_is_better=False, description=None
),
Quantity(
name="Brier Score Loss (Soft Targets)",
value=0.0,
higher_is_better=False,
description=None,
),
Quantity(
name="Max Entropy", value=0.0, higher_is_better=None, description=None
),
Quantity(
name="Mean Entropy", value=0.0, higher_is_better=None, description=None
),
Quantity(
name="Min Entropy", value=0.0, higher_is_better=None, description=None
),
Quantity(
name="Max Probability", value=1.0, higher_is_better=None, description=None
),
Quantity(
name="Min Probability", value=0.0, higher_is_better=None, description=None
),
]
assert len(evaluation.quantities) == len(expected_quantities)
for actual, expected in zip(evaluation.quantities, expected_quantities):
# check that everything except value is equal
assert replace(actual, value=42) == replace(expected, value=42)
# check that values are approximately equal
if isinstance(expected.value, str):
            assert isinstance(actual.value, str)
assert actual.value == expected.value
else:
assert isinstance(expected.value, float)
assert isinstance(actual.value, float)
np.testing.assert_allclose(actual.value, expected.value)
@pytest.mark.parametrize("num_samples", [100, 200])
@pytest.mark.parametrize(
"quantity_filter",
[
lambda name: False,
lambda name: True,
lambda name: "F1" not in name,
lambda name: "vs Rest" not in name,
],
)
def test_ClassificationEvaluator_filter_quantities(num_samples, quantity_filter):
np.random.seed(42)
targets_one_hot = random_targets_one_hot(num_classes=3, num_samples=num_samples)
prediction = noisy_prediction(targets_one_hot, noise_factor=0.0)
ce_all = ClassificationEvaluator()
ce_filtering = ClassificationEvaluator(filter_quantities=quantity_filter)
evaluation_all = ce_all.evaluate(
ground_truth=targets_one_hot,
model_prediction=prediction,
model_name="MockModel",
)
evaluation_filtered = ce_filtering.evaluate(
ground_truth=targets_one_hot,
model_prediction=prediction,
model_name="MockModel",
)
# assert all equal except quantities
# (ignore figures as they do not support equality in the way we need it)
assert replace(evaluation_all, quantities=[], figures=[]) == replace(
evaluation_filtered, quantities=[], figures=[]
)
for quantity in evaluation_all.quantities:
if quantity_filter(quantity.name):
same_quantity = evaluation_filtered.get_by_name(quantity.name)
assert same_quantity == quantity
else:
with pytest.raises(ValueError):
evaluation_filtered.get_by_name(quantity.name)
for filtered_quantity in evaluation_filtered.quantities:
same_quantity = evaluation_all.get_by_name(filtered_quantity.name)
assert same_quantity == filtered_quantity
@pytest.mark.parametrize("num_samples", [100, 200])
@pytest.mark.parametrize(
"desired_number_of_figures, figure_filter",
[
(0, lambda name: False),
(10, None),
(10, lambda name: True),
(9, lambda name: "Distribution" not in name),
(4, lambda name: "vs Rest" not in name),
],
)
def test_ClassificationEvaluator_filter_figures(
num_samples: int, desired_number_of_figures: int, figure_filter: callable
):
np.random.seed(42)
targets_one_hot = random_targets_one_hot(num_classes=3, num_samples=num_samples)
prediction = noisy_prediction(targets_one_hot, noise_factor=0.0)
ce_all = ClassificationEvaluator(one_vs_all_figures=True)
ce_filtering = ClassificationEvaluator(
one_vs_all_figures=True, filter_figures=figure_filter
)
evaluation_all = ce_all.evaluate(
ground_truth=targets_one_hot,
model_prediction=prediction,
model_name="MockModel",
)
evaluation_filtered = ce_filtering.evaluate(
ground_truth=targets_one_hot,
model_prediction=prediction,
model_name="MockModel",
)
# assert all equal except figures
assert replace(evaluation_all, figures=[]) == replace(
evaluation_filtered, figures=[]
)
# check number of figures
assert len(evaluation_filtered.figures) == desired_number_of_figures
@pytest.mark.parametrize("num_samples", [100, 200])
def test_ClassificationEvaluator_exception_when_passing_distribution_and_weights(
num_samples: int
):
"""
Checks that an exception is raised when `sample_weights` are passed to an evaluator
that has been initialized with `simulated_class_distribution`.
"""
np.random.seed(42)
targets_one_hot = random_targets_one_hot(num_classes=3, num_samples=num_samples)
prediction = noisy_prediction(targets_one_hot, noise_factor=0.0)
ce = ClassificationEvaluator(
one_vs_all_figures=True, simulated_class_distribution=[0.3, 0.1, 0.6]
)
_ = ce.evaluate(
ground_truth=targets_one_hot,
model_prediction=prediction,
model_name="MockModel",
)
with pytest.raises(AssertionError) as exception_info:
_ = ce.evaluate(
ground_truth=targets_one_hot,
model_prediction=prediction,
model_name="MockModel",
sample_weights=np.random.random(size=len(targets_one_hot)),
)
assert str(exception_info.value) == (
"Cannot use `sample_weights` with ClassificationEvaluator that"
" was initialized with `simulated_class_distribution`."
)
# metriculous/evaluators/_classification_figures_bokeh.py
from typing import Sequence, Optional
import numpy as np
from assertpy import assert_that
from bokeh import plotting
from bokeh.models import (
ColumnDataSource,
LinearColorMapper,
ColorBar,
BasicTicker,
PrintfTickFormatter,
)
from bokeh.models import HoverTool
from bokeh.models import Title
from bokeh.plotting import Figure
from sklearn import metrics as sklmetrics
from sklearn.metrics import accuracy_score
from metriculous.evaluators._classification_utils import check_normalization
TOOLS = "pan,box_zoom,reset"
TOOLBAR_LOCATION = "right"
FONT_SIZE = "8pt"
def _bokeh_output_histogram(
y_true: np.ndarray,
y_pred: np.ndarray,
class_names: Sequence[str],
title_rows: Sequence[str],
sample_weights: Optional[np.ndarray] = None,
x_label_rotation="horizontal",
) -> Figure:
"""
    Creates a histogram comparing the predicted and ground truth class distributions.
Args:
y_true:
1d integer array indicating the reference labels.
y_pred:
1d integer array indicating the predictions.
class_names:
Sequence of strings corresponding to the classes.
title_rows:
Sequence of strings to be used for the chart title.
sample_weights:
Sequence of floats to modify the influence of individual samples.
x_label_rotation:
Rotation of the class name labels.
Returns:
A bokeh figure
"""
n = len(class_names)
assert_that(np.shape(y_true)).is_equal_to(np.shape(y_pred))
if sample_weights is None:
sample_weights = np.ones_like(y_true)
assert_that(np.shape(y_true)).is_equal_to(np.shape(sample_weights))
p = plotting.figure(
x_range=class_names,
plot_height=350,
plot_width=350,
tools=TOOLS,
toolbar_location=TOOLBAR_LOCATION,
)
bins = np.arange(0, n + 1, 1)
normalize = not np.allclose(sample_weights, 1.0)
# class distribution in prediction
p.vbar(
x=class_names,
top=np.histogram(y_pred, bins=bins, weights=sample_weights, density=normalize)[
0
],
width=0.85,
alpha=0.6,
legend="Prediction",
)
# class distribution in ground truth
p.vbar(
x=class_names,
top=np.histogram(y_true, bins=bins, weights=sample_weights, density=normalize)[
0
],
width=0.85,
alpha=0.6,
legend="Ground Truth",
fill_color=None,
line_color="black",
line_width=2.5,
)
_add_title_rows(p, title_rows)
_apply_default_style(p)
p.yaxis.axis_label = "Fraction of Instances" if normalize else "Number of Instances"
p.xaxis.major_label_orientation = x_label_rotation
p.xgrid.grid_line_color = None
# prevent panning to empty regions
p.x_range.bounds = (-0.5, 0.5 + len(class_names))
return p
def _bokeh_confusion_matrix(
y_true: np.ndarray,
y_pred: np.ndarray,
class_names: Sequence[str],
title_rows: Sequence[str],
x_label_rotation="horizontal",
y_label_rotation="vertical",
) -> Figure:
"""
Creates a confusion matrix heatmap.
Args:
y_true:
1d integer array indicating the reference labels.
y_pred:
1d integer array indicating the predictions.
class_names:
Sequence of strings corresponding to the classes.
title_rows:
Sequence of strings to be used for the chart title.
x_label_rotation:
Rotation of the x-axis class name labels.
y_label_rotation:
Rotation of the y-axis class name labels.
Returns:
A bokeh figure
"""
cm = sklmetrics.confusion_matrix(y_true, y_pred)
cm_normalized = cm.astype("float") / cm.sum()
cm_normalized_by_pred = cm.astype("float") / cm.sum(axis=0, keepdims=True)
cm_normalized_by_true = cm.astype("float") / cm.sum(axis=1, keepdims=True)
predicted = list()
actual = list()
count = list()
normalized = list()
normalized_by_pred = list()
normalized_by_true = list()
for i, i_class in enumerate(class_names):
for j, j_class in enumerate(class_names):
predicted.append(j_class)
actual.append(i_class)
count.append(cm[i, j])
normalized.append(cm_normalized[i, j])
normalized_by_pred.append(cm_normalized_by_pred[i, j])
normalized_by_true.append(cm_normalized_by_true[i, j])
source = ColumnDataSource(
data={
"predicted": predicted,
"actual": actual,
"count": count,
"normalized": normalized,
"normalized_by_true": normalized_by_true,
"normalized_by_pred": normalized_by_pred,
}
)
p = plotting.figure(tools=TOOLS, x_range=class_names, y_range=class_names)
mapper = LinearColorMapper(palette="Viridis256", low=0.0, high=1.0)
p.rect(
x="actual",
y="predicted",
width=0.95,
height=0.95,
source=source,
fill_color={"field": "normalized_by_true", "transform": mapper},
line_width=0,
line_color="black",
)
p.xaxis.axis_label = "Ground Truth"
p.yaxis.axis_label = "Prediction"
p.xaxis.major_label_orientation = x_label_rotation
p.yaxis.major_label_orientation = y_label_rotation
p.add_tools(
HoverTool(
tooltips=[
("Predicted", "@predicted"),
("Ground truth", "@actual"),
("Count", "@count"),
("Normalized", "@normalized"),
("Normalized by prediction", "@normalized_by_pred"),
("Normalize by ground truth", "@normalized_by_true"),
]
)
)
color_bar = ColorBar(
color_mapper=mapper,
major_label_text_font_size=FONT_SIZE,
ticker=BasicTicker(desired_num_ticks=10),
formatter=PrintfTickFormatter(format="%.1f"),
label_standoff=5,
border_line_color=None,
location=(0, 0),
)
p.add_layout(color_bar, "right")
_add_title_rows(p, title_rows)
_apply_default_style(p)
return p
def _bokeh_confusion_scatter(
y_true: np.ndarray,
y_pred: np.ndarray,
class_names: Sequence[str],
title_rows: Sequence[str],
x_label_rotation="horizontal",
y_label_rotation="vertical",
) -> Figure:
"""
Creates a scatter plot that contains the same information as a confusion matrix.
Args:
y_true:
1d integer array indicating the reference labels.
y_pred:
1d integer array indicating the predictions.
class_names:
Sequence of strings corresponding to the classes.
title_rows:
Sequence of strings to be used for the chart title.
x_label_rotation:
Rotation of the x-axis class name labels.
y_label_rotation:
Rotation of the y-axis class name labels.
Returns:
A bokeh figure
"""
if len(y_true) != len(y_pred):
raise ValueError("y_true and y_pred must have the same length!")
p = plotting.figure(
x_range=(-0.5, -0.5 + len(class_names)),
y_range=(-0.5, -0.5 + len(class_names)),
plot_height=350,
plot_width=350,
tools=TOOLS,
toolbar_location=TOOLBAR_LOCATION,
match_aspect=True,
)
def noise():
return (np.random.beta(1, 1, size=len(y_true)) - 0.5) * 0.6
p.scatter(x=y_true + noise(), y=y_pred + noise(), alpha=0.6)
_add_title_rows(p, title_rows)
_apply_default_style(p)
p.xaxis.axis_label = "Ground Truth"
p.yaxis.axis_label = "Prediction"
arange = np.arange(len(class_names))
p.xaxis.ticker = arange
p.yaxis.ticker = arange
p.xaxis.major_label_overrides = {i: name for i, name in enumerate(class_names)}
p.yaxis.major_label_overrides = {i: name for i, name in enumerate(class_names)}
p.xaxis.major_label_orientation = x_label_rotation
p.yaxis.major_label_orientation = y_label_rotation
# grid between classes, not at classes
p.xgrid.ticker = arange[0:-1] + 0.5
p.ygrid.ticker = arange[0:-1] + 0.5
p.xgrid.grid_line_width = 3
p.ygrid.grid_line_width = 3
# prevent panning to empty regions
p.x_range.bounds = (-0.5, -0.5 + len(class_names))
p.y_range.bounds = (-0.5, -0.5 + len(class_names))
return p
def _bokeh_roc_curve(
y_true_binary: np.ndarray,
y_pred_score: np.ndarray,
title_rows: Sequence[str],
sample_weights: Optional[np.ndarray],
) -> Figure:
"""Plots an interactive receiver operator characteristic (ROC) curve.
Args:
y_true_binary:
An array of zeros and ones.
y_pred_score:
A continuous value, such as a probability estimate for the positive class.
title_rows:
Sequence of strings to be used for the chart title.
sample_weights:
Sequence of floats to modify the influence of individual samples.
Returns:
A bokeh figure
"""
assert y_true_binary.shape == y_pred_score.shape
assert set(y_true_binary).issubset({0, 1}) or set(y_true_binary).issubset(
{False, True}
)
assert np.ndim(y_true_binary) == 1
fpr, tpr, thresholds = sklmetrics.roc_curve(
y_true=y_true_binary, y_score=y_pred_score, sample_weight=sample_weights
)
source = ColumnDataSource(
data={"FPR": fpr, "TPR": tpr, "threshold": thresholds, "specificity": 1.0 - fpr}
)
p = plotting.figure(
plot_height=400,
plot_width=350,
tools=TOOLS,
toolbar_location=TOOLBAR_LOCATION,
# toolbar_location=None, # hides entire toolbar
match_aspect=True,
)
p.background_fill_color = "#f5f5f5"
p.grid.grid_line_color = "white"
p.xaxis.axis_label = "FPR"
p.yaxis.axis_label = "TPR"
_add_title_rows(p, title_rows)
_apply_default_style(p)
curve = p.line(x="FPR", y="TPR", line_width=2, color="#326496", source=source)
p.line(
x=[0.0, 1.0], y=[0.0, 1.0], line_alpha=0.75, color="grey", line_dash="dotted"
)
p.add_tools(
HoverTool(
# make sure there is no tool tip for the diagonal baseline
renderers=[curve],
tooltips=[
("TPR", "@TPR"),
("FPR", "@FPR"),
("Sensitivity", "@TPR"),
("Specificity", "@specificity"),
("Threshold", "@threshold"),
],
# display a tooltip whenever the cursor is vertically in line with a glyph
mode="vline",
)
)
return p
def _bokeh_precision_recall_curve(
y_true_binary: np.ndarray,
y_pred_score: np.ndarray,
title_rows: Sequence[str],
    sample_weights: Optional[np.ndarray],
) -> Figure:
"""
Plots an interactive precision recall curve.
Args:
y_true_binary:
An array of zeros and ones.
y_pred_score:
A continuous value, such as a probability estimate for the positive class.
title_rows:
Sequence of strings to be used for the chart title.
sample_weights:
Sequence of floats to modify the influence of individual samples.
Returns:
A bokeh figure
"""
assert y_true_binary.shape == y_pred_score.shape
assert set(y_true_binary).issubset({0, 1}) or set(y_true_binary).issubset(
{False, True}
)
assert np.ndim(y_true_binary) == 1
# Note: len(thresholds) == len(precision) - 1
# The last precision recall pair does not have a corresponding threshold.
precision, recall, thresholds = sklmetrics.precision_recall_curve(
y_true=y_true_binary, probas_pred=y_pred_score, sample_weight=sample_weights
)
precision = precision[:-1]
recall = recall[:-1]
p = plotting.figure(
plot_height=400,
plot_width=350,
x_range=(-0.05, 1.05),
y_range=(-0.05, 1.05),
tools=TOOLS,
toolbar_location=TOOLBAR_LOCATION,
# match_aspect=True,
)
source = ColumnDataSource(
data={"precision": precision, "recall": recall, "threshold": thresholds}
)
# reminder: tpr == recall == sensitivity
p.line(x="recall", y="precision", line_width=2, source=source)
_add_title_rows(p, title_rows)
_apply_default_style(p)
p.xaxis.axis_label = "Recall"
p.yaxis.axis_label = "Precision"
p.add_tools(
HoverTool(
tooltips=[
("Precision", "@precision"),
("Recall", "@recall"),
("Threshold", "@threshold"),
],
# display a tooltip whenever the cursor is vertically in line with a glyph
mode="vline",
)
)
return p
def _bokeh_automation_rate_analysis(
y_target_one_hot: np.ndarray,
y_pred_proba: np.ndarray,
title_rows: Sequence[str],
sample_weights: Optional[np.ndarray],
) -> Figure:
"""
Plots various quantities over automation rate, where a single probability threshold
is used for all classes to decide if we are confident enough to automate the
classification.
Args:
y_target_one_hot:
Array with one-hot encoded ground truth, shape(n_samples, n_classes).
y_pred_proba:
Array with estimated probability distributions, shape(n_samples, n_classes).
title_rows:
Sequence of strings to be used for the chart title.
sample_weights:
Sequence of floats to modify the influence of individual samples.
Returns:
A bokeh figure
"""
# ----- check input -----
assert y_target_one_hot.ndim == 2
assert y_pred_proba.ndim == 2
assert (
y_target_one_hot.shape == y_pred_proba.shape
), f"{y_target_one_hot.shape} != {y_pred_proba.shape}"
check_normalization(y_target_one_hot, axis=1)
check_normalization(y_pred_proba, axis=1)
assert set(y_target_one_hot.ravel()) == {0, 1}, set(y_target_one_hot.ravel())
if sample_weights is None:
sample_weights = np.ones(len(y_target_one_hot))
assert_that(sample_weights.shape).is_equal_to((len(y_target_one_hot),))
# ----- compute chart data -----
y_target = y_target_one_hot.argmax(axis=1)
argmaxes = y_pred_proba.argmax(axis=1)
maxes = y_pred_proba.max(axis=1)
assert isinstance(maxes, np.ndarray) # making IntelliJ's type checker happy
chart_data = {"automation_rate": [], "threshold": [], "accuracy": []}
for threshold in sorted(maxes):
automated = maxes >= threshold
chart_data["automation_rate"].append(
np.average(automated, weights=sample_weights)
)
chart_data["threshold"].append(threshold)
chart_data["accuracy"].append(
accuracy_score(
y_true=y_target[automated],
y_pred=argmaxes[automated],
sample_weight=sample_weights[automated],
)
)
# ----- bokeh plot -----
p = plotting.figure(
plot_height=400,
plot_width=350,
x_range=(-0.05, 1.05),
y_range=(-0.05, 1.05),
tools=TOOLS,
toolbar_location=TOOLBAR_LOCATION,
# match_aspect=True,
)
source = ColumnDataSource(
data={key: np.array(lst) for key, lst in chart_data.items()}
)
accuracy_line = p.line(
x="automation_rate",
y="accuracy",
line_width=2,
source=source,
legend="Accuracy",
)
p.line(
x="automation_rate",
y="threshold",
line_width=2,
color="grey",
source=source,
legend="Threshold",
)
# make sure something is visible if lines consist of just a single point
p.scatter(
x=source.data["automation_rate"][[0, -1]], y=source.data["accuracy"][[0, -1]]
)
p.scatter(
x=source.data["automation_rate"][[0, -1]],
y=source.data["threshold"][[0, -1]],
color="grey",
)
_add_title_rows(p, title_rows)
_apply_default_style(p)
p.xaxis.axis_label = "Automation Rate"
p.legend.location = "bottom_left"
p.add_tools(
HoverTool(
renderers=[accuracy_line],
tooltips=[
("Accuracy", "@accuracy"),
("Threshold", "@threshold"),
("Automation Rate", "@automation_rate"),
],
# display a tooltip whenever the cursor is vertically in line with a glyph
mode="vline",
)
)
return p
def _add_title_rows(p: Figure, title_rows: Sequence[str]):
for title_row in reversed(title_rows):
p.add_layout(
Title(text=title_row, text_font_size=FONT_SIZE, align="center"),
place="above",
)
def _apply_default_style(p: Figure):
p.background_fill_color = "#f5f5f5"
p.grid.grid_line_color = "white"
p.toolbar.logo = None
p.xaxis.axis_label_text_font_size = FONT_SIZE
p.yaxis.axis_label_text_font_size = FONT_SIZE
p.axis.axis_line_color = None
p.xaxis.major_tick_line_color = None # turn off x-axis major ticks
p.xaxis.minor_tick_line_color = None # turn off x-axis minor ticks
p.yaxis.major_tick_line_color = None # turn off y-axis major ticks
p.yaxis.minor_tick_line_color = None # turn off y-axis minor ticks
p.axis.major_label_standoff = 0
if p.legend:
p.legend.label_text_font_size = FONT_SIZE
p.legend.background_fill_alpha = 0.85
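# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the library: renders the confusion matrix
# helper above for a tiny hand-made prediction and writes it to a standalone
# HTML file. The output name "confusion_matrix_example.html" is an assumption.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from bokeh.io import output_file, save

    example_figure = _bokeh_confusion_matrix(
        y_true=np.array([0, 0, 1, 1, 2, 2]),
        y_pred=np.array([0, 1, 1, 1, 2, 0]),
        class_names=["cat", "dog", "lion"],
        title_rows=["ToyModel", "Confusion Matrix"],
    )
    output_file("confusion_matrix_example.html")
    save(example_figure)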
# metriculous/evaluators/_classification_utils.py
from typing import Union, Sequence
import numpy as np
NORMALIZATION_ABS_TOLERANCE = 1e-5
NORMALIZATION_REL_TOLERANCE = 1e-5
def check_normalization(probabilities: Union[np.ndarray, Sequence[float]], axis: int):
np.testing.assert_allclose(
np.sum(probabilities, axis=axis),
desired=1.0,
rtol=NORMALIZATION_REL_TOLERANCE,
atol=NORMALIZATION_ABS_TOLERANCE,
)
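# ---------------------------------------------------------------------------
# Small illustrative check, not part of the library: each row of a prediction
# array is expected to sum to 1.0 within the tolerances defined above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    check_normalization(np.array([[0.2, 0.8], [0.5, 0.5]]), axis=1)  # passes silently
    try:
        check_normalization(np.array([[0.2, 0.7]]), axis=1)
    except AssertionError:
        print("Rows that do not sum to 1.0 are rejected.")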
# metriculous/evaluators/_segmentation_evaluator.py
from typing import Callable
from typing import Optional
from typing import Sequence
from typing import Iterable
import numpy as np
from sklearn import metrics as sklmetrics
from .._evaluation import Evaluation
from .._evaluation import Evaluator
from .._evaluation import Quantity
from ..evaluators._classification_figures_bokeh import _bokeh_output_histogram
from ..evaluators._segmentation_figures_bokeh import _bokeh_heatmap
class SegmentationEvaluator(Evaluator):
"""
Implementation of the Segmentation Evaluator which should work well for most
image segmentation problems.
"""
def __init__(
self,
num_classes: int,
class_names: Optional[Sequence[str]] = None,
class_weights: Optional[Sequence[float]] = None,
filter_quantities: Optional[Callable[[str], bool]] = None,
filter_figures: Optional[Callable[[str], bool]] = None,
primary_metric: Optional[str] = None,
):
"""
Initializes the segmentation evaluator
Args:
num_classes:
The number of classes
class_names:
Optional, names of classes
class_weights:
                Optional, weights of the classes in the same order as class_names.
                The weights do not need to add up to 1.0, as they are normalized
                internally; their ratios should reflect the desired relative
                importance of the classes.
filter_quantities:
Callable that receives a quantity name and returns `False` if the
quantity should be excluded.
Examples:
`filter_quantities=lambda name: "vs Rest" not in name`
`filter_quantities=lambda name: "ROC" in name`
filter_figures:
Callable that receives a figure title and returns `False` if the figure
should be excluded.
Examples:
`filter_figures=lambda name: "vs Rest" not in name`
`filter_figures=lambda name: "ROC" in name`
primary_metric:
Optional string to specify the most important metric that should be used
for model selection.
"""
self.num_classes = num_classes
if class_names is None:
self.class_names = ["class_{}".format(i) for i in range(num_classes)]
else:
self.class_names = class_names
if class_weights is None:
self.class_weights = [1.0 / num_classes] * num_classes
else:
total = sum(class_weights)
self.class_weights = [weight / total for weight in class_weights]
self.filter_quantities = (
(lambda name: True) if filter_quantities is None else filter_quantities
)
self.filter_figures = (
(lambda name: True) if filter_figures is None else filter_figures
)
self.primary_metric = primary_metric
# Check for shape consistency
if len(self.class_names) != self.num_classes:
raise ValueError(
"The number of classes don't match the length of the class_names"
)
if len(self.class_weights) != self.num_classes:
raise ValueError(
"The number of classes don't match the length of the class_weights"
)
def evaluate(
self,
ground_truth: np.ndarray,
model_prediction: np.ndarray,
model_name: str,
sample_weights: Optional[Iterable[float]] = None,
) -> Evaluation:
"""
Args:
ground_truth:
                A 3D integer array of class labels with shape
                (num_samples, height, width).
            model_prediction:
                A 3D integer array with the same shape as ground_truth, containing
                the predicted class label for each pixel.
model_name:
Name of the model to be evaluated
sample_weights:
Sequence of floats to modify the influence of individual samples on the
statistics that will be measured.
Returns:
An Evaluation object containing Quantities and Figures that are useful for
most segmentation problems.
"""
if sample_weights is not None:
raise NotImplementedError(
"SegmentationEvaluator currently doesn't support sample weights"
)
if ground_truth.shape != model_prediction.shape:
raise ValueError(
(
f"The shape of the ground truth and the model predictions should be"
f"the same. Got ground_truth_shape: {ground_truth.shape}, "
f"model_predictions.shape: {model_prediction.shape}"
)
)
        if ground_truth.ndim != 3:
            raise ValueError(
                f"Ground Truth must be a 3D array. Got a {ground_truth.ndim}-d array"
            )
        if model_prediction.ndim != 3:
            raise ValueError(
                (
                    f"Model prediction must be a 3D array. "
                    f"Got a {model_prediction.ndim}-d array"
                )
            )
quantities = [
q
for q in self._quantities(model_prediction, ground_truth)
if self.filter_quantities(q.name)
]
figures = self._figures(model_name, model_prediction, ground_truth)
return Evaluation(
quantities=quantities,
figures=figures,
model_name=model_name,
primary_metric=self.primary_metric,
)
def _figures(self, model_name: str, y_pred: np.ndarray, y_true: np.ndarray):
figures = []
figure_name = "Class Distribution"
if self.filter_figures(figure_name):
figure = _bokeh_output_histogram(
y_true=y_true,
y_pred=y_pred,
class_names=self.class_names,
title_rows=[model_name, figure_name],
sample_weights=None,
)
figure.yaxis.axis_label = "Number of Pixels"
figures.append(figure)
for class_label, class_name in enumerate(self.class_names):
figure_name = f"Heatmap for {class_name}"
if self.filter_figures(figure_name):
figure = _bokeh_heatmap(
y_true=y_true,
y_pred=y_pred,
class_label=class_label,
class_name=class_name,
)
figures.append(figure)
return figures
def _quantities(self, y_pred: np.ndarray, y_true: np.ndarray):
        # Flatten the arrays, because jaccard_score expects 1D inputs
y_true_flattened, y_pred_flattened = y_true.flatten(), y_pred.flatten()
quantities = list()
weighted_miou = 0.0
class_specific_miou = sklmetrics.jaccard_score(
y_true_flattened, y_pred_flattened, average=None
)
if len(class_specific_miou) != self.num_classes:
raise ValueError(
(
f"The number of classes specified ({self.num_classes}) doesn't "
f"match with the number of classes actually present in the "
f"ground_truth/predictions ({len(class_specific_miou)}). Update the"
f" num_classes, class_names & class_weights parameters accordingly"
)
)
for class_name, class_weight, value in zip(
self.class_names, self.class_weights, class_specific_miou
):
quantities.append(
Quantity(f"{class_name} mIoU", value, higher_is_better=True)
)
weighted_miou += class_weight * value
quantities.append(
Quantity("Class weighted Mean mIoU", weighted_miou, higher_is_better=True)
)
return quantities
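

# Illustrative usage sketch (not part of the original module): evaluates random
# integer masks of shape (num_samples, height, width). The class names and the
# model name below are hypothetical placeholders.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    demo_ground_truth = rng.randint(0, 2, size=(4, 64, 64))
    demo_prediction = rng.randint(0, 2, size=(4, 64, 64))
    demo_evaluator = SegmentationEvaluator(
        num_classes=2, class_names=["background", "foreground"]
    )
    demo_evaluation = demo_evaluator.evaluate(
        ground_truth=demo_ground_truth,
        model_prediction=demo_prediction,
        model_name="DemoModel",
    )
    for quantity in demo_evaluation.quantities:
        print(quantity.name, quantity.value)
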
metriculous/evaluators/_segmentation_evaluator_test.py
from typing import List
import numpy as np
import pytest
import copy
from dataclasses import replace
from .._evaluation import Evaluation
from .._evaluation import Quantity
from ..evaluators import SegmentationEvaluator
def get_random_prediction_and_mask(image_size, num_classes):
return (
np.random.randint(0, num_classes, image_size),
np.random.randint(0, num_classes, image_size),
)
@pytest.mark.parametrize("classes", (["dog", "cat", "snake"], ["dog", "cat"]))
def test_SegmentationEvaluator(classes: List[str]):
np.random.seed(42)
num_classes = len(classes)
prediction, mask = get_random_prediction_and_mask((2, 256, 256), num_classes)
se = SegmentationEvaluator(num_classes, class_names=classes)
evaluation = se.evaluate(
ground_truth=mask, model_prediction=prediction, model_name="MockModel"
)
assert isinstance(evaluation, Evaluation)
assert evaluation.model_name == "MockModel"
@pytest.mark.parametrize("classes", (["dog", "cat", "snake"], ["dog", "cat"]))
def test_SegmentationEvaluator_perfect_prediction(classes: List[str]):
np.random.seed(42)
num_classes = len(classes)
predictions, _ = get_random_prediction_and_mask((2, 256, 256), num_classes)
mask = copy.deepcopy(predictions)
se = SegmentationEvaluator(num_classes, class_names=classes)
evaluation = se.evaluate(
ground_truth=mask, model_prediction=predictions, model_name="MockModel"
)
expected_quantities = []
for class_name in classes:
expected_quantities.append(
Quantity(name=f"{class_name} mIoU", value=1.0, higher_is_better=True)
)
expected_quantities.append(
Quantity(name="Class weighted Mean mIoU", value=1.0, higher_is_better=True)
)
assert len(evaluation.quantities) == len(expected_quantities)
for actual, expected in zip(evaluation.quantities, expected_quantities):
# check that everything except value is equal
assert replace(actual, value=42) == replace(expected, value=42)
# check that values are approximately equal
if isinstance(expected.value, str):
            assert isinstance(actual.value, str)
assert actual.value == expected.value
else:
assert isinstance(expected.value, float)
assert isinstance(actual.value, float)
np.testing.assert_allclose(actual.value, expected.value)
@pytest.mark.parametrize(
"num_classes, class_names", [(1, ["dog", "cat"]), (2, ["dog"])]
)
def test_SegmentationEvaluator_inconsistent_class_names(
num_classes: int, class_names: List[str]
):
"""
Tests if the __init__ method of SegmentationEvaluator raises an error if the
length of the class_names list is not equal to num_classes
"""
with pytest.raises(ValueError):
_ = SegmentationEvaluator(num_classes, class_names=class_names)
@pytest.mark.parametrize("num_classes, class_weights", [(1, [0.2, 0.3]), (2, [0.2])])
def test_SegmentationEvaluator_inconsistent_class_weights(
num_classes: int, class_weights: List[float]
):
"""
Tests if the __init__ method of SegmentationEvaluator raises an error if the
length of the class_weights list is not equal to num_classes
"""
with pytest.raises(ValueError):
_ = SegmentationEvaluator(num_classes, class_weights=class_weights)
@pytest.mark.parametrize(
"num_classes, ground_truth, model_prediction",
[
(3, *get_random_prediction_and_mask((2, 256, 256), 2)),
(2, *get_random_prediction_and_mask((2, 256, 256), 3)),
],
)
def test_SegmentationEvaluator_inconsistent_num_classes(
num_classes, ground_truth, model_prediction
):
"""
Tests if the evaluate method of SegmentationEvaluator raises an error if the
actual number of classes present in the ground_truth/prediction is not equal to
num_classes.
"""
se = SegmentationEvaluator(num_classes)
with pytest.raises(ValueError):
se.evaluate(ground_truth, model_prediction, model_name="MockModel")
@pytest.mark.parametrize(
"num_classes, ground_truth, model_prediction",
[
(
3,
np.random.randint(0, 3, (1, 256, 256)),
np.random.randint(0, 3, (2, 256, 256)),
)
],
)
def test_SegmentationEvaluator_inconsistent_shapes(
num_classes, ground_truth, model_prediction
):
"""
Tests if the evaluate method of SegmentationEvaluator raises an error if the
shapes of the ground_truth and model_prediction aren't the same
"""
se = SegmentationEvaluator(num_classes)
with pytest.raises(ValueError):
se.evaluate(ground_truth, model_prediction, model_name="MockModel")
@pytest.mark.parametrize(
"num_classes, ground_truth, model_prediction",
[
(
3,
np.random.randint(0, 3, (256, 256)),
np.random.randint(0, 3, (2, 256, 256)),
),
(
3,
np.random.randint(0, 3, (2, 256, 256)),
np.random.randint(0, 3, (256, 256)),
),
],
)
def test_SegmentationEvaluator_not_a_3D_array(
num_classes, ground_truth, model_prediction
):
"""
Tests if the evaluate method of SegmentationEvaluator raises an error if the
ground_truth or model_prediction isn't a 3D array
"""
se = SegmentationEvaluator(num_classes)
with pytest.raises(ValueError):
se.evaluate(ground_truth, model_prediction, model_name="MockModel")
@pytest.mark.parametrize("num_classes", [2, 3])
@pytest.mark.parametrize(
"quantity_filter",
[
lambda name: False,
lambda name: True,
lambda name: "Weighted" not in name,
lambda name: "mIoU" not in name,
],
)
def test_SegmentationEvaluator_filter_quantities(
num_classes: int, quantity_filter: callable
):
np.random.seed(42)
predictions, mask = get_random_prediction_and_mask((2, 256, 256), num_classes)
se_all = SegmentationEvaluator(num_classes)
se_filtering = SegmentationEvaluator(num_classes, filter_quantities=quantity_filter)
evaluation_all = se_all.evaluate(
ground_truth=mask, model_prediction=predictions, model_name="MockModel"
)
evaluation_filtered = se_filtering.evaluate(
ground_truth=mask, model_prediction=predictions, model_name="MockModel"
)
assert replace(evaluation_all, quantities=[], figures=[]) == replace(
evaluation_filtered, quantities=[], figures=[]
)
for quantity in evaluation_all.quantities:
if quantity_filter(quantity.name):
same_quantity = evaluation_filtered.get_by_name(quantity.name)
assert same_quantity == quantity
else:
with pytest.raises(ValueError):
evaluation_filtered.get_by_name(quantity.name)
for filtered_quantity in evaluation_filtered.quantities:
same_quantity = evaluation_all.get_by_name(filtered_quantity.name)
assert same_quantity == filtered_quantity
@pytest.mark.parametrize(
"num_classes, desired_number_of_figures, figure_filter",
[
(3, 0, lambda name: False),
(3, 4, lambda name: True),
(3, 1, lambda name: "Heatmap" not in name),
(3, 3, lambda name: "Class" not in name),
(2, 2, lambda name: "Class" not in name),
(2, 3, lambda name: True),
],
)
def test_SegmentationEvaluator_filter_figures(
num_classes: int, desired_number_of_figures: int, figure_filter: callable
):
np.random.seed(42)
predictions, mask = get_random_prediction_and_mask((2, 256, 256), num_classes)
se_all = SegmentationEvaluator(num_classes)
se_filtering = SegmentationEvaluator(num_classes, filter_figures=figure_filter)
evaluation_all = se_all.evaluate(
ground_truth=mask, model_prediction=predictions, model_name="MockModel"
)
evaluation_filtered = se_filtering.evaluate(
ground_truth=mask, model_prediction=predictions, model_name="MockModel"
)
assert replace(evaluation_all, figures=[]) == replace(
evaluation_filtered, figures=[]
)
assert len(evaluation_filtered.figures) == desired_number_of_figures
metriculous/evaluators/_segmentation_figures_bokeh.py
from typing import Optional
import numpy as np
from bokeh import plotting
from bokeh.models import Column
from bokeh.models import Title
from bokeh.layouts import column
TOOLS = "pan,box_zoom,reset"
TOOLBAR_LOCATION = "right"
def _bokeh_heatmap(
y_true: np.ndarray,
y_pred: np.ndarray,
class_label: int,
class_name: Optional[str] = None,
) -> Column:
"""
Creates heatmaps of the predictions and ground_truth
corresponding to the class_label
Args:
y_true:
3d integer array indicating the ground_truth masks.
Shape: (Num_Samples, Height, Width)
y_pred:
            3d integer array with the predictions of the model, having the same
            shape as y_true
class_label:
An integer corresponding to the class for which the heatmap is desired
class_name:
Class Name corresponding to the class_label
    Returns:
        A bokeh Column layout with the heatmap of the ground truth stacked above
        the heatmap of the predictions
"""
if y_pred.shape != y_true.shape:
raise ValueError(
(
"The shapes of y_pred and y_true must be the same. "
f"Got y_pred shape: {y_pred.shape}, y_true shape: {y_true.shape}"
)
)
if class_label not in np.unique(y_true):
        raise ValueError(f"class_label {class_label} does not occur in y_true")
if class_name is None:
class_name = f"Class {class_label}"
padding = 5
mean_activation_predictions = np.average(
(y_pred == class_label).astype(np.uint8), axis=0
)
mean_activation_ground_truth = np.average(
(y_true == class_label).astype(np.uint8), axis=0
)
p1 = plotting.figure(
tools=TOOLS,
toolbar_location=TOOLBAR_LOCATION,
width=y_true.shape[2],
height=y_true.shape[1],
)
p1.x_range.range_padding = p1.y_range.range_padding = 0
p1.toolbar.logo = None
p1.image(
        image=[mean_activation_ground_truth],
x=0,
y=0,
dw=y_true.shape[2],
dh=y_true.shape[1],
)
p1.add_layout(Title(text="Ground Truth", align="center"), "below")
p1.add_layout(
Title(text=f"Heatmap for {class_name}", align="center"), place="above"
)
p1.axis.visible = False
p2 = plotting.figure(
tools=TOOLS,
toolbar_location=TOOLBAR_LOCATION,
width=y_true.shape[2],
height=y_true.shape[1],
x_range=p1.x_range,
)
p2.x_range.range_padding = p2.y_range.range_padding = 0
p2.toolbar.logo = None
p2.image(
        image=[mean_activation_predictions],
x=0,
y=y_true.shape[1] + padding,
dw=y_true.shape[2],
dh=y_true.shape[1],
)
p2.add_layout(Title(text="Prediction", align="center"), "below")
p2.axis.visible = False
return column(p1, p2)
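

# Illustrative sketch (assumes only bokeh.io, which ships with bokeh): render the
# stacked heatmaps for one class to a standalone HTML file. The demo arrays, the
# class name and the output file name are hypothetical.
if __name__ == "__main__":
    from bokeh.io import output_file, save

    rng = np.random.RandomState(0)
    demo_y_true = rng.randint(0, 2, size=(4, 64, 64))
    demo_y_pred = rng.randint(0, 2, size=(4, 64, 64))
    demo_layout = _bokeh_heatmap(
        y_true=demo_y_true, y_pred=demo_y_pred, class_label=1, class_name="foreground"
    )
    output_file("heatmap_demo.html")
    save(demo_layout)
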
metriculous/metrics.py
"""Module defining generic metric functions."""
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np
from assertpy import assert_that
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
def normalized(matrix: np.ndarray) -> np.ndarray:
"""Returns normalized array where each row sums up to 1.0."""
assert np.ndim(matrix) == 2
sums = np.sum(matrix, axis=1, keepdims=True)
# avoid crash on zeros
matrix = matrix + (sums == 0.0) * 1e-15
return matrix / np.sum(matrix, axis=1, keepdims=True)
def cross_entropy(
target_probas: np.ndarray, pred_probas: np.ndarray, epsilon=1e-15
) -> float:
"""Returns the cross-entropy for probabilistic ground truth labels.
Args:
target_probas: 2D array with rows being target probability distributions.
pred_probas: 2D array with rows being estimated probability distributions.
epsilon: Clipping offset to avoid numerical blowup (NaNs, inf, etc).
"""
# check normalization before clipping
assert np.allclose(
np.sum(target_probas, axis=1), 1.0, atol=1e-3
), "Target probability distributions not normalized!"
assert np.allclose(
np.sum(pred_probas, axis=1), 1.0, atol=1e-3
), "Predicted probability distributions not normalized!"
# clip predicted probabilities
pred_probas = np.clip(pred_probas, a_min=epsilon, a_max=1.0 - epsilon)
# normalize
pred_probas = normalized(pred_probas)
# compute cross entropy
values = -np.sum(target_probas * np.log(pred_probas), axis=1)
# noinspection PyTypeChecker
ce: float = np.mean(values)
return ce
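

# Worked example for cross_entropy (illustrative): with a one-hot target row
# [1.0, 0.0] and a predicted distribution [0.8, 0.2], the cross-entropy is
# -log(0.8), roughly 0.223:
#
#     cross_entropy(np.array([[1.0, 0.0]]), np.array([[0.8, 0.2]]))
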
def a_vs_b_auroc(
target_ints: np.ndarray, predicted_probas: np.ndarray, class_a: int, class_b: int
) -> Optional[float]:
"""
Keeps only targets of class A or B, then computes the ROC AUC for the
binary problem.
Args:
target_ints: 1d array of target class integers.
predicted_probas: 2d array of predicted probabilities, one row per data point.
class_a: Integer specifying the positive class.
class_b: Integer specifying the negative class.
Returns:
A float or None if the result could not be computed.
"""
# only consider instances with targets of class A or B
filter_mask = np.logical_or(target_ints == class_a, target_ints == class_b)
target_ints = target_ints[filter_mask]
predicted_probas = predicted_probas[filter_mask]
# return None if not both classes represented
if len(np.unique(target_ints)) != 2:
return None
# consider only probability columns for class A and B and renormalize
binary_probas = normalized(predicted_probas[:, (class_a, class_b)])
# use class A as the positive class
scores = binary_probas[:, 0]
return roc_auc_score(y_true=target_ints == class_a, y_score=scores)
def one_vs_all_auroc_values(
target_ints: np.ndarray, predicted_probas: np.ndarray
) -> List[Optional[float]]:
"""Returns one AUROC (area under ROC curve, aka ROC AUC) score per class.
Args:
target_ints: 1d array of target class integers.
predicted_probas: 2d array of predicted probabilities, one row per data point.
Returns:
A list with one AUROC value per class.
"""
assert len(predicted_probas) == len(target_ints)
n_classes = predicted_probas.shape[1]
auroc_values = []
for positive_class in range(n_classes):
scores = predicted_probas[:, positive_class]
is_positive_class = target_ints == positive_class
if any(is_positive_class) and not all(is_positive_class):
auroc_values.append(roc_auc_score(y_true=is_positive_class, y_score=scores))
else:
auroc_values.append(None)
return auroc_values
def sensitivity_at_x_specificity(
target_ints: np.ndarray, positive_probas: np.ndarray, at_specificity: float
) -> Tuple[Optional[float], Optional[float]]:
"""Compute sensitivity (recall) at a given specificity.
Sensitivity = true positive rate
= true positives / positives
= recall
= P(prediction positive | class positive)
Specificity = true negative rate
= true negatives / negatives
= 1 - false positive rate
= P(prediction negative | class negative)
Args:
target_ints: 1d array of binary class labels, zeros and ones
positive_probas: 1d array of probabilities of class 1
at_specificity: specificity at which to compute sensitivity
Returns:
(float): sensitivity at returned specificity
(float): specificity closest to input specificity
"""
assert 0 < at_specificity < 1
if len(set(target_ints)) < 2:
return None, None
fprs, sensitivities, _ = roc_curve(target_ints, positive_probas)
specificities = 1.0 - fprs
# last and first entries are not interesting (0 or 1)
if len(specificities) > 2:
specificities = specificities[1:-1]
sensitivities = sensitivities[1:-1]
# find point on curve that is closest to desired at_specificity
index = np.argmin(np.abs(specificities - at_specificity))
return sensitivities[index], specificities[index]
def specificity_at_x_sensitivity(
target_ints: np.ndarray, positive_probas: np.ndarray, at_sensitivity: float
) -> Tuple[Optional[float], Optional[float]]:
"""Compute specificity at a given sensitivity (recall).
Sensitivity = true positive rate
= true positives / positives
= recall
= P(prediction positive | class positive)
Specificity = true negative rate
= true negatives / negatives
= 1 - false positive rate
= P(prediction negative | class negative)
Args:
target_ints: 1d array of binary class labels, zeros and ones
positive_probas: 1d array of probabilities of class 1
at_sensitivity: sensitivity at which to compute specificity
Returns:
(float): specificity at returned sensitivity
(float): sensitivity closest to input sensitivity
"""
assert 0 < at_sensitivity < 1
if len(set(target_ints)) < 2:
return None, None
fprs, sensitivities, _ = roc_curve(target_ints, positive_probas)
specificities = 1.0 - fprs
# last and first entries are not interesting
if len(specificities) > 2:
specificities = specificities[1:-1]
sensitivities = sensitivities[1:-1]
# find point on curve that is closest to desired sensitivity
index = np.argmin(np.abs(sensitivities - at_sensitivity))
return specificities[index], sensitivities[index]
def top_n_accuracy(
target_ints: np.ndarray,
predicted_probas: np.ndarray,
n: int,
sample_weights: Optional[np.ndarray] = None,
) -> float:
"""Fraction of test cases where the true target is among the top n predictions."""
assert len(target_ints) == len(predicted_probas)
assert np.ndim(target_ints) == 1
assert np.ndim(predicted_probas) == 2
if sample_weights is None:
sample_weights = np.ones_like(target_ints)
assert_that(sample_weights.shape).is_equal_to(target_ints.shape)
np.testing.assert_array_equal(sample_weights >= 0.0, True)
# sort predicted class indices by probability (ascending)
classes_by_probability = predicted_probas.argsort(axis=1)
# take last n columns, because we sorted ascending
top_n_predictions = classes_by_probability[:, -n:]
# check if target is included
is_target_in_top_n_predictions = [
target in top_n for target, top_n in zip(target_ints, top_n_predictions)
]
top_n_acc = np.average(is_target_in_top_n_predictions, weights=sample_weights)
return top_n_acc
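

# Minimal usage sketch (illustrative values, not part of the original module):
# three samples and three classes, showing one_vs_all_auroc_values and
# top_n_accuracy side by side.
if __name__ == "__main__":
    demo_targets = np.array([0, 1, 2])
    demo_probas = np.array(
        [
            [0.7, 0.2, 0.1],  # target 0 is the top-1 prediction
            [0.1, 0.3, 0.6],  # target 1 is only within the top 2
            [0.5, 0.4, 0.1],  # target 2 is only within the top 3
        ]
    )
    print(one_vs_all_auroc_values(demo_targets, demo_probas))
    print(top_n_accuracy(demo_targets, demo_probas, n=1))  # 1/3
    print(top_n_accuracy(demo_targets, demo_probas, n=2))  # 2/3
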
metriculous/metrics_test.py
import numpy as np
import pytest
import sklearn.metrics as sklmetrics
from scipy.stats import entropy
import metriculous.metrics as metrics
from metriculous.metrics import normalized
from metriculous.metrics import sensitivity_at_x_specificity
from metriculous.metrics import specificity_at_x_sensitivity
from metriculous.metrics import top_n_accuracy
# --- normalized -----------------------------------------------------------------------
def test_normalized():
# fmt: off
result = metrics.normalized(np.array([
[.0, .0],
[.1, .1],
[.2, .3],
[.5, .5],
[.6, .4],
[1., 4.],
[0., 1.],
[0., 1e-20],
]))
expected = np.array([
[.5, .5],
[.5, .5],
[.4, .6],
[.5, .5],
[.6, .4],
[.2, .8],
[0., 1.],
[0., 1.],
])
assert np.allclose(result, expected, atol=0.0)
# fmt: on
# --- cross-entropy --------------------------------------------------------------------
def test_cross_entropy_zero():
ce = metrics.cross_entropy(
target_probas=np.array([[1.0, 0.0], [1.0, 0.0]]),
pred_probas=np.array([[1.0, 0.0], [1.0, 0.0]]),
epsilon=1e-15,
)
np.testing.assert_allclose(ce, 0.0, atol=1e-15)
def test_cross_entropy_certainty_in_targets():
target_probas = np.array([[1.0, 0.0], [1.0, 0.0]])
pred_probas = np.array([[0.6, 0.4], [0.1, 0.9]])
eps = 1e-15
ce = metrics.cross_entropy(target_probas, pred_probas, eps)
ll = sklmetrics.log_loss(target_probas, pred_probas, eps)
np.testing.assert_allclose(ce, ll)
def test_cross_entropy_general_fuzz_test():
rng = np.random.RandomState(42)
for _ in range(10):
probas = normalized(rng.rand(100, 2))
ce = metrics.cross_entropy(probas, probas)
scipy_entropy = np.sum(entropy(probas.T)) / len(probas)
np.testing.assert_allclose(ce, scipy_entropy)
# --- A vs B AUROC ---------------------------------------------------------------------
def test_a_vs_b_auroc():
value = metrics.a_vs_b_auroc(
target_ints=np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]),
predicted_probas=np.array(
[
[0.8, 0.0, 0.2], # correct
[0.9, 0.1, 0.0], # correct
[0.7, 0.1, 0.2], # correct
[0.1, 0.8, 0.1], # correct
[0.1, 0.8, 0.1], # correct
[0.1, 0.6, 0.3], # correct
[0.9, 0.0, 0.1], # wrong
[0.1, 0.9, 0.0], # wrong
[0.1, 0.0, 0.9], # wrong
]
),
class_a=0,
class_b=1,
)
assert value == 1.0
def test_a_vs_b_auroc_symmetry():
"""Check that result is the same when classes are swapped."""
rng = np.random.RandomState(42)
for _ in range(50):
probas = normalized(rng.rand(100, 4))
target_ints = rng.randint(0, 4, size=len(probas))
a1b2 = metrics.a_vs_b_auroc(
target_ints=target_ints, predicted_probas=probas, class_a=1, class_b=2
)
a2b1 = metrics.a_vs_b_auroc(
target_ints=target_ints, predicted_probas=probas, class_a=2, class_b=1
)
np.testing.assert_allclose(a1b2, a2b1, atol=1e-15)
def test_a_vs_b_auroc_zeros():
"""Check case with zeros in all interesting columns."""
value = metrics.a_vs_b_auroc(
target_ints=np.array([0, 1]),
predicted_probas=np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]]),
class_a=0,
class_b=1,
)
# we just want to make sure this did not crash
assert 0.0 <= value <= 1.0
def test_a_vs_b_auroc_none():
"""Check case where it should return None."""
rng = np.random.RandomState(42)
for _ in range(50):
probas = normalized(rng.rand(100, 4))
target_ints = rng.randint(0, 1, size=len(probas))
value = metrics.a_vs_b_auroc(
target_ints=target_ints, predicted_probas=probas, class_a=1, class_b=2
)
assert value is None
# --- sensitivity at specificity -------------------------------------------------------
def test_sensitivity_at_x_specificity():
"""Test AUC 0.5 prediction."""
n = 500
labels = np.concatenate((np.zeros(n), np.ones(n)))
randoms = np.random.random(n)
positive_probas = np.concatenate((randoms, randoms + 1e-9))
for at in np.linspace(0.1, 0.9, num=9):
sens, spec = sensitivity_at_x_specificity(
target_ints=labels, positive_probas=positive_probas, at_specificity=at
)
np.testing.assert_allclose(spec, at, atol=0.003)
np.testing.assert_allclose(sens, 1.0 - spec, atol=0.003)
# --- specificity at sensitivity -------------------------------------------------------
def test_specificity_at_x_sensitivity():
"""Test AUC 0.5 prediction."""
n = 500
labels = np.concatenate((np.zeros(n), np.ones(n)))
randoms = np.random.random(n)
positive_probas = np.concatenate((randoms, randoms + 1e-9))
for at in np.linspace(0.1, 0.9, num=9):
spec, sens = specificity_at_x_sensitivity(
target_ints=labels, positive_probas=positive_probas, at_sensitivity=at
)
np.testing.assert_allclose(sens, at, atol=0.003)
np.testing.assert_allclose(spec, 1.0 - sens, atol=0.003)
# --- top N accuracy -------------------------------------------------------------------
def test_top_n_accuracy_all_correct():
np.random.seed(42)
n_classes = 30
for i in range(5):
target_ints = np.random.randint(0, n_classes, size=100)
pred_probas = np.eye(n_classes)[target_ints] + np.random.rand(
len(target_ints), n_classes
)
for n in [1, 2, 3, 40, 100]:
assert top_n_accuracy(target_ints, pred_probas, n) == 1.0
def test_top_n_accuracy():
target_ints = np.array([3, 1, 4])
# fmt:off
pred_probas = np.array([
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
])
# fmt:on
assert 1 / 3 == top_n_accuracy(target_ints, pred_probas, n=1)
assert 2 / 3 == top_n_accuracy(target_ints, pred_probas, n=2)
assert 2 / 3 == top_n_accuracy(target_ints, pred_probas, n=3)
assert 3 / 3 == top_n_accuracy(target_ints, pred_probas, n=4)
assert 3 / 3 == top_n_accuracy(target_ints, pred_probas, n=5)
assert 3 / 3 == top_n_accuracy(target_ints, pred_probas, n=999)
def test_top_n_accuracy__sample_weights_default():
"""
Checks that passing in a uniform sample_weights vector does the same as passing
`None` or using the default.
"""
target_ints = np.array([3, 1, 4])
# fmt:off
pred_probas = np.array([
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
])
# fmt:on
assert top_n_accuracy(target_ints, pred_probas, n=1) == top_n_accuracy(
target_ints, pred_probas, n=1, sample_weights=np.ones_like(target_ints)
)
assert top_n_accuracy(
target_ints, pred_probas, n=1, sample_weights=None
) == top_n_accuracy(
target_ints, pred_probas, n=1, sample_weights=np.ones_like(target_ints)
)
def test_top_n_accuracy__sample_weights():
"""
Same test as above, with additional zero-weighted samples, should get same output.
"""
target_ints = np.array([3, 1, 4, 1, 1, 1])
sample_weights = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
# fmt:off
pred_probas = np.array([
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
])
# fmt:on
assert 1 / 3 == top_n_accuracy(
target_ints, pred_probas, n=1, sample_weights=sample_weights
)
assert 2 / 3 == top_n_accuracy(
target_ints, pred_probas, n=2, sample_weights=sample_weights
)
assert 2 / 3 == top_n_accuracy(
target_ints, pred_probas, n=3, sample_weights=sample_weights
)
assert 3 / 3 == top_n_accuracy(
target_ints, pred_probas, n=4, sample_weights=sample_weights
)
assert 3 / 3 == top_n_accuracy(
target_ints, pred_probas, n=5, sample_weights=sample_weights
)
assert 3 / 3 == top_n_accuracy(
target_ints, pred_probas, n=999, sample_weights=sample_weights
)
def test_top_n_accuracy__sample_weights_scaled():
"""
Checks that scaling the weight vector does not change the results.
"""
target_ints = np.array([3, 1, 4, 1, 1, 1])
sample_weights = np.array([2.4, 0.5, 2.1, 0.01, 0.9, 35.7])
# fmt:off
pred_probas = np.array([
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
])
# fmt:on
assert top_n_accuracy(
target_ints, pred_probas, n=1, sample_weights=sample_weights
) == top_n_accuracy(
target_ints, pred_probas, n=1, sample_weights=42.0 * sample_weights
)
def test_top_n_accuracy__sample_weights_all_zeros():
"""
Checks that passing in zero vector `sample_weights` raises `ZeroDivisionError`.
"""
target_ints = np.array([3, 1, 4, 1, 1, 1])
sample_weights = np.zeros_like(target_ints)
# fmt:off
pred_probas = np.array([
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
])
# fmt:on
with pytest.raises(ZeroDivisionError):
_ = top_n_accuracy(target_ints, pred_probas, n=1, sample_weights=sample_weights)
def test_top_n_accuracy__sample_weights_negative():
"""
Checks that an exception is raised if at least one of the sample weights is
negative.
"""
target_ints = np.array([3, 1, 4, 1, 1, 1])
sample_weights = np.array([1.0, 1.0, -1.0, 1.0, 1.0, 1.0])
# fmt:off
pred_probas = np.array([
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
[.3, .4, .2, .1, .0], # target in top 4
[.4, .3, .2, .1, .0], # target in top 2
[.0, .1, .2, .3, .4], # target in top 1
])
# fmt:on
with pytest.raises(AssertionError):
_ = top_n_accuracy(target_ints, pred_probas, n=1, sample_weights=sample_weights)
metriculous/test_resources.py
from typing import List
from typing import Tuple
import numpy as np
def noisy_prediction(targets_one_hot: np.ndarray, noise_factor: float) -> np.ndarray:
"""Simulates a classifier prediction on the dataset."""
assert targets_one_hot.ndim == 2
# Add some noise to the predictions to simulate a classifier
noisy_target = targets_one_hot + noise_factor * np.random.random(
size=targets_one_hot.shape
)
# Normalize the rows, making sure they are valid probability distributions
probability_distributions = noisy_target / noisy_target.sum(axis=1, keepdims=True)
return probability_distributions
def generate_input(
num_classes: int, num_samples: int, num_models: int
) -> Tuple[np.ndarray, List[np.ndarray]]:
target_class_indices = np.random.randint(0, high=num_classes, size=num_samples)
targets_one_hot = np.eye(num_classes)[target_class_indices]
# For each model that goes into the comparison, let's generate a prediction.
# Note that we pick a random noise factor to make sure some models have more noise
# than others.
predicted_probabilities = [
noisy_prediction(targets_one_hot, noise_factor=3 * np.random.random())
for i_model in range(num_models)
]
return targets_one_hot, predicted_probabilities
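

# Example call (illustrative, with hypothetical sizes): targets and three
# simulated model predictions for a 4-class problem with 100 samples.
if __name__ == "__main__":
    np.random.seed(0)
    demo_targets_one_hot, demo_predictions = generate_input(
        num_classes=4, num_samples=100, num_models=3
    )
    print(demo_targets_one_hot.shape, [p.shape for p in demo_predictions])
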
metriculous/utilities.py
from typing import Sequence
import numpy as np
from assertpy import assert_that
def sample_weights_simulating_class_distribution(
y_true: Sequence[int], hypothetical_class_distribution: Sequence[float]
) -> np.ndarray:
"""
Computes a 1D array of sample weights that results in the requested
`hypothetical_class_distribution` if applied to the dataset. This is useful when you
know that the class distribution in your dataset deviates from the distribution you
expect to encounter in the environment where your machine learning model is going to
be deployed.
Example:
You have a data set with 40% spam 60% ham emails. However, you expect that
only 4% of the emails in the deployment environment will be spam, and you would
like to measure various performance characteristics on a dataset with 4% spam
and 96% ham. This function will return an array with
* sample weights 4% / 40% = 0.1 for all of the spam examples
* sample weights 96% / 60% = 1.6 for all of the ham examples
if called with:
>>> weights = sample_weights_simulating_class_distribution(
... y_true=[0, 1, 1, 0, 1, 0, 1, 1, 0, 1], # zeros for spam
... hypothetical_class_distribution=[0.04, 0.96]
... )
>>> print(weights)
        [0.1 1.6 1.6 0.1 1.6 0.1 1.6 1.6 0.1 1.6]
Args:
y_true:
1D array of integers with class indices of the dataset. There must be at
least one sample for each class.
hypothetical_class_distribution:
Sequence of floats describing the distribution you assume to encounter in
your deployment environment.
Returns:
1D numpy array with sample weights, same length as `y_true`.
"""
# --- check input ---
assert_that(set(y_true)).is_equal_to(
set(range(len(hypothetical_class_distribution)))
)
assert_that(len(set(y_true))).is_equal_to(len(hypothetical_class_distribution))
y_true = np.asarray(y_true)
hypothetical_class_distribution = np.asarray(hypothetical_class_distribution)
np.testing.assert_allclose(hypothetical_class_distribution.sum(), 1.0)
assert_that(y_true.ndim).is_equal_to(1)
assert_that(hypothetical_class_distribution.ndim).is_equal_to(1)
# --- compute output ---
class_distribution = np.bincount(y_true) / len(y_true)
np.testing.assert_equal(class_distribution > 0.0, True)
np.testing.assert_allclose(class_distribution.sum(), 1.0)
weights = [
hypothetical_class_distribution[y] / class_distribution[y] for y in y_true
]
return np.array(weights)
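

# Illustrative follow-up (assumes scikit-learn, which is already a dependency):
# the returned weights are meant to be passed as `sample_weight` to metric
# functions. The labels and predictions below are hypothetical.
if __name__ == "__main__":
    from sklearn.metrics import accuracy_score

    demo_y_true = [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
    demo_y_pred = [0, 1, 0, 0, 1, 1, 1, 1, 0, 1]
    demo_weights = sample_weights_simulating_class_distribution(
        y_true=demo_y_true, hypothetical_class_distribution=[0.04, 0.96]
    )
    print(accuracy_score(demo_y_true, demo_y_pred, sample_weight=demo_weights))
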
metriculous/utilities_test.py
import numpy as np
import pytest
from sklearn import metrics as sklmetrics
from . import utilities
def test_sample_weights():
y_true = np.array([0, 0, 0, 0, 1, 1, 2, 2, 2, 2])
weights = utilities.sample_weights_simulating_class_distribution(
y_true=y_true, # distribution: [0.4, 0.2, 0.4]
hypothetical_class_distribution=[0.90, 0.08, 0.02],
)
expected_weights = np.array(
[
0.90 / 0.4,
0.90 / 0.4,
0.90 / 0.4,
0.90 / 0.4,
0.08 / 0.2,
0.08 / 0.2,
0.02 / 0.4,
0.02 / 0.4,
0.02 / 0.4,
0.02 / 0.4,
]
)
assert np.shape(weights) == np.shape(y_true)
np.testing.assert_allclose(weights, expected_weights)
# Now use the sample weights and see if they have the desired effect:
# Use predictions where first four entries,
# which correspond to true class 0, are correct.
some_prediction = np.array([0, 0, 0, 0, 2, 2, 1, 1, 1, 1])
accuracy_with_weights = sklmetrics.accuracy_score(
y_true=y_true, y_pred=some_prediction, sample_weight=weights
)
accuracy_without_weights = 0.4
assert accuracy_with_weights == pytest.approx(
accuracy_without_weights * 0.90 / 0.4, abs=1e-9
)
def test_sample_weights__distribution_not_normalized():
"""
Checks that an exception is raised if the hypothetical class distribution is not
normalized.
"""
not_normalized = [0.4, 0.3, 0.1]
with pytest.raises(AssertionError):
_ = utilities.sample_weights_simulating_class_distribution(
y_true=[0, 1, 2, 0, 1, 2], hypothetical_class_distribution=not_normalized
)
@pytest.mark.parametrize(
"y_true, hypothetical_class_weights",
[
([0, 0, 1, 3], [0.5, 0.3, 0.1, 0.1]),
([0, 0, 1, 3], [0.5, 0.3, 0.2]),
([0, 0, 1, 3], [0.5, 0.1, 0.2, 0.1, 0.1]),
([0, 1, 2, 3], [0.5, 0.1, 0.2, 0.1, 0.1]),
([3], [0.5, 0.3, 0.2]),
([0], [0.5, 0.5]),
([1], [1.0]),
],
)
def test_sample_weights__class_not_represented(y_true, hypothetical_class_weights):
"""
Checks that an exception is raised if at least one class is not represented in the
input.
"""
np.testing.assert_allclose(sum(hypothetical_class_weights), 1.0)
with pytest.raises(AssertionError):
_ = utilities.sample_weights_simulating_class_distribution(
y_true=y_true, hypothetical_class_distribution=hypothetical_class_weights
)