PK e=O5) superintendent/__init__.py"""Interactive machine learning supervision."""
from .clustersupervisor import ClusterSupervisor # noqa
from .multioutput import MultiLabeller # noqa
from .semisupervisor import SemiSupervisor # noqa
__version__ = "0.4.3" # noqa
PK e=OJt6 6 superintendent/_compatibility.pyimport contextlib
import warnings
@contextlib.contextmanager
def ignore_widget_on_submit_warning():
    """Suppress the "on_submit is deprecated" DeprecationWarning in a block.

    The filter is installed inside ``warnings.catch_warnings()``, so the
    previous warning configuration is restored when the ``with`` block ends.
    """
    suppressed = {
        "message": r".*on_submit is deprecated.*",
        "category": DeprecationWarning,
    }
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", **suppressed)
        yield
PK e=O1 superintendent/base.py"""Base class to inherit from."""
import abc
from functools import partial
from typing import Any, Callable, Dict, Optional, Tuple
import ipyevents
import IPython.display
import ipywidgets as widgets
import numpy as np
import traitlets
from . import controls, display
# class AbstractTraitletMetaclass(traitlets.HasTraits, metaclass=abc.ABCMeta):
# pass
class Labeller(traitlets.HasTraits):
"""
Data point labelling.
This class allows you to label individual data points.
Parameters
----------
features : np.array | pd.DataFrame | list
The input array for your model
labels : np.array, pd.Series, pd.DataFrame, optional
The labels for your data.
options : Tuple[str]
The label options you'd like the user to be shown. These will be
presented as either buttons or in a dropdown.
other_option : bool
Whether or not a text field for supplying a different label should
be shown.
max_buttons : int
How many buttons should be displayed before it switches to a non-
button based interface.
display_func : str, func, optional
Either a function that accepts one row of features and returns
what should be displayed with IPython's `display`, or a string
that is any of 'img', 'image'.
keyboard_shortcuts : bool, optional
If you want to enable ipyevent-mediated keyboard capture to use the
keyboard rather than the mouse to submit data.
hint_function : func, optional
The function to display these hints. By default, the same function as
display_func is used.
hints : np.array | pd.DataFrame | list
The hints to start off with.
"""
options = traitlets.List(list(), allow_none=True)
def __init__(
    self,
    features: Optional[Any] = None,
    labels: Optional[Any] = None,
    options: Tuple[str, ...] = (),
    other_option: bool = True,
    max_buttons: int = 12,
    display_func: Optional[Callable] = None,
    keyboard_shortcuts: bool = False,
    hint_function: Optional[Callable] = None,
    hints: Optional[Dict[str, Any]] = None,
):
    """
    Build the widgets shared by every labelling interface.

    This creates the layout container, the feature display area, the input
    widget, the progress bar and (optionally) keyboard capture. It does not
    create ``self.queue`` or ``self._annotation_loop``, which other methods
    of this class read. See the class docstring for the parameters.
    """
    # the widget elements
    self.layout = widgets.VBox([])
    self.feature_output = widgets.Output()
    self.feature_display = widgets.Box(
        (self.feature_output,),
        layout=widgets.Layout(
            justify_content="center",
            padding="5% 0",
            display="flex",
            width="100%",
            min_height="150px",
        ),
    )

    self.input_widget = controls.Submitter(
        hint_function=hint_function,
        hints=hints,
        options=options,
        other_option=other_option,
        max_buttons=max_buttons,
    )
    # every submission is forwarded to the annotation generator
    self.input_widget.on_submission(self._apply_annotation)
    # copy the widget's options first, then keep both traits in sync
    self.options = self.input_widget.options
    traitlets.link((self, "options"), (self.input_widget, "options"))

    # self.features = validation.valid_data(features)
    self.features = features
    # if labels is not None:
    #     self.labels = validation.valid_data(labels)
    # elif self.features is not None:
    #     self.labels = np.full(
    #         self.features.shape[0], np.nan, dtype=float)
    self.labels = labels

    self.progressbar = widgets.FloatProgress(
        max=1, description="Progress:"
    )
    self.top_bar = widgets.HBox([])
    self.top_bar.children = [self.progressbar]

    # fall back to the "default" entry of display.functions
    if display_func is not None:
        self._display_func = display_func
    else:
        self._display_func = display.functions["default"]

    if keyboard_shortcuts:
        # watch key events on the whole layout and hand them to the input
        # widget's key handler
        self.event_manager = ipyevents.Event(
            source=self.layout, watched_events=["keydown", "keyup"]
        )
        self.event_manager.on_dom_event(self.input_widget._on_key_down)
    else:
        self.event_manager = None

    # compared against 0.5 and used as a context manager in _display
    self.timer = controls.Timer()
@abc.abstractmethod
def _annotation_iterator(self):
    """Return the generator that drives the annotation loop (subclass hook)."""
@classmethod
def from_images(cls, *args, image_size=None, **kwargs):
    """Generate a labelling widget from an image array.

    Parameters
    ----------
    features : np.ndarray
        A numpy array of shape (n_samples, n_pixels). May be passed by
        keyword or as the first positional argument.
    image_size : tuple, optional
        The shape each row of ``features`` is reshaped into for display. If
        omitted, images are assumed to be square.
    **kwargs
        Forwarded to the class constructor. A ``display_func`` given here is
        kept; otherwise the image display function is used.

    Returns
    -------
    An instance of ``cls`` whose display function renders images.

    Raises
    ------
    TypeError
        If ``image_size`` is None and ``features`` is given by keyword but
        is not a numpy array.
    ValueError
        If ``image_size`` is None and the number of pixels per row is not a
        perfect square.
    """
    if image_size is None:
        by_keyword = "features" in kwargs
        if by_keyword:
            features = kwargs["features"]
        else:
            # features may also arrive as the first positional argument
            features = args[0] if args else None

        if by_keyword and not isinstance(features, np.ndarray):
            raise TypeError(
                "When using from_images, input features "
                "needs to be a numpy array with shape "
                "(n_samples, n_pixels)."
            )

        # Positional non-array input was never validated; keep accepting it
        # and only check arrays whose pixel count can be read.
        checkable = by_keyword or (
            isinstance(features, np.ndarray) and features.ndim > 1
        )
        if checkable:
            n_pixels = features.shape[1]
            # check if image is square
            if int(np.sqrt(n_pixels)) ** 2 != n_pixels:
                raise ValueError(
                    "If image_size is None, the image needs to be square, but "
                    "yours has " + str(n_pixels) + " pixels."
                )
        image_size = "square"

    # an explicitly supplied display_func (even None) wins over the default
    kwargs.setdefault(
        "display_func",
        partial(display.functions["image"], imsize=image_size),
    )
    return cls(*args, **kwargs)
def _apply_annotation(self, sender):
    """Forward a submission event to the running annotation generator."""
    loop = self._annotation_loop
    loop.send(sender)
def add_features(self, features, labels=None):
    """
    Add data to the widget.

    This adds the data provided to the queue of data to be labelled. You
    can optionally provide labels for each data point.

    Parameters
    ----------
    features : Any
        The data you'd like to add to the labelling widget.
    labels : Any, optional
        The labels for the data you're adding; if you have labels.
    """
    # NOTE(review): ``labels=`` matches SimpleLabellingQueue.enqueue_many but
    # not ClusterLabellingQueue.enqueue_many(features, cluster_indices, ...)
    # - confirm this method is only used with the former.
    self.queue.enqueue_many(features, labels=labels)
    # reset the iterator
    self._annotation_loop = self._annotation_iterator()
    # un-pop the most recently popped item so the fresh iterator pops it again
    self.queue.undo()
    # advance the new generator to its first ``yield`` so it can accept send()
    next(self._annotation_loop)
    self._compose()
def _display(self, feature):
    """Render ``feature`` into the feature output area.

    Does nothing when ``feature`` is None. When ``self.timer`` compares
    greater than 0.5 the processing placeholder is rendered first
    (presumably because the previous render was slow - confirm against
    ``controls.Timer``). The render itself runs inside the timer context.
    """
    if feature is None:
        return
    if self.timer > 0.5:
        self._render_processing()
    with self.timer, self.feature_output:
        IPython.display.clear_output(wait=True)
        self._display_func(feature)
def _compose(self):
    """Stack top bar, feature display and input widget; return ``self``."""
    rows = [self.top_bar, self.feature_display, self.input_widget]
    self.layout.children = rows
    return self
def _render_processing(self, message="Rendering..."):
with self.feature_output:
IPython.display.clear_output(wait=True)
IPython.display.display(
widgets.HTML(
value=(
"
{}".format(message)
+ ''
)
)
)
def _render_finished(self):
    """Show the completion message and reduce the layout; return ``self``.

    Marks the progress bar as "success", replaces the feature output with
    the finished message, and drops the input widget from the layout.
    """
    self.progressbar.bar_style = "success"
    with self.feature_output:
        IPython.display.clear_output(wait=True)
        IPython.display.display(
            widgets.HTML(value=u"Finished labelling 🎉!")
        )
    # only the progress bar and the message remain visible
    self.layout.children = [self.progressbar, self.feature_display]
    return self
@property
def new_labels(self):
    """Labels for every item, i.e. the third value of ``queue.list_all()``."""
    _ids, _features, label_values = self.queue.list_all()
    return label_values
def _ipython_display_(self):
    """Display the widget layout via ``IPython.display.display``."""
    IPython.display.display(self.layout)
PK e=O-
# superintendent/clustersupervisor.py# -*- coding: utf-8 -*-
"""Tools to supervise your clustering."""
from . import base
from .queueing import ClusterLabellingQueue
class ClusterSupervisor(base.Labeller):
"""
A widget for labelling clusters.
Parameters
----------
features : np.ndarray, pd.Series. pd.DataFrame
Your features.
cluster_indices : np.ndarray, pd.Series
The cluster label for each data point.
representativeness : np.ndarray, pd.Series
How representative of a cluster your data points are. This can be the
probability of cluster membership (as in e.g. HDBSCAN), or cluster
centrality (as in e.g. K-Means).
ignore : tuple, list
Which clusters should be ignored. By default, this is -1, as most
clustering algorithms assign -1 to points not in any cluster.
"""
def __init__(
    self,
    features,
    cluster_indices,
    representativeness=None,
    ignore=(-1,),
    **kwargs
):
    """Create a labelling widget.

    Builds the base widgets, fills a ``ClusterLabellingQueue`` and starts
    the annotation generator. Extra keyword arguments are forwarded to the
    base class constructor.
    """
    # NOTE(review): ``ignore`` is accepted and documented on the class, but
    # it is never read in this constructor - clusters listed there are still
    # queued. Confirm the intended behaviour.
    super().__init__(features, **kwargs)
    self.queue = ClusterLabellingQueue(
        features, cluster_indices, representativeness
    )
    self._annotation_loop = self._annotation_iterator()
    # run the generator up to its first ``yield`` (displays the first cluster)
    next(self._annotation_loop)
    self._compose()
def _annotation_iterator(self):
    """
    The method that iterates over the clusters and presents them for
    annotation.

    This is a generator: after displaying a cluster it pauses at ``yield``
    until a submission dict is sent in. The dict's "source" key selects
    undo ("__undo__"), skip ("__skip__") or, for anything else, labelling
    the cluster with the dict's "value".
    """
    self.progressbar.bar_style = ""
    for cluster_index, data in self.queue:
        self._display(data)
        # wait for the next submission (delivered via generator.send)
        sender = yield
        if sender["source"] == "__undo__":
            # unpop the current item:
            self.queue.undo()
            # unpop and unlabel the previous item:
            self.queue.undo()
            # try to remove any labels not in the assigned labels:
            self.input_widget.remove_options(
                set(self.input_widget.options) - self.queue.list_labels()
            )
        elif sender["source"] == "__skip__":
            pass
        else:
            new_label = sender["value"]
            self.queue.submit(cluster_index, new_label)
            # self.input_widget.add_hint(new_label, datapoint)

        self.progressbar.value = self.queue.progress

    # the queue is exhausted: stop keyboard capture and show the end screen
    if self.event_manager is not None:
        self.event_manager.close()

    yield self._render_finished()
@property
def new_clusters(self):
    """The queue's mapping from cluster index to assigned label."""
    queue = self.queue
    return queue.cluster_labels
PK f=OYR R superintendent/display.py"""Helper functions for displaying types of data."""
from typing import Callable, Dict
import IPython.display
import numpy as np
from matplotlib import pyplot as plt
def default_display_func(feature):
    """
    Default display function: hand the object to ``IPython.display.display``.

    Parameters
    ----------
    feature : np.ndarray, pd.Series, pd.DataFrame
        The feature(s) you want to display
    """
    # n_samples = min(n_samples, feature.shape[0])
    IPython.display.display(feature)
def image_display_func(image, imsize=None):
    """
    Show a single image with matplotlib's ``imshow``.

    Parameters
    ----------
    image : np.ndarray
        The pixel data of one image.
    imsize : tuple or "square", optional
        The shape passed to ``reshape``. With "square" the image is
        reshaped to (s, s) where s is the integer square root of
        ``image.size``. With None the array is shown as it is.
    """
    _, axis = plt.subplots(1, 1)
    if imsize == "square":
        side = int(np.sqrt(image.size))
        image = image.reshape([side, side])
    elif imsize is not None:
        image = image.reshape(imsize)
    axis.imshow(image, cmap="binary")
    axis.axis("off")
    plt.show()
# Display functions by name; "img" and "image" refer to the same function.
functions: Dict[str, Callable] = {
    "default": default_display_func,
    "image": image_display_func,
    "img": image_display_func,
}
PK f=OUP
P
superintendent/prioritisation.py"""
Functions to prioritise labelling data points (to drive active learning).
"""
import numpy as np
import scipy.stats
def _shuffle_subset(data: np.ndarray, shuffle_prop: float) -> np.ndarray:
    """Permute a random subset of the rows of ``data`` among themselves.

    Each row is selected independently with probability ``shuffle_prop``.
    ``data`` is modified in place and also returned.
    """
    selected = np.random.rand(data.shape[0]) < shuffle_prop
    chosen = np.flatnonzero(selected)
    data[chosen, ...] = data[np.random.permutation(chosen), ...]
    return data
def entropy(
    probabilities: np.ndarray, shuffle_prop: float = 0.1
) -> np.ndarray:
    """
    Rank samples by the entropy of their probabilities (high to low).

    Parameters
    ----------
    probabilities : np.ndarray
        An array of probabilities, with the shape n_samples,
        n_classes
    shuffle_prop : float
        The proportion of data points that should be randomly shuffled. This
        means the sorting retains some randomness, to avoid biasing your
        new labels and catching any minority classes the algorithm currently
        classifies as a different label.

    Returns
    -------
    np.ndarray
        One rank per sample (0 for the highest entropy), with a random
        subset of the ranks shuffled.
    """
    uncertainty = scipy.stats.entropy(probabilities.T)
    descending = np.argsort(-uncertainty)
    # argsort of an argsort turns the ordering into per-sample ranks
    return _shuffle_subset(descending.argsort(), shuffle_prop)
def margin(probabilities, shuffle_prop=0.1):
    """
    Rank samples by the margin between the top two predictions (low to high).

    Parameters
    ----------
    probabilities : np.ndarray
        An array of probabilities, with the shape n_samples,
        n_classes
    shuffle_prop : float
        The proportion of data points that should be randomly shuffled. This
        means the sorting retains some randomness, to avoid biasing your
        new labels and catching any minority classes the algorithm currently
        classifies as a different label.

    Returns
    -------
    np.ndarray
        One rank per sample (0 for the smallest margin), with a random
        subset of the ranks shuffled.
    """
    ascending = np.sort(probabilities, axis=1)
    gap = ascending[:, -1] - ascending[:, -2]
    order = np.argsort(gap)
    return _shuffle_subset(order.argsort(), shuffle_prop)
def certainty(probabilities, shuffle_prop=0.1):
    """
    Rank samples by the certainty of the maximum prediction (low to high).

    Parameters
    ----------
    probabilities : np.ndarray
        An array of probabilities, with the shape n_samples,
        n_classes
    shuffle_prop : float
        The proportion of data points that should be randomly shuffled. This
        means the sorting retains some randomness, to avoid biasing your
        new labels and catching any minority classes the algorithm currently
        classifies as a different label.

    Returns
    -------
    np.ndarray
        One rank per sample (0 for the least certain), with a random subset
        of the ranks shuffled.
    """
    top_probability = np.max(probabilities, axis=1)
    order = np.argsort(top_probability)
    return _shuffle_subset(order.argsort(), shuffle_prop)
# Prioritisation functions by name; each takes (probabilities, shuffle_prop).
functions = {"entropy": entropy, "margin": margin, "certainty": certainty}
"""A dictionary of functions to prioritise data."""
PK f=O/jH H superintendent/queueing.pyimport abc
import itertools
import operator
from collections import defaultdict, deque, namedtuple
from functools import reduce
from random import shuffle
from typing import Any, DefaultDict, Deque, Dict, List, Optional, Set, Tuple
import numpy as np
import pandas as pd
class BaseLabellingQueue(abc.ABC):  # pragma: no cover
    """Interface that every labelling queue has to implement."""

    @abc.abstractmethod
    def enqueue(self, feature: Any, label: Optional[Any] = None) -> None:
        """Add a data point to the queue.

        Parameters
        ----------
        feature : Any
            A data point to be added to the queue
        label : str, list, optional
            The label, if you already have one (the default is None)
        """

    @abc.abstractmethod
    def pop(self) -> Tuple[int, Any]:
        """Pop an item off the queue.

        Returns
        -------
        int
            The ID of the item just popped
        Any
            The item itself.
        """

    @abc.abstractmethod
    def submit(self, id_: int, label: str) -> None:
        """Label a data point.

        Parameters
        ----------
        id_ : int
            The ID of the datapoint to submit a label for
        label : str
            The label to apply for the data point

        Raises
        ------
        ValueError
            If you attempt to label an item that hasn't been popped in this
            queue.
        """

    @abc.abstractmethod
    def reorder(self, new_order: Dict[int, int]) -> None:
        """Reorder the data still in the queue.

        Parameters
        ----------
        new_order : Dict[int, int]
            A mapping from ID of an item to the order of the item. For
            example, a dictionary {1: 2, 2: 1, 3: 3} would place the item
            with ID 2 first, then the item with id 1, then the item with
            ID 3.
        """

    @abc.abstractmethod
    def undo(self) -> None:
        """Un-pop the latest item."""

    @abc.abstractmethod
    def list_completed(self):
        """List all items with a label.

        Returns
        -------
        ids : List[int]
            The IDs of the returned items.
        x : Any
            The data points that have labels.
        y : Any
            The labels.
        """

    @abc.abstractmethod
    def list_uncompleted(self):
        """List all items without a label.

        Returns
        -------
        ids : List[int]
            The IDs of the returned items.
        x : Any
            The data points that don't have labels.
        """

    @abc.abstractmethod
    def list_labels(self):
        """List all the labels.

        Returns
        -------
        Set[str]
            All the labels.
        """

    @abc.abstractmethod
    def __iter__(self):
        """Return an iterator over the queue."""

    @abc.abstractmethod
    def __next__(self):
        """Return the next item of the queue."""
class SimpleLabellingQueue(BaseLabellingQueue):
item = namedtuple("QueueItem", ["id", "data", "label"])
def __init__(self, features: Any = None, labels: Any = None):
    """Create an in-memory labelling queue.

    Parameters
    ----------
    features : Any, optional
        Features to be added to the queue. You can either provide them
        here, or later using the enqueue_many method (the default is None).
    labels : Any, optional
        Labels for the features to be added to the queue. You can either
        provide them here, or later using the enqueue_many method
        (the default is None).
    """
    # id -> data point, and id -> label for the labelled ones
    self.data: Dict[int, Any] = {}
    self.labels: Dict[int, Any] = {}
    # ids still waiting (popped from the right) and ids already popped
    self.order: Deque[int] = deque()
    self._popped: Deque[int] = deque()

    if features is None:
        return
    self.enqueue_many(features, labels)
def enqueue(self, feature: Any, label: Optional[Any] = None) -> None:
    """Add a data point to the queue.

    The new item gets the ID one above the current highest ID (0 for an
    empty queue). Items that arrive with a label are stored as labelled and
    are not put in the waiting order.

    Parameters
    ----------
    feature : Any
        A data point to be added to the queue
    label : str, list, optional
        The label, if you already have one (the default is None)
    """
    next_id = max(self.data) + 1 if self.data else 0
    self.data[next_id] = feature

    if label is None:
        self.order.appendleft(next_id)
    else:
        self.labels[next_id] = label
def enqueue_many(self, features: Any, labels=None) -> None:
    """Add a bunch of items to the queue.

    Parameters
    ----------
    features : Any
        An iterable of data points. A pandas DataFrame is split into its
        rows, so each row becomes one item.
    labels : Any, optional
        An iterable of labels paired with ``features`` in order. If None
        (the default), every item is enqueued without a label.
    """
    if isinstance(features, pd.DataFrame):
        features = [row for _, row in features.iterrows()]

    label_stream = itertools.repeat(None) if labels is None else labels
    for feature, label in zip(features, label_stream):
        self.enqueue(feature, label)
def pop(self) -> Tuple[int, Any]:
    """Pop an item off the queue.

    The ID is taken from the right end of the waiting order and recorded as
    popped.

    Returns
    -------
    int
        The ID of the item you just popped
    Any
        The item itself.
    """
    popped_id = self.order.pop()
    self._popped.append(popped_id)
    item = self.data[popped_id]
    return popped_id, item
def submit(self, id_: int, label: str) -> None:
    """Label a data point.

    Parameters
    ----------
    id_ : int
        The ID of the datapoint to submit a label for
    label : str
        The label to apply for the data point

    Raises
    ------
    ValueError
        If you attempt to label an item that hasn't been popped in this
        queue.
    """
    was_popped = id_ in self._popped
    if was_popped:
        self.labels[id_] = label
        return
    raise ValueError("This item was not popped; you cannot label it.")
def reorder(self, new_order: Dict[int, int]) -> None:
    """Reorder the data still in the queue.

    The waiting order is replaced by the IDs in ``new_order``. IDs are
    stored from highest to lowest order value, so the lowest value sits at
    the right end, which is where ``pop`` takes items from.

    Parameters
    ----------
    new_order : Dict[int, int]
        A mapping from ID of an item to the order of the item. For example,
        a dictionary {1: 2, 2: 1, 3: 3} would place the item with ID 2
        first, then the item with id 1, then the item with ID 3.
    """
    by_position = sorted(
        new_order.items(), key=operator.itemgetter(1), reverse=True
    )
    self.order = deque(id_ for id_, _ in by_position)
def shuffle(self) -> None:
    """Shuffle the IDs that are still waiting in the queue.

    Returns
    -------
    None
    """
    _order = list(self.order)
    # ``shuffle`` here is the module-level import from ``random``, not this
    # method; it shuffles the list in place.
    shuffle(_order)
    self.order = deque(_order)
def undo(self) -> None:
    """Un-pop the latest item.

    The most recently popped ID loses its label (if it has one) and is put
    back at the right end of the waiting order, so it is popped next. Does
    nothing when no item has been popped.
    """
    if not self._popped:
        return
    last_id = self._popped.pop()
    self.labels.pop(last_id, None)
    self.order.append(last_id)
def list_completed(self):
    """List all popped items that have a label.

    Only IDs that were popped are considered, in ascending ID order.

    Returns
    -------
    ids : List[int]
        The IDs of the returned items.
    x : Any
        The data points that have labels.
    y : Any
        The labels.
    """
    ids = [id_ for id_ in sorted(self._popped) if id_ in self.labels]
    x = _features_to_array([self.data[id_] for id_ in ids])
    y = [self.labels[id_] for id_ in ids]
    return ids, x, y
def list_uncompleted(self):
    """List all waiting items without a label.

    Only IDs still in the waiting order are considered, in ascending ID
    order.

    Returns
    -------
    ids : List[int]
        The IDs of the returned items.
    x : Any
        The data points that don't have labels.
    """
    ids = [id_ for id_ in sorted(self.order) if id_ not in self.labels]
    x = _features_to_array([self.data[id_] for id_ in ids])
    return ids, x
def list_all(self):
    """List all items.

    Returns
    -------
    ids : List[int]
        The IDs of the returned items.
    x : Any
        The data points.
    y : Any
        The labels; None for items without one.
    """
    ids = list(self.data)
    x = _features_to_array([self.data[id_] for id_ in ids])
    y = [self.labels.get(id_) for id_ in ids]
    return ids, x, y
def list_labels(self) -> Set[str]:
    """List all the labels.

    Hashable labels (e.g. strings) are collected as they are. Unhashable
    labels (e.g. lists, as used for multi-label data) contribute each of
    their elements. Mixing both kinds, or labels of different types, is
    supported.

    Returns
    -------
    Set[str]
        All the labels.
    """
    found = set()
    for label in self.labels.values():
        try:
            found.add(label)
        except TypeError:
            # unhashable, so treat it as a collection of labels
            found.update(label)
    return found
@property
def progress(self) -> float:
    """The queue progress: labelled items divided by all items (0 if empty)."""
    total = len(self.data)
    if total == 0:
        return 0
    return len(self.labels) / total
def __len__(self):
    """Return the number of items still waiting in the queue."""
    waiting = self.order
    return len(waiting)
def __iter__(self):
    """Return the queue itself; it is its own iterator (see ``__next__``)."""
    return self
def __next__(self):
    """Pop the next item; raise StopIteration once the queue is empty."""
    try:
        popped = self.pop()
    except IndexError:
        raise StopIteration
    return popped
class ClusterLabellingQueue(BaseLabellingQueue):
def __init__(
    self,
    features: Any = None,
    cluster_indices: Any = None,
    representativeness=None,
):
    """Create a queue for labelling clusters.

    Parameters
    ----------
    features : Any
        The features you'd like to add to the queue.
    cluster_indices : Any
        The clusters that each of the data points belong to. This should
        match the features in length.
    representativeness : Any, optional
        The respective cluster representativeness of each data point. This
        could be distance from cluster center, probability of cluster
        membership, or a similar metric.
    """
    # cluster index -> data points / their representativeness values
    self.data: DefaultDict[Any, List[Any]] = defaultdict(list)
    self.representativeness: DefaultDict[Any, List[Any]] = defaultdict(
        list
    )
    # cluster index -> assigned label
    self.cluster_labels: Dict[Any, str] = {}
    # clusters still waiting (popped from the right) and already popped
    self.order: Deque[int] = deque()
    self._popped: Deque[int] = deque()

    if features is None:
        return
    self.enqueue_many(features, cluster_indices, representativeness)
def enqueue_many(self, features, cluster_indices, representativeness=None):
    """Add items to the queue.

    Parameters
    ----------
    features : Any
        The features you'd like to add to the queue. A pandas DataFrame is
        split into its rows.
    cluster_indices : Any
        The clusters that each of the data points belong to. This should
        match the features in length.
    representativeness : Any, optional
        The respective cluster representativeness of each data point. This
        could be distance from cluster center, probability of cluster
        membership, or a similar metric. Defaults to NaN for every point.
    """
    if isinstance(features, pd.DataFrame):
        features = [row for _, row in features.iterrows()]

    if representativeness is None:
        representativeness = np.full(len(features), np.nan)

    triples = zip(cluster_indices, features, representativeness)
    for cluster_index, feature, score in triples:
        self.enqueue(cluster_index, feature, score)
def enqueue(self, cluster_index, feature, representativeness=None):
    """Add an item to the queue.

    The cluster is added to the waiting order only if it is not already in
    it.

    Parameters
    ----------
    cluster_index : Any
        The cluster index
    feature : Any
        The data to be added to the queue.
    representativeness : float, optional
        The respective representativeness of the data point. This
        could be distance from cluster center, probability of cluster
        membership, or a similar metric. None (the default) is stored as
        NaN.
    """
    score = np.nan if representativeness is None else representativeness
    self.data[cluster_index].append(feature)
    self.representativeness[cluster_index].append(score)

    if cluster_index not in self.order:
        self.order.appendleft(cluster_index)
def pop(self):
    """Pop a cluster off the queue.

    The cluster's data points are returned in ascending order of their
    representativeness value.

    Returns
    -------
    id_ : int
        The ID of the cluster.
    features : Any
        The data points that are in this cluster.
    """
    cluster_id = self.order.pop()
    self._popped.append(cluster_id)

    scored = zip(self.representativeness[cluster_id], self.data[cluster_id])
    ranked = sorted(scored, key=operator.itemgetter(0))
    members = [feature for _, feature in ranked]
    return cluster_id, _features_to_array(members)
def submit(self, cluster_index, cluster_label):
    """Submit a label for a cluster.

    Parameters
    ----------
    cluster_index : Any
        The cluster you are trying to label.
    cluster_label : str
        The label for the cluster

    Raises
    ------
    ValueError
        If you are trying to label a cluster you haven't popped off the
        queue.
    """
    was_popped = cluster_index in self._popped
    if was_popped:
        self.cluster_labels[cluster_index] = cluster_label
        return
    raise ValueError("This item was not popped; you cannot label it.")
def reorder(self, new_order=None):
    """Re-order the queue. This is currently not implemented (a no-op).

    Parameters
    ----------
    new_order : Dict[Any, int], optional
        Accepted so that the call signature matches
        ``BaseLabellingQueue.reorder``; the value is ignored.
    """
    return None
def shuffle(self) -> None:
    """Shuffle the clusters that are still waiting in the queue."""
    _order = list(self.order)
    # ``shuffle`` here is the module-level import from ``random``, not this
    # method; it shuffles the list in place.
    shuffle(_order)
    self.order = deque(_order)
def undo(self) -> None:
    """Unpop the most recently popped cluster.

    The cluster loses its label (if it has one) and goes back to the right
    end of the waiting order, so it is popped next. Does nothing when no
    cluster has been popped.
    """
    if not self._popped:
        return
    last_cluster = self._popped.pop()
    self.cluster_labels.pop(last_cluster, None)
    self.order.append(last_cluster)
def list_completed(self):
    """List the data that has been assigned a cluster label.

    The three returned sequences are aligned: one entry per data point.

    Returns
    -------
    cluster_indices
        The indices of the clusters.
    features
        The features that have been assigned a label.
    cluster_labels
        The assigned cluster labels.
    """
    cluster_indices, features, cluster_labels = [], [], []
    for idx, values in self.data.items():
        if idx not in self.cluster_labels:
            continue
        label = self.cluster_labels[idx]
        for data in values:
            cluster_indices.append(idx)
            features.append(data)
            cluster_labels.append(label)
    return cluster_indices, _features_to_array(features), cluster_labels
def list_uncompleted(self):
    """List the data that has not yet been assigned a label.

    Returns
    -------
    cluster_indices
        The indices of the clusters the data points are in.
    features
        The data in the unlabelled features.
    """
    cluster_indices, features = [], []
    for idx, values in self.data.items():
        if idx in self.cluster_labels:
            continue
        for data in values:
            cluster_indices.append(idx)
            features.append(data)
    return cluster_indices, _features_to_array(features)
def list_all(self):
    """List all data.

    Returns
    -------
    cluster_indices
        The indices of the clusters the data points are in.
    features
        The data.
    cluster_labels
        The assigned cluster labels; None for unlabelled clusters.
    """
    cluster_indices, features, cluster_labels = [], [], []
    for idx, values in self.data.items():
        label = self.cluster_labels.get(idx)
        for data in values:
            cluster_indices.append(idx)
            features.append(data)
            cluster_labels.append(label)
    return cluster_indices, _features_to_array(features), cluster_labels
@property
def progress(self):
    """How much of the queue has been completed.

    Returns
    -------
    progress : float
        Labelled clusters divided by all clusters; NaN if there are no
        clusters.
    """
    n_clusters = len(self.data)
    if n_clusters == 0:
        return np.nan
    return len(self.cluster_labels) / n_clusters
def list_labels(self):
    """List all the cluster labels assigned so far.

    Hashable labels (e.g. strings) are collected as they are. Unhashable
    labels (e.g. lists) contribute each of their elements. Mixing both
    kinds, or labels of different types, is supported.

    Returns
    -------
    set
        All the labels.
    """
    found = set()
    for label in self.cluster_labels.values():
        try:
            found.add(label)
        except TypeError:
            # unhashable, so treat it as a collection of labels
            found.update(label)
    return found
def __iter__(self):
    """Return the queue itself; it is its own iterator (see ``__next__``)."""
    return self
def __len__(self):
    """Return the number of clusters still waiting in the queue."""
    waiting = self.order
    return len(waiting)
def __next__(self):
    """Pop the next cluster; raise StopIteration once the queue is empty."""
    try:
        popped = self.pop()
    except IndexError:
        raise StopIteration
    return popped
def _features_to_array(features: list):
    """Combine a list of features into one container where possible.

    Parameters
    ----------
    features : list
        A list of features to be converted to an array or dataframe.

    Returns
    -------
    features : Any
        A DataFrame if every item is a pandas Series (one row each) or a
        DataFrame (concatenated), a stacked array if every item is a numpy
        array, and the list itself otherwise (including when it is empty).
    """
    if len(features) == 0:
        return features

    def _every(kind):
        return all(isinstance(item, kind) for item in features)

    if _every(pd.Series):
        return pd.concat([row.to_frame().T for row in features])
    if _every(pd.DataFrame):
        return pd.concat(features)
    if _every(np.ndarray):
        return np.stack(features)
    return features
PK f=O:+# # superintendent/semisupervisor.py"""Tools to supervise classification."""
import warnings
from collections import OrderedDict
from functools import partial
import ipywidgets as widgets
import sklearn.model_selection
from . import base, prioritisation, validation
from .queueing import SimpleLabellingQueue
class SemiSupervisor(base.Labeller):
"""
A widget for labelling your data.
This class is designed to label data for (semi-)supervised learning
algorithms. It allows you to label data. In the future, it will also allow
you to re-train an algorithm.
Parameters
----------
features : list, np.ndarray, pd.Series, pd.DataFrame, optional
An array or sequence of data in which each element (if 1D) or each row
(if 2D) represents one data point for which you'd like to generate
labels.
labels : list, np.ndarray, pd.Series, pd.DataFrame, optional
If you already have some labels, but would like to re-label some, then
you can pass these in as labels.
options : tuple, list
The options presented for labelling.
display_func : callable, optional
A function that will be used to display the data. This function should
take in two arguments, first the data to display, and second the number
of data points to display (set to 1 for this class).
classifier : sklearn.base.ClassifierMixin, optional
An object that implements the standard sklearn fit/predict methods. If
provided, a button for retraining the model is shown, and the model
performance under k-fold crossvalidation can be read as you go along.
eval_method : callable, optional
A function that accepts the classifier, features, and labels as input
and returns a dictionary of values that contain the key 'test_score'.
The default is sklearn.model_selection.cross_validate, with cv=3. Use
functools.partial to create a function with its parameters fixed.
reorder : str, callable, optional
One of the reordering algorithms specified in
:py:mod:`superintendent.prioritisation`. This describes a function that
receives input in the shape of n_samples, n_labels and calculates the
priority in terms of information value in labelling a data point.
shuffle_prop : float
The proportion of points that are shuffled when the data points are
re-ordered (see reorder keyword-argument). This controls the
"exploration vs exploitation" trade-off - the higher, the more you
explore the feature space randomly, the lower, the more you exploit
your current weak points.
hints : dict, optional
A dictionary mapping class labels to example data points from that
class. Hints are displayed with the same function as the main data, so
should be in the same format.
keyboard_shortcuts : bool
Whether keyboard shortcuts should be enabled for this widget.
"""
def __init__(
    self,
    features=None,
    labels=None,
    options=(),
    classifier=None,
    display_func=None,
    eval_method=None,
    reorder=None,
    shuffle_prop=0.1,
    hints=None,
    keyboard_shortcuts=False,
    *args,
    **kwargs
):
    """
    A class for labelling your data.

    This class is designed to label data for (semi-)supervised learning
    algorithms. It allows you to label data, periodically re-train your
    algorithm and assess its performance, and determine which data points
    to label next based on your model's predictions.

    See the class docstring for the parameters. Extra keyword arguments
    are forwarded to the base class constructor.

    Raises
    ------
    ValueError
        If ``classifier`` does not pass ``validation.valid_classifier``, if
        ``eval_method`` is given but not callable, or if ``reorder`` is
        neither None, a string, nor a callable.
    NotImplementedError
        If ``reorder`` is a string that is not a key of
        ``prioritisation.functions``.
    """
    # NOTE(review): ``*args`` is forwarded together with ``features=`` etc.
    # as keywords, so any extra positional argument would be bound to the
    # base class's leading parameters as well - confirm it is always empty.
    super().__init__(
        features=features,
        labels=labels,
        display_func=display_func,
        options=options,
        keyboard_shortcuts=keyboard_shortcuts,
        hints=hints,
        *args,
        **kwargs
    )

    self.queue = SimpleLabellingQueue(features, labels)

    self.shuffle_prop = shuffle_prop

    self.classifier = validation.valid_classifier(classifier)
    if self.classifier is not None:
        # add a retrain button and a score readout next to the progress bar
        self.retrain_button = widgets.Button(
            description="Retrain",
            disabled=False,
            button_style="",
            tooltip="Click me",
            icon="refresh",
        )
        self.retrain_button.on_click(self.retrain)
        self.model_performance = widgets.HTML(value="")
        self.top_bar.children = (
            widgets.HBox(
                [*self.top_bar.children],
                layout=widgets.Layout(width="50%"),
            ),
            widgets.HBox(
                [self.retrain_button, self.model_performance],
                layout=widgets.Layout(width="50%"),
            ),
        )

    # default evaluation: 3-fold cross-validation without train scores
    if eval_method is None:
        self.eval_method = partial(
            sklearn.model_selection.cross_validate,
            cv=3,
            # n_jobs=-1,
            return_train_score=False,
        )
    elif not callable(eval_method):
        raise ValueError("The eval_method needs to be a callable.")
    else:
        self.eval_method = eval_method

    # resolve ``reorder``: a name is looked up in prioritisation.functions,
    # a callable is used as it is, None disables re-ordering
    if reorder is not None and isinstance(reorder, str):
        if reorder not in prioritisation.functions:
            raise NotImplementedError(
                "Unknown reordering function '{}'.".format(reorder)
            )
        self.reorder = prioritisation.functions[reorder]
    elif reorder is not None and callable(reorder):
        self.reorder = reorder
    elif reorder is None:
        self.reorder = None
    else:
        raise ValueError(
            "The reorder argument needs to be either a function or the "
            "name of a function listed in superintendent.prioritisation."
        )

    self._annotation_loop = self._annotation_iterator()
    # run the generator up to its first ``yield`` (displays the first item)
    next(self._annotation_loop)
    self._compose()
def _annotation_iterator(self):
    """Generator that presents each queued data point for annotation.

    After displaying a data point it pauses at ``yield`` until a submission
    dict is sent in. The dict's "source" key selects undo ("__undo__"),
    skip ("__skip__") or, for anything else, labelling the data point with
    the dict's "value".
    """
    self.progressbar.bar_style = ""
    for id_, datapoint in self.queue:
        self._display(datapoint)
        # wait for the next submission (delivered via generator.send)
        sender = yield
        if sender["source"] == "__undo__":
            # unpop the current item:
            self.queue.undo()
            # unpop and unlabel the previous item:
            self.queue.undo()
            # try to remove any labels not in the assigned labels:
            self.input_widget.remove_options(
                set(self.input_widget.options) - self.queue.list_labels()
            )
        elif sender["source"] == "__skip__":
            pass
        else:
            new_label = sender["value"]
            self.queue.submit(id_, new_label)
            # self.input_widget.add_hint(new_label, datapoint)

        self.progressbar.value = self.queue.progress

    # the queue is exhausted: stop keyboard capture and show the end screen
    if self.event_manager is not None:
        self.event_manager.close()

    yield self._render_finished()
def retrain(self, *args):
    """Retrain the classifier you passed when creating this widget.

    This calls the fit method of your class with the data that you've
    labelled. It will also score the classifier and display the
    performance. If a reorder function is set, the remaining items are
    re-prioritised from the classifier's ``predict_proba`` output.

    Parameters
    ----------
    *args
        Ignored; present so the method can be used as a button callback.

    Raises
    ------
    ValueError
        If the widget was created without a classifier.
    """
    if self.classifier is None:
        raise ValueError("No classifier to retrain.")

    # fewer than two distinct labels: report it and do nothing
    if len(self.queue.list_labels()) < 2:
        self.model_performance.value = (
            "Score: Not enough labels to retrain."
        )
        return

    _, labelled_X, labelled_y = self.queue.list_completed()

    self._render_processing(message="Retraining... ")

    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.performance = self.eval_method(
                self.classifier, labelled_X, labelled_y
            )
            self.model_performance.value = "Score: {:.2f}".format(
                self.performance["test_score"].mean()
            )

    except ValueError:  # pragma: no cover
        # evaluation failed; show a placeholder instead of a score
        self.performance = "Could not evaluate"
        self.model_performance.value = "Score: {}".format(self.performance)

    self.classifier.fit(labelled_X, labelled_y)

    if self.reorder is not None:
        # the item currently on display is popped, so it is not in this list
        ids, unlabelled_X = self.queue.list_uncompleted()

        reordering = list(
            self.reorder(
                self.classifier.predict_proba(unlabelled_X),
                shuffle_prop=self.shuffle_prop,
            )
        )

        # map each waiting id to the priority value computed for it
        new_order = OrderedDict(
            [(id_, index) for id_, index in zip(ids, list(reordering))]
        )

        self.queue.reorder(new_order)

    # undo the previously popped item and pop the next one
    self.queue.undo()
    self._annotation_loop.send({"source": "__skip__"})
    # self._compose()
PK f=O superintendent/validation.py"""Functions to validate arguments."""
from typing import Any
# import numpy as np
# import pandas as pd
def valid_classifier(classifier: Any):
    """
    Check if an object conforms to sklearns fit / predict interface.

    Parameters
    ----------
    classifier : sklearn.base.ClassifierMixin
        A classification model compliant with sklearn interfaces.

    Returns
    -------
    The classifier itself, or None if None was passed.

    Raises
    ------
    ValueError
        If the object lacks a ``fit`` or a ``predict_proba`` attribute.
    """
    if classifier is None:
        return None

    required = ("fit", "predict_proba")
    if all(hasattr(classifier, attribute) for attribute in required):
        return classifier

    raise ValueError(
        "The classifier needs to conform to "
        "the sklearn interface (fit/predict_proba)."
    )
# def valid_data(features: Optional[Any]):
# """
# Check if an object is an array or can be turned into one.
# Parameters
# ----------
# features : pd.DataFrame, pd.Series, np.ndarray
# the data to double-check.
# """
# if features is None:
# return None
# if isinstance(
# features, (pd.DataFrame, pd.Series, np.ndarray, list, tuple)
# ):
# return features
# # elif isinstance(features, (list, tuple)):
# # return np.array(features)
# else:
# raise ValueError(
# "The features need to be an array, sequence, or "
# "a pandas DataFrame / Series."
# )
PK e=O+ # superintendent/controls/__init__.pyfrom .multiclasssubmitter import MulticlassSubmitter # noqa
from .submitter import Submitter # noqa
from .timer import Timer # noqa
PK e=O2k" " &