PK!superintendent/__init__.pyfrom .clustersupervisor import ClusterSupervisor # noqa from .semisupervisor import SemiSupervisor # noqa from .multioutput import MultiLabeller # noqa from .version import __version__ # noqa PK!%xxsuperintendent/base.py"""Base class to inherit from.""" import abc from functools import partial from typing import Any, Callable, Dict, Optional, Tuple import ipyevents import IPython.display import ipywidgets as widgets import numpy as np import traitlets from . import controls, display # class AbstractTraitletMetaclass(traitlets.HasTraits, metaclass=abc.ABCMeta): # pass class Labeller(traitlets.HasTraits): """ Data point labelling. This class allows you to label individual data points. Parameters ---------- features : np.array | pd.DataFrame | list The input array for your model labels : np.array, pd.Series, pd.DataFrame, optional The labels for your data. options : Tuple[str] The label options you'd like the user to be shown. These will be presented as either buttons or in a dropdown. other_option : bool Whether or not a text field for supplying a different label should be shown. max_buttons : int How many buttons should be displayed before it switches to a non- button based interface. display_func : str, func, optional Either a function that accepts one row of features and returns what should be displayed with IPython's `display`, or a string that is any of 'img', 'image'. keyboard_shortcuts : bool, optional If you want to enable ipyevent-mediated keyboard capture to use the keyboard rather than the mouse to submit data. hint_function : func, optional The function to display these hints. By default, the same function as display_func is used. hints : np.array | pd.DataFrame | list The hints to start off with. 
""" options = traitlets.List(list(), allow_none=True) def __init__( self, features: Optional[Any] = None, labels: Optional[Any] = None, options: Tuple[str] = (), other_option: bool = True, max_buttons: int = 12, display_func: Callable = None, keyboard_shortcuts: bool = False, hint_function: Optional[Callable] = None, hints: Optional[Dict[str, Any]] = None, ): """ Make a class that allows you to label data points. """ # the widget elements self.layout = widgets.VBox([]) self.feature_output = widgets.Output() self.feature_display = widgets.Box( (self.feature_output,), layout=widgets.Layout( justify_content="center", padding="5% 0", display="flex", width="100%", min_height="150px", ), ) self.input_widget = controls.Submitter( hint_function=hint_function, hints=hints, options=options, other_option=other_option, max_buttons=max_buttons, ) self.input_widget.on_submission(self._apply_annotation) self.options = self.input_widget.options traitlets.link((self, "options"), (self.input_widget, "options")) # self.features = validation.valid_data(features) self.features = features # if labels is not None: # self.labels = validation.valid_data(labels) # elif self.features is not None: # self.labels = np.full( # self.features.shape[0], np.nan, dtype=float) self.labels = labels self.progressbar = widgets.FloatProgress( max=1, description="Progress:" ) self.top_bar = widgets.HBox([]) self.top_bar.children = [self.progressbar] if display_func is not None: self._display_func = display_func else: self._display_func = display.functions["default"] if keyboard_shortcuts: self.event_manager = ipyevents.Event( source=self.layout, watched_events=["keydown", "keyup"] ) self.event_manager.on_dom_event(self.input_widget._on_key_down) else: self.event_manager = None self.timer = controls.Timer() @abc.abstractmethod def _annotation_iterator(self): pass @classmethod def from_images(cls, *args, image_size=None, **kwargs): """Generate a labelling widget from an image array. 
Parameters ---------- features : np.ndarray A numpy array of shape n_images, n_pixels image_size : tuple The actual size to reshape each row of the features into. Returns ------- type Description of returned object. """ if image_size is None and "features" in kwargs: features = kwargs["features"] # check the input is in the correct format: if not isinstance(features, np.ndarray): raise TypeError( "When using from_images, input features " "needs to be a numpy array with shape " "(n_features, n_pixel)." ) # check if image is square if int(np.sqrt(features.shape[1])) ** 2 == features.shape[1]: image_size = "square" else: raise ValueError( "If image_size is None, the image needs to be square, but " "yours has " + str(features.shape[1]) + " pixels." ) elif image_size is None and "features" not in kwargs: # just assume images will be square image_size = "square" kwargs["display_func"] = kwargs.get( "display_func", partial(display.functions["image"], imsize=image_size), ) instance = cls(*args, **kwargs) return instance def _apply_annotation(self, sender): self._annotation_loop.send(sender) def add_features(self, features, labels=None): """ Add data to the widget. This adds the data provided to the queue of data to be labelled. You Can optionally provide labels for each data point. Parameters ---------- features : Any The data you'd like to add to the labelling widget. labels : Any, optional The labels for the data you're adding; if you have labels. 
""" self.queue.enqueue_many(features, labels=labels) # reset the iterator self._annotation_loop = self._annotation_iterator() self.queue.undo() next(self._annotation_loop) self._compose() def _display(self, feature): if feature is not None: if self.timer > 0.5: self._render_processing() with self.timer: with self.feature_output: IPython.display.clear_output(wait=True) self._display_func(feature) def _compose(self): self.layout.children = [ self.top_bar, self.feature_display, self.input_widget, ] return self def _render_processing(self, message="Rendering..."): with self.feature_output: IPython.display.clear_output(wait=True) IPython.display.display( widgets.HTML( "

{}".format(message) + '' ) ) def _render_finished(self): self.progressbar.bar_style = "success" with self.feature_output: IPython.display.clear_output(wait=True) IPython.display.display(widgets.HTML(u"

Finished labelling 🎉!")) self.layout.children = [self.progressbar, self.feature_display] return self @property def new_labels(self): _, _, labels = self.queue.list_all() return labels def _ipython_display_(self): IPython.display.display(self.layout) PK! -  #superintendent/clustersupervisor.py# -*- coding: utf-8 -*- """Tools to supervise your clustering.""" from . import base from .queueing import ClusterLabellingQueue class ClusterSupervisor(base.Labeller): """ A widget for labelling clusters. Parameters ---------- features : np.ndarray, pd.Series. pd.DataFrame Your features. cluster_indices : np.ndarray, pd.Series The cluster label for each data point. representativeness : np.ndarray, pd.Series How representative of a cluster your data points are. This can be the probability of cluster membership (as in e.g. HDBSCAN), or cluster centrality (as in e.g. K-Means). ignore : tuple, list Which clusters should be ignored. By default, this is -1, as most clustering algorithms assign -1 to points not in any cluster. """ def __init__( self, features, cluster_indices, representativeness=None, ignore=(-1,), **kwargs ): """Create a labelling widget.""" super().__init__(features, **kwargs) self.queue = ClusterLabellingQueue( features, cluster_indices, representativeness ) self._annotation_loop = self._annotation_iterator() next(self._annotation_loop) self._compose() def _annotation_iterator(self): """ The method that iterates over the clusters and presents them for annotation. 
""" self.progressbar.bar_style = "" for cluster_index, data in self.queue: self._display(data) sender = yield if sender["source"] == "__undo__": # unpop the current item: self.queue.undo() # unpop and unlabel the previous item: self.queue.undo() # try to remove any labels not in the assigned labels: self.input_widget.remove_options( set(self.input_widget.options) - self.queue.list_labels() ) elif sender["source"] == "__skip__": pass else: new_label = sender["value"] self.queue.submit(cluster_index, new_label) # self.input_widget.add_hint(new_label, datapoint) self.progressbar.value = self.queue.progress if self.event_manager is not None: self.event_manager.close() yield self._render_finished() @property def new_clusters(self): return self.queue.cluster_labels PK!AQ܇#superintendent/controls/__init__.pyfrom .submitter import Submitter # noqa from .multiclasssubmitter import MulticlassSubmitter # noqa from .timer import Timer # noqa PK!50&superintendent/controls/buttongroup.pyfrom numbers import Number from typing import Callable, Dict, List, Optional, Text, Union import ipywidgets as widgets import traitlets class ButtonGroup(widgets.HBox): """A group of buttons with output widgets underneath. Parameters ---------- options : list A list of options for this button group. button_width : str, int, float, optional The width of each button as an HTML compatible string or a number. (the default is None, which leads to the width being divided between the buttons.) 
""" options = traitlets.List( trait=traitlets.Unicode(), default=list(), allow_none=True ) submission_functions = traitlets.List(list(), allow_none=True) button_width = traitlets.Union( [traitlets.Float(), traitlets.Integer(), traitlets.Unicode()], allow_none=True, ) def __init__( self, options: List[str], button_width: Optional[Union[Number, Text]] = None, *args, **kwargs ): super().__init__(children=[], **kwargs) if button_width is None and len(options) > 0: self.button_width = max(1 / len(options), 0.1) else: self.button_width = button_width self.options = options @traitlets.observe("options") def rearrange_buttons(self, change): """Rearrange the buttons. Parameters ---------- change : Any Any ol' change. """ self.buttons = self.hints = { option: ButtonWithHint(option, self.button_width) for option in self.options } for button in self.buttons.values(): button.on_click(self._handle_click) self.children = [self.buttons[option] for option in self.options] def on_click(self, func: Callable) -> None: """Add a function to the list of calls made after a click. Parameters ---------- func : Callable The function to call when the button is clicked. """ if not callable(func): raise ValueError( "You need to provide a callable object, but you provided " + str(func) + "." ) self.submission_functions.append(func) def _handle_click(self, owner: widgets.Button) -> None: for func in self.submission_functions: func(owner) @traitlets.validate("button_width") def _valid_value(self, proposal: Dict): if isinstance(proposal["value"], Number) and proposal["value"] <= 1: return "{}%".format(int(100 * proposal["value"])) elif isinstance(proposal["value"], Number): return "{}px".format(int(proposal["value"])) elif isinstance(proposal["value"], str): return proposal["value"] else: # pragma: no cover raise traitlets.TraitError( "Button_width can only be a float, an integer, or a string." 
) class ButtonWithHint(widgets.VBox): description = traitlets.Unicode() def __init__(self, label: str, button_width: str, *args, **kwargs): """Create a button. Parameters ---------- label : str The label for this button. button_width : str How wide you'd like this button to be. """ kwargs["layout"] = kwargs.get( "layout", widgets.Layout(width=button_width) ) super().__init__(children=[], *args, **kwargs) self.button = widgets.Button( description=str(label), layout=widgets.Layout(width="95%") ) self.hint = widgets.Output() self.children = [self.button, self.hint] self.description = label widgets.link((self, "description"), (self.button, "description")) def on_click(self, func: Callable): """Add a function to the list of calls made after a click. Parameters ---------- func : Callable The function to call when the button is clicked. """ self.button.on_click(func) def __enter__(self): return self.hint.__enter__() def __exit__(self, *args, **kwargs): return self.hint.__exit__(*args, **kwargs) PK!_0 )superintendent/controls/dropdownbutton.pyfrom collections import defaultdict from typing import Callable, Sequence import ipywidgets as widgets import traitlets class DropdownButton(widgets.VBox): options = traitlets.List( trait=traitlets.Unicode(), default=list(), allow_none=True ) submission_functions = traitlets.List(list(), allow_none=True) def __init__(self, options: Sequence[str], *args, **kwargs): """Create a dropdown button. Parameters ---------- options : Sequence[str] The options to display in the widget. 
""" super().__init__(*args, **kwargs) self.options = options self.dropdown = widgets.Dropdown( options=[str(option) for option in self.options], description="Label:", ) widgets.dlink((self, "options"), (self.dropdown, "options")) self.dropdown.observe(self._change_selection) self.button = widgets.Button( description="Submit.", tooltip="Submit label.", button_style="success", ) self.button.on_click(self._handle_click) self.hints = defaultdict(widgets.Output) self.children = [ widgets.HBox([self.dropdown, self.button]), self.hints[self.dropdown.value], ] def on_click(self, func: Callable) -> None: """Add a function to the list of calls made after a click. Parameters ---------- func : Callable The function to call when the button is clicked. """ if not callable(func): raise ValueError( "You need to provide a callable object, but you provided " + str(func) + "." ) self.submission_functions.append(func) def _handle_click(self, owner: widgets.Button) -> None: for func in self.submission_functions: func(owner) def _change_selection(self, change=None): if self.dropdown.value is not None: self.button.description = self.dropdown.value self.button.disabled = False else: self.button.description = "Submit." self.button.disabled = True self.children = [ widgets.HBox([self.dropdown, self.button]), self.hints[self.dropdown.value], ] @traitlets.validate("options") def _check_options(self, proposal): seen = set() return [x for x in proposal["value"] if not (x in seen or seen.add(x))] PK!AT̪,superintendent/controls/hintedmultiselect.pyfrom collections import defaultdict import ipywidgets as widgets import traitlets class HintedMultiselect(widgets.HBox): options = traitlets.List(list()) value = traitlets.List(list()) def __init__(self, options, *args, **kwargs): """Created a Multi-select widget Parameters ---------- options : Sequence[str] The options to show in the multi-select widget. 
""" super().__init__([]) self.options = [str(option) for option in options] self.multi_select = widgets.SelectMultiple( options=self.options, description="Label:" ) widgets.link((self, "options"), (self.multi_select, "options")) widgets.link((self, "value"), (self.multi_select, "value")) self.hints = defaultdict(widgets.Output) self.children = [ self.multi_select, widgets.HBox( children=[self.hints[option] for option in self.value], layout=widgets.Layout(flex_flow="row wrap"), ), ] def _reset(self): self.value = [] def _toggle(self, option: str): if option in self.value: new_value = list(self.value) new_value.remove(option) self.multi_select.value = new_value else: self.multi_select.value = self.value + [option] @traitlets.observe("value") def _refresh_hints(self, change=None): self.children = [ self.multi_select, widgets.HBox( children=[self.hints[option] for option in self.value], layout=widgets.Layout(flex_flow="row wrap"), ), ] PK!%superintendent/controls/keycapture.pyDEFAULT_SHORTCUTS = ( [str(i) for i in range(1, 10)] + ["0"] + ["q", "w", "e", "r", "t", "y", "u", "i", "o", "p"] ) PK!w7 7 .superintendent/controls/multiclasssubmitter.pyimport ipywidgets as widgets import traitlets from .hintedmultiselect import HintedMultiselect from .keycapture import DEFAULT_SHORTCUTS from .submitter import Submitter from .togglebuttongroup import ToggleButtonGroup class MulticlassSubmitter(Submitter): def _on_key_down(self, event): if event["type"] == "keyup": pressed_option = self._key_option_mapping.get(event.get("key")) if pressed_option is not None: self._toggle_option(pressed_option) elif event.get("key") == "Enter": self._when_submitted({"source": "enter"}) elif event.get("key") == "Backspace": self._when_submitted({"source": "backspace"}) def _toggle_option(self, option): self.control_elements._toggle(option) def _when_submitted(self, sender): value = self.control_elements.value if sender is self.skip_button: value = None source = "__skip__" elif sender is 
self.undo_button or ( isinstance(sender, dict) and sender.get("source") == "backspace" ): value = None source = "__undo__" elif sender is self.submission_button: source = "multi-selector" elif isinstance(sender, widgets.Text): if sender.value is not None and sender.value not in self.options: self.options = self.options + [sender.value] self._toggle_option(sender.value) return elif isinstance(sender, dict) and sender.get("source") == "enter": source = "multi-selector" for func in self.submission_functions: func({"value": value, "source": source}) self.control_elements._reset() @traitlets.observe("other_option", "options", "max_buttons") def _compose(self, change=None): self.options = [str(option) for option in self.options] self._key_option_mapping = { key: option for key, option in zip(DEFAULT_SHORTCUTS, self.options) } if len(self.options) <= self.max_buttons: # if we can display all options: self.control_elements = ToggleButtonGroup(self.options) else: self.control_elements = HintedMultiselect(self.options) self.submission_button = widgets.Button( description="Apply", button_style="success" ) self.submission_button.on_click(self._when_submitted) if self.other_option: self.other_widget = widgets.Text( value="", description="Other:", placeholder="Hit enter to submit.", ) self.other_widget.on_submit(self._when_submitted) else: self.other_widget = widgets.HBox([]) self.children = [ self.control_elements, widgets.HBox( [ self.other_widget, widgets.HBox( [ self.sort_button, self.skip_button, self.undo_button, self.submission_button, ] ), ], layout=widgets.Layout(justify_content="space-between"), ), ] PK!4XX$superintendent/controls/submitter.py"""Input and timing control widgets.""" from typing import Any, Callable, Dict, List, Optional, Tuple, Union import ipywidgets as widgets import traitlets from .buttongroup import ButtonGroup, ButtonWithHint from .dropdownbutton import DropdownButton from .keycapture import DEFAULT_SHORTCUTS class Submitter(widgets.VBox): """ A 
flexible data submission widget. Submitter allows you to specifiy options, which can be chosen either via buttons or a dropdown, and a text field for "other" values. Parameters ---------- options : list, tuple, optional The data submission options. max_buttons : int The number buttons you want to display. If len(options) > max_buttons, the options will be displayed in a dropdown instead. other_option : bool, optional Whether the widget should contain a text box for users to type in a value not in options. hint_function : fun A function that will be passed the hint for each label, that displays some output that will be displayed under each label and can be considered a hint or more in-depth description of a label. During image labelling tasks, this might be a function that displays an example image. hints : dict A dictionary with each element of options as a key, and the data that gets passed to hint_function as input. update_hints : bool Whether to update hints as you go through - for options that don't have any hints yet. """ other_option = traitlets.Bool(True) options = traitlets.List(list(), allow_none=True) max_buttons = traitlets.Integer(12) def __init__( self, options: Optional[Union[List[str], Tuple[str]]] = (), max_buttons: int = 12, other_option: bool = True, hint_function: Optional[Callable] = None, hints: Optional[Dict[str, Any]] = None, update_hints: bool = True, # shortcuts=None, ): """ Create a widget that will render submission options. Note that all parameters can also be changed through assignment after you create the widget. 
""" super().__init__([]) self.submission_functions = [] self.hint_function = hint_function # self.shortcuts = shortcuts self.hints = dict() if hints is None else hints if hint_function is not None: for option, feature in self.hints.items(): self.hints[option] = widgets.Output() with self.hints[option]: self.hint_function(feature) self.sort_button = widgets.Button( description="Sort options", icon="sort" ) self.sort_button.on_click(self._sort_options) self.skip_button = widgets.Button( description="Skip", icon="fast-forward" ) self.skip_button.on_click(self._when_submitted) self.undo_button = widgets.Button(description="Undo", icon="undo") self.undo_button.on_click(self._when_submitted) self.options = [str(option) for option in options] self.fixed_options = [str(option) for option in options] self.max_buttons = max_buttons self.other_option = other_option self._compose() def _when_submitted(self, sender): if sender is self.skip_button: value = None source = "__skip__" elif sender is self.undo_button: value = None source = "__undo__" elif isinstance(sender, (widgets.Button, ButtonWithHint)): value = sender.description source = "button" elif isinstance(sender, widgets.Text): value = sender.value source = "textfield" elif isinstance(sender, dict) and sender.get("source") == "keystroke": value = sender.get("value") source = "keystroke" if value is not None and value not in self.options: self.options = self.options + [value] for func in self.submission_functions: func({"value": value, "source": source}) self._compose() def _on_key_down(self, event): if event["type"] == "keyup": pressed_option = self._key_option_mapping.get(event.get("key")) if pressed_option is not None: self._when_submitted( {"value": pressed_option, "source": "keystroke"} ) def add_hint(self, value, hint): """Add a hint to the widget. Parameters ---------- value : str The label for which this hint applies. hint : Any The data point to use for the hint. 
""" if ( self.hint_function is not None and self.hints is not None and value not in self.hints ): with self.control_elements.hints[value]: self.hint_function(hint) def remove_options(self, values): """Remove options from the widget. Parameters ---------- values : Sequence[str] The options to remove. """ self.options = [ option for option in self.options if option not in values or option in self.fixed_options ] def on_submission(self, func): """ Add a function to call when the user submits a value. Parameters ---------- func : callable The function to be called when the widget is submitted. """ if not callable(func): raise ValueError( "You need to provide a callable object, but you provided " + str(func) + "." ) self.submission_functions.append(func) def _sort_options(self, change=None): self.options = list(sorted(self.options)) @traitlets.observe("other_option", "options", "max_buttons") def _compose(self, change=None): # self.options = [str(option) for option in self.options] self._key_option_mapping = { key: option for key, option in zip(DEFAULT_SHORTCUTS, self.options) } if len(self.options) <= self.max_buttons: self.control_elements = ButtonGroup(self.options) else: self.control_elements = DropdownButton(self.options) self.control_elements.on_click(self._when_submitted) if self.other_option: self.other_widget = widgets.Text( value="", description="Other:", placeholder="Hit enter to submit.", ) self.other_widget.on_submit(self._when_submitted) else: self.other_widget = widgets.HBox([]) self.children = [ self.control_elements, widgets.HBox( [ self.other_widget, widgets.HBox( [self.sort_button, self.skip_button, self.undo_button] ), ], layout=widgets.Layout(justify_content="space-between"), ), ] PK!N7ii superintendent/controls/timer.pyimport time from functools import total_ordering @total_ordering class Timer: """ A timer object. Use as a context manager to time operations, and compare to numerical values (seconds) to run conditional code. Usage: .. 
code-block:: python from superintendent.controls import Timer timer = Timer() with timer: print('some quick computation') if timer < 1: print('quick computation took less than a second') """ def __init__(self): self._time = 0 def __enter__(self): self._t0 = time.time() def __exit__(self, *args): self._time = time.time() - self._t0 def __eq__(self, other): return self._time == other def __lt__(self, other): return self._time < other def __repr__(self): return "{} s".format(self._time) PK!,superintendent/controls/togglebuttongroup.pyfrom numbers import Number from typing import Dict, List, Optional, Text, Union import ipywidgets as widgets import traitlets class ToggleButtonGroup(widgets.HBox): """A group of buttons with output widgets underneath. Parameters ---------- options : list A list of options for this button group. button_width : str, int, float, optional The width of each button as an HTML compatible string or a number. (the default is None, which leads to the width being divided between the buttons.) """ options = traitlets.List( trait=traitlets.Unicode(), default=list(), allow_none=True ) submission_functions = traitlets.List(list(), allow_none=True) button_width = traitlets.Union( [traitlets.Float(), traitlets.Integer(), traitlets.Unicode()], allow_none=True, ) def __init__( self, options: List[str], button_width: Optional[Union[Number, Text]] = None, *args, **kwargs ): super().__init__(children=[], **kwargs) if button_width is None and len(options) > 0: self.button_width = max(1 / len(options), 0.1) else: self.button_width = button_width self.options = options @traitlets.observe("options") def rearrange_buttons(self, change): """Rearrange the button layout. Parameters ---------- change : Any Any ol' change. 
""" self.buttons = self.hints = { option: ToggleButtonWithHint(option, self.button_width) for option in self.options } self.children = [self.buttons[option] for option in self.options] def _toggle(self, option: str): self.buttons[option].value = not self.buttons[option].value def _reset(self): for button in self.buttons.values(): button.value = False @property def value(self): return [ option for option, button in self.buttons.items() if button.value ] @traitlets.validate("button_width") def _valid_value(self, proposal: Dict): if isinstance(proposal["value"], Number) and proposal["value"] <= 1: return "{}%".format(int(100 * proposal["value"])) elif isinstance(proposal["value"], Number): return "{}px".format(int(proposal["value"])) elif isinstance(proposal["value"], str): return proposal["value"] else: # pragma: no cover raise traitlets.TraitError( "Button_width can only be a float, an integer, or a string." ) class ToggleButtonWithHint(widgets.VBox): value = traitlets.Bool(default_value=False) description = traitlets.Unicode() def __init__(self, label: str, button_width: str, *args, **kwargs): """Create a Toggle-button. Parameters ---------- label : str The button label. button_width : str The width of the button. 
""" kwargs["layout"] = kwargs.get( "layout", widgets.Layout(width=button_width) ) super().__init__(children=[], *args, **kwargs) self.button = widgets.ToggleButton( description=str(label), layout=widgets.Layout(width="95%") ) widgets.link((self, "value"), (self.button, "value")) self.hint = widgets.Output() self.children = [self.button, self.hint] self.description = label widgets.link((self, "description"), (self.button, "description")) def __enter__(self): return self.hint.__enter__() def __exit__(self, *args, **kwargs): return self.hint.__exit__(*args, **kwargs) PK!&superintendent/display.py"""Helper functions for displaying types of data.""" import IPython.display import numpy as np from matplotlib import pyplot as plt def default_display_func(feature): """ A default function that prints the object. If the data is not numerical, the function prints the data to screen as text. Parameters ---------- feature : np.ndarray, pd.Series, pd.DataFrame The feature(s) you want to display """ # n_samples = min(n_samples, feature.shape[0]) IPython.display.display(feature) def image_display_func(image, imsize=None): """ Image display function. Iterates over the rows in the array and uses matplotlib imshow to actually reveal the image. 
Parameters ---------- image : np.ndarray The data, in the shape of n_samples, n_pixels imsize : tuple, optional A tuple of width, height that gets passed to np.reshape """ fig, ax = plt.subplots(1, 1) if imsize == "square": image = image.reshape(2 * [int(np.sqrt(image.size))]) elif imsize is not None: image = image.reshape(imsize) ax.imshow(image, cmap="binary") ax.axis("off") plt.show() functions = { "default": default_display_func, "image": image_display_func, "img": image_display_func, } PK!~fhbb&superintendent/distributed/__init__.pyfrom .semisupervisor import SemiSupervisor # noqa from .multioutput import MultiLabeller # noqa PK!77%superintendent/distributed/dbqueue.pyimport configparser import itertools import operator import warnings from collections import deque, namedtuple from contextlib import contextmanager from datetime import datetime, timedelta from functools import reduce from typing import Any, Dict, Sequence, Set, Tuple import sqlalchemy as sa import sqlalchemy.ext.declarative from sqlalchemy.exc import OperationalError, ProgrammingError import cachetools import numpy as np import pandas as pd from ..queueing import BaseLabellingQueue, _features_to_array from .serialization import data_dumps, data_loads def _construct_orm_object(table_name): DeclarativeBase = sqlalchemy.ext.declarative.declarative_base() class Superintendent(DeclarativeBase): __tablename__ = table_name id = sa.Column(sa.Integer, primary_key=True) # noqa: A003 input = sa.Column(sa.Text) # noqa: A003 output = sa.Column(sa.Text, nullable=True) inserted_at = sa.Column(sa.DateTime) priority = sa.Column(sa.Integer) popped_at = sa.Column(sa.DateTime, nullable=True) completed_at = sa.Column(sa.DateTime, nullable=True) worker_id = sa.Column(sa.String, nullable=True) return Superintendent deserialisers = {"json": data_loads} serialisers = {"json": data_dumps} class DatabaseQueue(BaseLabellingQueue): """Implements a queue for distributed labelling. 
>>> from superintendent.distributed.dbqueue import Backend >>> q = Backend(storage_type='integer_index') >>> q.insert(1) >>> id_, integer_index = q.pop() >>> # ... >>> q.submit(id_, value) Attributes ---------- data : sqlalchemy.ext.declarative.api.DeclarativeMeta deserialiser : builtin_function_or_method serialiser : builtin_function_or_method """ worker_id = None item = namedtuple("QueueItem", ["id", "data", "label"]) def __init__( self, connection_string="sqlite:///:memory:", table_name="superintendent", storage_type="json", ): """Instantiate queue for distributed labelling. Parameters ---------- connection_string : str, optional dialect+driver://username:password@host:port/database. Default: 'sqlite:///:memory:' (NB: Use only for debugging purposes) table_name : str The name of the table in SQL where to store the data. storage_type : str, optional One of 'integer_index', 'pickle' (default) or 'json'. """ self.data = _construct_orm_object(table_name) self.deserialiser = deserialisers[storage_type] self.serialiser = serialisers[storage_type] self.engine = sa.create_engine(connection_string) self._popped = deque([]) if not self.engine.dialect.has_table( self.engine, self.data.__tablename__ ): self.data.metadata.create_all(bind=self.engine) try: # create index for priority ix_labelling = sa.Index("ix_labelling", self.data.priority) ix_labelling.create(self.engine) except OperationalError: pass except ProgrammingError: pass @classmethod def from_config_file(cls, config_path): """Instantiate with database credentials from a configuration file. The config file should be an INI file with the following contents: [database] ; dialect+driver://username:password@host:port/database dialect=xxx driver=xxx username=xxx password=xxx host=xxx port=xxx database=xxx Parameters ---------- config_path : str Path to database configuration file. 
""" config = configparser.ConfigParser() config.read(config_path) connection_string_template = ( "{dialect}+{driver}://" "{username}:{password}@{host}:{port}/{database}" ) connection_string = connection_string_template.format( **config["database"] ) return cls(connection_string) @contextmanager def session(self): session = sa.orm.Session(bind=self.engine) try: yield session session.commit() except Exception: session.rollback() raise finally: session.close() def enqueue(self, feature, label=None, priority=None): """Add a feature to the queue. Parameters ---------- feature : Any The feature to add. label : Any, optional The label for the feature. priority : int, optional The priority of this label in relation to all other priorities in the queue. """ now = datetime.now() with self.session() as session: session.add( self.data( input=self.serialiser(feature), inserted_at=now, priority=priority, output=label, completed_at=None if label is None else now, ) ) def enqueue_many(self, features, labels=None, priorities=None): """ Add items to the queue. Parameters ---------- features : Any The features to add. labels : Any, optional The labels for the features. priorities : Sequence[int], optional The priorities of this label in relation to all other priorities in the queue. """ now = datetime.now() if isinstance(features, pd.DataFrame): features = [row for _, row in features.iterrows()] with self.session() as session: if priorities is None: priorities = itertools.cycle([None]) if labels is None: labels = itertools.cycle([None]) for feature, label, priority in zip(features, labels, priorities): session.add( self.data( input=self.serialiser(feature), inserted_at=now, priority=priority, output=self.serialiser(label), completed_at=None if label is None else now, ) ) def reorder(self, priorities: Dict[int, int]) -> None: """Re-assign priorities for labels. Parameters ---------- priorities : Dict[int, int] A mapping from id -> priority. 
def set_priorities(self, ids: Sequence[int], priorities: Sequence[int]):
    """Set the priorities for data points.

    Parameters
    ----------
    ids : Sequence[int]
        The IDs for which to change the priority.
    priorities : Sequence[int]
        The priorities, in the same order as ``ids``.

    Raises
    ------
    KeyError
        If a matching row's ID has no corresponding entry in
        ``priorities`` (i.e. ``priorities`` is shorter than ``ids``).
    """
    ids = list(ids)
    # Build the id -> priority lookup once instead of calling
    # ``ids.index`` for every row (quadratic when re-ordering a whole
    # queue). ``setdefault`` keeps the first occurrence of a repeated ID,
    # which is what ``ids.index`` returned.
    priority_by_id = {}
    for id_, priority in zip(ids, priorities):
        priority_by_id.setdefault(id_, priority)

    with self.session() as session:
        rows = session.query(self.data).filter(self.data.id.in_(ids)).all()
        for row in rows:
            row.priority = priority_by_id[row.id]
def list_labels(self) -> Set[str]:
    """List the distinct labels stored in the queue.

    Returns
    -------
    Set[str]
        The set of deserialised labels. If the labels are unhashable
        collections (e.g. lists, as stored for multi-label data), the
        union of their elements is returned instead.
    """
    with self.session() as session:
        rows = (
            session.query(self.data.output)
            .filter(self.data.output.isnot(None))
            .distinct()
        )
        # Run the query and deserialise exactly once; the fallback below
        # re-uses this list rather than iterating the query a second time.
        labels = [self.deserialiser(row.output) for row in rows]

    try:
        return set(labels)
    except TypeError:
        # Unhashable labels: merge the elements of every label. Starting
        # from an empty set also keeps this well-defined for any number
        # of labels.
        return set().union(*labels)
def __next__(self):
    """Return the next item from ``pop()``; stop once the queue is empty.

    Raises
    ------
    StopIteration
        When ``pop()`` raises IndexError.
    """
    try:
        return self.pop()
    except IndexError:
        # The IndexError is only the "queue is empty" signal; suppress it
        # so it does not show up as a chained error in tracebacks.
        raise StopIteration from None
features : list, np.ndarray, pd.Series, pd.DataFrame, optional An array or sequence of data in which each element (if 1D) or each row (if 2D) represents one data point for which you'd like to generate labels. labels : list, np.ndarray, pd.Series, pd.DataFrame, optional If you already have some labels, but would like to re-label some, then you can pass these in as labels. options : tuple, list The options presented for labelling. classifier : sklearn.base.ClassifierMixin, optional An object that implements the standard sklearn fit/predict methods. If provided, a button for retraining the model is shown, and the model performance under k-fold crossvalidation can be read as you go along. display_func : callable, optional A function that will be used to display the data. This function should take in two arguments, first the data to display, and second the number of data points to display (set to 1 for this class). eval_method : callable, optional A function that accepts the classifier, features, and labels as input and returns a dictionary of values that contain the key 'test_score'. The default is sklearn.model_selection.cross_validate, with cv=3. Use functools.partial to create a function with its parameters fixed. reorder : str, callable, optional One of the reordering algorithms specified in :py:mod:`superintendent.prioritisation`. This describes a function that receives input in the shape of n_samples, n_labels and calculates the priority in terms of information value in labelling a data point. shuffle_prop : float The proportion of points that are shuffled when the data points are re-ordered (see reorder keyword-argument). This controls the "exploration vs exploitation" trade-off - the higher, the more you explore the feature space randomly, the lower, the more you exploit your current weak points. keyboard_shortcuts : bool, optional If you want to enable ipyevent-mediated keyboard capture to use the keyboard rather than the mouse to submit data. 
""" pass PK!ܢ2j,superintendent/distributed/semisupervisor.py"""Tools to supervise classification.""" import time from typing import Optional import ipywidgets as widgets import traitlets # import ipywidgets as widgets # import numpy as np # import sklearn.model_selection # # from . import base from .. import semisupervisor from .dbqueue import DatabaseQueue class SemiSupervisor(semisupervisor.SemiSupervisor): """ A class for labelling your data. This class is designed to label data for (semi-)supervised learning algorithms. It allows you to label data. In the future, it will also allow you to re-train an algorithm. Parameters ---------- connection_string : str A SQLAlchemy-compatible database connection string. This is where the data for this widget will be stored, and where it will be retrieved from for labelling. features : list, np.ndarray, pd.Series, pd.DataFrame, optional An array or sequence of data in which each element (if 1D) or each row (if 2D) represents one data point for which you'd like to generate labels. labels : list, np.ndarray, pd.Series, pd.DataFrame, optional If you already have some labels, but would like to re-label some, then you can pass these in as labels. worker_id : bool, str Whether or not to prompt for a worker_id (if it's boolean), or a specific worker_id for this widget (if it's a string). The default is False, which means worker_id will not be recorded at all. table_name : str The name for the table in the SQL database. If the table doesn't exist, it will be created. options : tuple, list The options presented for labelling. classifier : sklearn.base.ClassifierMixin, optional An object that implements the standard sklearn fit/predict methods. If provided, a button for retraining the model is shown, and the model performance under k-fold crossvalidation can be read as you go along. display_func : callable, optional A function that will be used to display the data. 
This function should take in two arguments, first the data to display, and second the number of data points to display (set to 1 for this class). eval_method : callable, optional A function that accepts the classifier, features, and labels as input and returns a dictionary of values that contain the key 'test_score'. The default is sklearn.model_selection.cross_validate, with cv=3. Use functools.partial to create a function with its parameters fixed. reorder : str, callable, optional One of the reordering algorithms specified in :py:mod:`superintendent.prioritisation`. This describes a function that receives input in the shape of n_samples, n_labels and calculates the priority in terms of information value in labelling a data point. shuffle_prop : float The proportion of points that are shuffled when the data points are re-ordered (see reorder keyword-argument). This controls the "exploration vs exploitation" trade-off - the higher, the more you explore the feature space randomly, the lower, the more you exploit your current weak points. keyboard_shortcuts : bool, optional If you want to enable ipyevent-mediated keyboard capture to use the keyboard rather than the mouse to submit data. """ worker_id = traitlets.Unicode(allow_none=True) def __init__( self, connection_string="sqlite:///:memory:", *args, worker_id=False, table_name="superintendent", **kwargs ): super().__init__(*args, **kwargs) self.queue = DatabaseQueue( connection_string=connection_string, table_name=table_name ) if kwargs.get("features") is not None: self.add_features(kwargs.get("features"), kwargs.get("labels")) self._annotation_loop = self._annotation_iterator() self.queue.undo() next(self._annotation_loop) if worker_id and not isinstance(worker_id, str): self._get_worker_id() else: self.queue.worker_id = worker_id if worker_id else None self._compose() def _get_worker_id(self): worker_id_field = widgets.Text( placeholder="Please enter your name or ID." ) self.layout.children = [ widgets.HTML("

Please enter your name:

def _run_orchestration(
    self,
    interval_seconds: int = 30,
    interval_n_labels: Optional[int] = 0,
    shuffle_prop: float = 0.1,
):
    """Run one orchestration step: retrain if due, then sleep.

    Retraining happens on the first call, and afterwards whenever at least
    ``interval_n_labels`` new labels have been recorded since the last
    retraining.

    Parameters
    ----------
    interval_seconds : int
        How long to sleep at the end of the step, in seconds.
    interval_n_labels : int, optional
        How many new labels are required since the previous retraining for
        another one to happen. ``None`` is treated as 0.
    shuffle_prop : float
        Stored on ``self.shuffle_prop`` before retraining.
    """
    n_labelled = self.queue._labelled_count()
    first_run = not hasattr(self, "_last_n_labelled")
    n_new_labels = 0 if first_run else n_labelled - self._last_n_labelled

    if first_run or n_new_labels >= (interval_n_labels or 0):
        # Store the count itself (not the bound method) so the next call
        # can subtract it.
        self._last_n_labelled = n_labelled
        self.shuffle_prop = shuffle_prop
        self.retrain()
        print(self.model_performance.value)

    # Sleep whether or not a retraining happened, so the ``while True``
    # loop in orchestrate() never spins without pausing.
    time.sleep(interval_seconds)
class DataEncoder(json.JSONEncoder):
    """JSON encoder that also handles numpy arrays and pandas objects.

    Arrays, data frames and series are written as a dictionary with a
    ``"__type__"`` tag and a ``"__content__"`` payload.
    """

    def default(self, obj):
        """
        Serialize numpy or pandas objects to json.

        Parameters
        ----------
        obj : Any
            The object to serialise.

        Returns
        -------
        Any
            A JSON-serialisable representation of ``obj``.

        Raises
        ------
        TypeError
            If ``obj`` is of a type this encoder does not support.
        """
        if isinstance(obj, np.ndarray):
            return {"__type__": "__np.ndarray__", "__content__": obj.tolist()}
        if isinstance(obj, pd.DataFrame):
            return {
                "__type__": "__pd.DataFrame__",
                "__content__": obj.to_dict(orient="split"),
            }
        if isinstance(obj, pd.Series):
            return {
                "__type__": "__pd.Series__",
                "__content__": {
                    "dtype": str(obj.dtype),
                    "index": list(obj.index),
                    "data": obj.tolist(),
                    "name": obj.name,
                },
            }
        if isinstance(obj, np.generic):
            # numpy scalars (e.g. np.int64) -> the equivalent Python scalar
            return obj.item()
        # Defer to the base class, which raises TypeError. Falling off the
        # end would return None and silently encode the value as ``null``.
        return super().default(obj)
""" if obj is None: return None return json.loads(obj, object_hook=data_decoder) PK!11&superintendent/multioutput/__init__.pyfrom .multilabeller import MultiLabeller # noqa PK!LEΆ+superintendent/multioutput/multilabeller.pyimport warnings from collections import OrderedDict from sklearn.multioutput import MultiOutputClassifier from sklearn.preprocessing import MultiLabelBinarizer from . import prioritisation from .. import controls, semisupervisor class MultiLabeller(semisupervisor.SemiSupervisor): """ A widget for assigning more than one label to each data point. This class is designed to label data for (semi-)supervised learning algorithms. It allows you to label data. In the future, it will also allow you to re-train an algorithm. Parameters ---------- connection_string: str A SQLAlchemy-compatible database connection string. This is where the data for this widget will be stored, and where it will be retrieved from for labelling. features : list, np.ndarray, pd.Series, pd.DataFrame, optional An array or sequence of data in which each element (if 1D) or each row (if 2D) represents one data point for which you'd like to generate labels. labels : list, np.ndarray, pd.Series, pd.DataFrame, optional If you already have some labels, but would like to re-label some, then you can pass these in as labels. options : tuple, list The options presented for labelling. classifier : sklearn.base.ClassifierMixin, optional An object that implements the standard sklearn fit/predict methods. If provided, a button for retraining the model is shown, and the model performance under k-fold crossvalidation can be read as you go along. display_func : callable, optional A function that will be used to display the data. This function should take in two arguments, first the data to display, and second the number of data points to display (set to 1 for this class). 
eval_method : callable, optional A function that accepts the classifier, features, and labels as input and returns a dictionary of values that contain the key 'test_score'. The default is sklearn.model_selection.cross_validate, with cv=3. Use functools.partial to create a function with its parameters fixed. reorder : str, callable, optional One of the reordering algorithms specified in :py:mod:`superintendent.prioritisation`. This describes a function that receives input in the shape of n_samples, n_labels and calculates the priority in terms of information value in labelling a data point. shuffle_prop : float The proportion of points that are shuffled when the data points are re-ordered (see reorder keyword-argument). This controls the "exploration vs exploitation" trade-off - the higher, the more you explore the feature space randomly, the lower, the more you exploit your current weak points. keyboard_shortcuts : bool, optional If you want to enable ipyevent-mediated keyboard capture to use the keyboard rather than the mouse to submit data. """ def __init__(self, *args, **kwargs): """ A class for labelling your data. This class is designed to label data for (semi-)supervised learning algorithms. It allows you to label data, periodically re-train your algorithm and assess its performance, and determine which data points to label next based on your model's predictions. 
""" reorder = kwargs.pop("reorder", None) super().__init__(*args, **kwargs) if self.event_manager is not None: self.event_manager.on_dom_event( self.input_widget._on_key_down, remove=True ) if ( not isinstance(self.classifier, MultiOutputClassifier) and self.classifier is not None ): self.classifier = MultiOutputClassifier(self.classifier, n_jobs=-1) if reorder is not None and isinstance(reorder, str): if reorder not in prioritisation.functions: raise NotImplementedError( "Unknown reordering function '{}'.".format(reorder) ) self.reorder = prioritisation.functions[reorder] elif reorder is not None and callable(reorder): self.reorder = reorder elif reorder is None: self.reorder = None else: raise ValueError( "The reorder argument needs to be either a function or the " "name of a function listed in superintendent.prioritisation." ) self.input_widget = controls.MulticlassSubmitter( hint_function=kwargs.get("hint_function"), hints=kwargs.get("hints"), options=kwargs.get("options", ()), max_buttons=kwargs.get("max_buttons", 12), ) self.input_widget.on_submission(self._apply_annotation) if self.event_manager is not None: self.event_manager.on_dom_event(self.input_widget._on_key_down) self._compose() def retrain(self, *args): """Retrain the classifier you passed when creating this widget. This calls the fit method of your class with the data that you've labelled. It will also score the classifier and display the performance. """ if self.classifier is None: raise ValueError("No classifier to retrain.") if len(self.queue.list_labels()) < 1: self.model_performance.value = ( "Score: Not enough labels to retrain." ) return _, labelled_X, labelled_y = self.queue.list_completed() preprocessor = MultiLabelBinarizer() labelled_y = preprocessor.fit_transform(labelled_y) self._render_processing(message="Retraining... 
") try: with warnings.catch_warnings(): warnings.simplefilter("ignore") self.performance = self.eval_method( self.classifier, labelled_X, labelled_y ) self.model_performance.value = "Score: {:.2f}".format( self.performance["test_score"].mean() ) except ValueError: # pragma: no cover self.performance = "Could not evaluate" self.model_performance.value = "Score: {}".format(self.performance) self.classifier.fit(labelled_X, labelled_y) if self.reorder is not None: ids, unlabelled_X = self.queue.list_uncompleted() probabilities = self.classifier.predict_proba(unlabelled_X) # if len(preprocessor.classes_) > 1: # probabilities = sum(probabilities) / len(probabilities) reordering = list( self.reorder(probabilities, shuffle_prop=self.shuffle_prop) ) new_order = OrderedDict( [(id_, index) for id_, index in zip(ids, list(reordering))] ) self.queue.reorder(new_order) self.queue.undo() self._annotation_loop.send({"source": "__skip__"}) PK!p\ ,superintendent/multioutput/prioritisation.py""" Functions to prioritise labelling data points (to drive active learning). This module implements a range of functions that produce ordering of data based on class probabilities. """ from typing import List import numpy as np import scipy.stats from ..prioritisation import _shuffle_subset def entropy( probabilities: List[np.ndarray], shuffle_prop: float = 0.1 ) -> np.ndarray: """ Sort by the entropy of the probabilities (high to low). Parameters ---------- probabilities : np.ndarray An array of probabilities, with the shape n_samples, n_classes shuffle_prop : float The proportion of data points that should be randomly shuffled. This means the sorting retains some randomness, to avoid biasing your new labels and catching any minority classes the algorithm currently classifies as a different label. 
def margin(probabilities: List[np.ndarray], shuffle_prop=0.1):
    """
    Sort by the margin between the top two predictions (low to high).

    The margin is computed for each array in ``probabilities`` and then
    averaged across the arrays.

    Parameters
    ----------
    probabilities : List[np.ndarray]
        A list of probability arrays, each indexed as n_samples, n_classes
        and each with at least two classes.
    shuffle_prop : float
        The proportion of data points that should be randomly shuffled. This
        means the sorting retains some randomness, to avoid biasing your
        new labels and catching any minority classes the algorithm currently
        classifies as a different label.
    """
    per_output_margins = []
    for probability_array in probabilities:
        # Sort each array once and read both top entries from the result.
        ranked = np.sort(probability_array, axis=1)
        per_output_margins.append(ranked[:, -1] - ranked[:, -2])

    margins = sum(per_output_margins) / len(probabilities)
    ordered = np.argsort(margins)
    # argsort of the argsort turns the ordering into a rank per sample
    return _shuffle_subset(ordered.argsort(), shuffle_prop)
def _shuffle_subset(data: np.ndarray, shuffle_prop: float) -> np.ndarray:
    """Randomly permute a random subset of the rows of ``data`` in place.

    Each row is selected independently when a uniform random draw falls
    below ``shuffle_prop``; the selected rows are then permuted among
    themselves. The (mutated) input array is returned.
    """
    n_rows = data.shape[0]
    selected_mask = np.random.rand(n_rows) < shuffle_prop
    selected = np.nonzero(selected_mask)[0]
    sources = np.random.permutation(selected)
    data[selected, ...] = data[sources, ...]
    return data
class BaseLabellingQueue(abc.ABC):  # pragma: no cover
    """Abstract interface for labelling queues.

    Subclasses must implement every method below before they can be
    instantiated. The stubs take no arguments beyond ``self``; concrete
    queues define their own parameters.
    """

    @abc.abstractmethod
    def enqueue(self):
        """Add an item to the queue."""
        pass

    @abc.abstractmethod
    def pop(self):
        """Take the next item off the queue."""
        pass

    @abc.abstractmethod
    def submit(self):
        """Record a label for a previously popped item."""
        pass

    @abc.abstractmethod
    def reorder(self):
        """Change the order of the items still in the queue."""
        pass

    @abc.abstractmethod
    def undo(self):
        """Un-pop the most recently popped item."""
        pass

    @abc.abstractmethod
    def list_completed(self):
        """List the items that have a label."""
        pass

    @abc.abstractmethod
    def list_uncompleted(self):
        """List the items that do not have a label."""
        pass

    @abc.abstractmethod
    def list_labels(self):
        """List the labels that have been assigned."""
        pass

    @abc.abstractmethod
    def __iter__(self):
        """Return an iterator over the queue."""
        pass

    @abc.abstractmethod
    def __next__(self):
        """Return the next item when iterating over the queue."""
        pass
def enqueue_many(self, features: Any, labels=None) -> None:
    """Add a bunch of items to the queue.

    Parameters
    ----------
    features : Any
        An iterable of data points. A pandas DataFrame is split into its
        rows, each of which is enqueued as one data point.
    labels : Any, optional
        An iterable of labels, paired with ``features`` in order. The
        default is None, which enqueues every data point without a label.

    Returns
    -------
    None
    """
    if isinstance(features, pd.DataFrame):
        features = [row for _, row in features.iterrows()]

    if labels is None:
        # an endless supply of "no label" to pair with every feature
        labels = itertools.repeat(None)

    for feature, label in zip(features, labels):
        self.enqueue(feature, label)
def list_labels(self) -> Set[str]:
    """List all the labels.

    Returns
    -------
    Set[str]
        All the distinct labels. If the labels are unhashable collections
        (e.g. lists, as stored for multi-label data), the union of their
        elements is returned instead.
    """
    labels = list(self.labels.values())
    try:
        # No sorting here: the result is a set, and sorting would raise
        # for labels that are hashable but not mutually orderable.
        return set(labels)
    except TypeError:
        # Unhashable labels: merge the elements of every label.
        return set().union(*labels)
Returns ------- None """ if isinstance(features, pd.DataFrame): features = [row for _, row in features.iterrows()] if representativeness is None: representativeness = np.full(len(features), np.nan) for cluster_index, feature, represents in zip( cluster_indices, features, representativeness ): self.enqueue(cluster_index, feature, represents) def enqueue(self, cluster_index, feature, representativeness=None): """Add an item to the queue. Parameters ---------- cluster_index : Any The cluster index feature : Any The data to be added to the queue. representativeness : float, optional The respective representativeness of the data point. This could be distance from cluster center, probability of cluster membership, or a similar metric. (the default is None) """ self.data[cluster_index].append(feature) if representativeness is None: representativeness = np.nan self.representativeness[cluster_index].append(representativeness) if cluster_index not in self.order: self.order.appendleft(cluster_index) def pop(self): """Pop an item off the queue. Returns ------- id_ : int The ID of the cluster. features : Any The data points that are in this cluster. """ id_ = self.order.pop() self._popped.append(id_) features = [ x for _, x in sorted( zip(self.representativeness[id_], self.data[id_]), key=lambda pair: pair[0], ) ] return id_, _features_to_array(features) def submit(self, cluster_index, cluster_label): """Submit a label for a cluster. Parameters ---------- cluster_index : Any The cluster you are trying to label. cluster_label : str The label for the cluster Raises ------ ValueError If you are trying to label a cluster you haven't popped off the queue. """ if cluster_index not in self._popped: raise ValueError("This item was not popped; you cannot label it.") self.cluster_labels[cluster_index] = cluster_label def reorder(self): """Re-order the queue. 
This is currently not implemented.""" pass def shuffle(self) -> None: """Shuffle the queue.""" shuffle(self.order) def undo(self): """Unpop the most recently popped item.""" if len(self._popped) > 0: cluster_index = self._popped.pop() self.cluster_labels.pop(cluster_index, None) self.order.append(cluster_index) def list_completed(self): """List the data that has been assigned a cluster label. Returns ------- cluster_indices The indices of the clusters. features The features that have been assigned a label. cluster_labels The assigned cluster labels. """ features = [ data for idx, values in self.data.items() for data in values if idx in self.cluster_labels ] cluster_indices = [ idx for idx, values in self.data.items() for data in values if idx in self.cluster_labels ] cluster_labels = [ self.cluster_labels[idx] for idx, values in self.data.items() for data in values if idx in self.cluster_labels ] return cluster_indices, _features_to_array(features), cluster_labels def list_uncompleted(self): """List the data that has not yet been assigned a label. Returns ------- cluster_indices The indices of the clusters the data points are in. features The data in the unlabelled features. """ features = [ data for idx, values in self.data.items() for data in values if idx not in self.cluster_labels ] cluster_indices = [ idx for idx, values in self.data.items() for data in values if idx not in self.cluster_labels ] return cluster_indices, _features_to_array(features) def list_all(self): """List all data. Returns ------- cluster_indices The indices of the clusters the data points are in. features The data. cluster_labels The assigned cluster labels. 
""" features = [ data for idx, values in self.data.items() for data in values ] cluster_indices = [ idx for idx, values in self.data.items() for data in values ] cluster_labels = [ self.cluster_labels.get(idx) for idx, values in self.data.items() for data in values ] return cluster_indices, _features_to_array(features), cluster_labels @property def progress(self): """How much of the queue has been completed. Returns ------- progress : float The progress. """ try: return len(self.cluster_labels) / len(self.data) except ZeroDivisionError: return np.nan def list_labels(self): try: return set(sorted(self.cluster_labels.values())) except TypeError: return reduce(operator.or_, map(set, self.cluster_labels.values())) def __iter__(self): return self def __len__(self): return len(self.order) def __next__(self): try: return self.pop() except IndexError: raise StopIteration def _features_to_array(features: list): """Convert a list of features to a 2D array. Parameters ---------- features : list A list of features to be converted to an array or dataframe. Returns ------- features : Any The array of features. """ if len(features) > 0: if all(isinstance(feature, pd.Series) for feature in features): features = pd.concat([item.to_frame().T for item in features]) elif all(isinstance(feature, pd.DataFrame) for feature in features): features = pd.concat(features) elif all(isinstance(feature, np.ndarray) for feature in features): features = np.stack(features) return features PK!ĉ## superintendent/semisupervisor.py"""Tools to supervise classification.""" import warnings from collections import OrderedDict from functools import partial import ipywidgets as widgets import sklearn.model_selection from . import base, prioritisation, validation from .queueing import SimpleLabellingQueue class SemiSupervisor(base.Labeller): """ A widget for labelling your data. This class is designed to label data for (semi-)supervised learning algorithms. It allows you to label data. 
In the future, it will also allow you to re-train an algorithm. Parameters ---------- features : list, np.ndarray, pd.Series, pd.DataFrame, optional An array or sequence of data in which each element (if 1D) or each row (if 2D) represents one data point for which you'd like to generate labels. labels : list, np.ndarray, pd.Series, pd.DataFrame, optional If you already have some labels, but would like to re-label some, then you can pass these in as labels. options : tuple, list The options presented for labelling. display_func : callable, optional A function that will be used to display the data. This function should take in two arguments, first the data to display, and second the number of data points to display (set to 1 for this class). classifier : sklearn.base.ClassifierMixin, optional An object that implements the standard sklearn fit/predict methods. If provided, a button for retraining the model is shown, and the model performance under k-fold crossvalidation can be read as you go along. eval_method : callable, optional A function that accepts the classifier, features, and labels as input and returns a dictionary of values that contain the key 'test_score'. The default is sklearn.model_selection.cross_validate, with cv=3. Use functools.partial to create a function with its parameters fixed. reorder : str, callable, optional One of the reordering algorithms specified in :py:mod:`superintendent.prioritisation`. This describes a function that receives input in the shape of n_samples, n_labels and calculates the priority in terms of information value in labelling a data point. shuffle_prop : float The proportion of points that are shuffled when the data points are re-ordered (see reorder keyword-argument). This controls the "exploration vs exploitation" trade-off - the higher, the more you explore the feature space randomly, the lower, the more you exploit your current weak points. 
hints : dict, optional A dictionary mapping class labels to example data points from that class. Hints are displayed with the same function as the main data, so should be in the same format. keyboard_shortcuts : bool Whether keyboard shortcuts should be enabled for this widget. """ def __init__( self, features=None, labels=None, options=(), classifier=None, display_func=None, eval_method=None, reorder=None, shuffle_prop=0.1, hints=None, keyboard_shortcuts=False, *args, **kwargs ): """ A class for labelling your data. This class is designed to label data for (semi-)supervised learning algorithms. It allows you to label data, periodically re-train your algorithm and assess its performance, and determine which data points to label next based on your model's predictions. """ super().__init__( features=features, labels=labels, display_func=display_func, options=options, keyboard_shortcuts=keyboard_shortcuts, hints=hints, *args, **kwargs ) self.queue = SimpleLabellingQueue(features, labels) self.shuffle_prop = shuffle_prop self.classifier = validation.valid_classifier(classifier) if self.classifier is not None: self.retrain_button = widgets.Button( description="Retrain", disabled=False, button_style="", tooltip="Click me", icon="refresh", ) self.retrain_button.on_click(self.retrain) self.model_performance = widgets.HTML("") self.top_bar.children = ( widgets.HBox( [*self.top_bar.children], layout=widgets.Layout(width="50%"), ), widgets.HBox( [self.retrain_button, self.model_performance], layout=widgets.Layout(width="50%"), ), ) if eval_method is None: self.eval_method = partial( sklearn.model_selection.cross_validate, cv=3, # n_jobs=-1, return_train_score=False, ) elif not callable(eval_method): raise ValueError("The eval_method needs to be a callable.") else: self.eval_method = eval_method if reorder is not None and isinstance(reorder, str): if reorder not in prioritisation.functions: raise NotImplementedError( "Unknown reordering function '{}'.".format(reorder) ) 
self.reorder = prioritisation.functions[reorder] elif reorder is not None and callable(reorder): self.reorder = reorder elif reorder is None: self.reorder = None else: raise ValueError( "The reorder argument needs to be either a function or the " "name of a function listed in superintendent.prioritisation." ) self._annotation_loop = self._annotation_iterator() next(self._annotation_loop) self._compose() def _annotation_iterator(self): """Relabel should be integer indices""" self.progressbar.bar_style = "" for id_, datapoint in self.queue: self._display(datapoint) sender = yield if sender["source"] == "__undo__": # unpop the current item: self.queue.undo() # unpop and unlabel the previous item: self.queue.undo() # try to remove any labels not in the assigned labels: self.input_widget.remove_options( set(self.input_widget.options) - self.queue.list_labels() ) elif sender["source"] == "__skip__": pass else: new_label = sender["value"] self.queue.submit(id_, new_label) # self.input_widget.add_hint(new_label, datapoint) self.progressbar.value = self.queue.progress if self.event_manager is not None: self.event_manager.close() yield self._render_finished() def retrain(self, *args): """Retrain the classifier you passed when creating this widget. This calls the fit method of your class with the data that you've labelled. It will also score the classifier and display the performance. """ if self.classifier is None: raise ValueError("No classifier to retrain.") if len(self.queue.list_labels()) < 2: self.model_performance.value = ( "Score: Not enough labels to retrain." ) return _, labelled_X, labelled_y = self.queue.list_completed() self._render_processing(message="Retraining... 
") try: with warnings.catch_warnings(): warnings.simplefilter("ignore") self.performance = self.eval_method( self.classifier, labelled_X, labelled_y ) self.model_performance.value = "Score: {:.2f}".format( self.performance["test_score"].mean() ) except ValueError: # pragma: no cover self.performance = "Could not evaluate" self.model_performance.value = "Score: {}".format(self.performance) self.classifier.fit(labelled_X, labelled_y) if self.reorder is not None: ids, unlabelled_X = self.queue.list_uncompleted() reordering = list( self.reorder( self.classifier.predict_proba(unlabelled_X), shuffle_prop=self.shuffle_prop, ) ) new_order = OrderedDict( [(id_, index) for id_, index in zip(ids, list(reordering))] ) self.queue.reorder(new_order) # undo the previously popped item and pop the next one self.queue.undo() self._annotation_loop.send({"source": "__skip__"}) # self._compose() PK!superintendent/validation.py"""Functions to validate arguments.""" from typing import Any # import numpy as np # import pandas as pd def valid_classifier(classifier: Any): """ Check if an object conforms to sklearns fit / predict interface. Parameters ---------- classifier : sklearn.base.ClassifierMixin A classification model compliant with sklearn interfaces. """ if ( classifier is not None and hasattr(classifier, "fit") and hasattr(classifier, "predict_proba") ): return classifier elif classifier is None: return None else: raise ValueError( "The classifier needs to conform to " "the sklearn interface (fit/predict_proba)." ) # def valid_data(features: Optional[Any]): # """ # Check if an object is an array or can be turned into one. # Parameters # ---------- # features : pd.DataFrame, pd.Series, np.ndarray # the data to double-check. 
# """ # if features is None: # return None # if isinstance( # features, (pd.DataFrame, pd.Series, np.ndarray, list, tuple) # ): # return features # # elif isinstance(features, (list, tuple)): # # return np.array(features) # else: # raise ValueError( # "The features need to be an array, sequence, or " # "a pandas DataFrame / Series." # ) PK!CZsssuperintendent/version.pyimport pkg_resources __version__ = pkg_resources.get_distribution("superintendent").version version = __version__ PK!HڽTU$superintendent-0.4.2.dist-info/WHEEL A н#Z;/"d&F[xzw@Zpy3Fv]\fi4WZ^EgM_-]#0(q7PK!H` }'superintendent-0.4.2.dist-info/METADATAWmo6_l+o yk  AQQmH_i)swќ:ˌJNHh&V.9. FiUYRSOf/)iVpɈ`H.獏%G(],t 紝 *B̰:efe}3&-|~ dJNȊs@Д TJ娃PLTaJń@h9> Zqlx]}~u .OQZPk3րF@eA&5Lr9Cmoc&oF-ħ~16jDJ= E|C<=[p&$w잖Z0<}~5Fxp}teƜR"p>fjq4\=•L0r{u 4_ .z\l %lN zFEt:|NT[gB*JG+À=elzM&>Pld&2͔\2v8C"C:$J]7T=uҶΔa%;g> Pr‚ `6"5EQ<:T٢s;x_ET(b`Dx܅zBx=9<Ir?,فz99T/-[b'pdBU~8kܼ̅Nzws*SngO7fkV؅k #б{7Jr۵ ~{Uq[Ƚ#;Ca <l잧n%Y9Yf&VMqw㲱 %;u<oR2ISfdCʘڭ3km"O;߫5A.J#H5m u;jÄ78qObmIeMHOg/9SHفmH* I5Rr7 ЎA|;J0}5{1iꊫ [(15JO:D$I^6mdE<"L59 =m]]l6 -rŜ;^9{!+2:e&ea2 A4=>'A\\ݙep  0OM6 ߳")]{VOdfna 9U@D@p{)`X)l,K=ZvyPR !dpaF/X?M+xw*GU"1Mg(bS*#mzN@%o8hc~MZfcrj֘&hzr`*mdE_a Lڥ8aR}-nDTtCJDYuՅEWWb@ݤfU$9PN:f6]?_ ǖɾ{p ~~K(ߝVH:屳6UgtæhE0 grto'#O #_th<j0yS0R7?PK!superintendent/__init__.pyPK!%xxsuperintendent/base.pyPK! 
-  # superintendent/clustersupervisor.pyPK!AQ܇#*superintendent/controls/__init__.pyPK!50&+superintendent/controls/buttongroup.pyPK!_0 )<superintendent/controls/dropdownbutton.pyPK!AT̪,"Gsuperintendent/controls/hintedmultiselect.pyPK!%Nsuperintendent/controls/keycapture.pyPK!w7 7 .Nsuperintendent/controls/multiclasssubmitter.pyPK!4XX$\\superintendent/controls/submitter.pyPK!N7ii xsuperintendent/controls/timer.pyPK!,|superintendent/controls/togglebuttongroup.pyPK!&superintendent/display.pyPK!~fhbb& superintendent/distributed/__init__.pyPK!77%superintendent/distributed/dbqueue.pyPK!-[ [ )superintendent/distributed/multioutput.pyPK!ܢ2j,nsuperintendent/distributed/semisupervisor.pyPK!zz+^superintendent/distributed/serialization.pyPK!11&Hsuperintendent/multioutput/__init__.pyPK!LEΆ+superintendent/multioutput/multilabeller.pyPK!p\ ,superintendent/multioutput/prioritisation.pyPK!UP P "superintendent/prioritisation.pyPK!??-superintendent/queueing.pyPK!ĉ## lsuperintendent/semisupervisor.pyPK!superintendent/validation.pyPK!CZssRsuperintendent/version.pyPK!HڽTU$superintendent-0.4.2.dist-info/WHEELPK!H` }'superintendent-0.4.2.dist-info/METADATAPK!Hw%hJ %superintendent-0.4.2.dist-info/RECORDPKJ T