PK’…H¶$•¹““cluster/cluster.py# # This is part of "python-cluster". A library to group similar items together. # Copyright (C) 2006 Michel Albert # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by the # Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License # for more details. # You should have received a copy of the GNU Lesser General Public License # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # from __future__ import print_function from .util import fullyflatten class Cluster(object): """ A collection of items. This is internally used to detect clustered items in the data so we could distinguish other collection types (lists, dicts, ...) from the actual clusters. This means that you could also create clusters of lists with this class. """ def __repr__(self): return "" % (self.level, self.items) def __str__(self): return self.__str__() def __init__(self, level, *args): """ Constructor :param level: The level of this cluster. This is used in hierarchical clustering to retrieve a specific set of clusters. The higher the level, the smaller the count of clusters returned. The level depends on the difference function used. :param *args: every additional argument passed following the level value will get added as item to the cluster. You could also pass a list as second parameter to initialise the cluster with that list as content """ self.level = level if len(args) == 0: self.items = [] else: self.items = args def __iter__(self): for item in self.items: if isinstance(item, Cluster): for recursed_item in item: yield recursed_item else: yield item def display(self, depth=0): """ Pretty-prints this cluster. Useful for debuging. """ print(depth * " " + "[level %s]" % self.level) for item in self.items: if isinstance(item, Cluster): item.display(depth + 1) else: print(depth * " " + "%s" % item) def topology(self): """ Returns the structure (topology) of the cluster as tuples. Output from cl.data:: [])>, ])>])>])>])>])>])>] Corresponding output from cl.topo():: ('CVS', ('34.xls', (('0.txt', ('ChangeLog', 'ChangeLog.txt')), ('20060730.py', ('.cvsignore', ('About.py', ('.idlerc', '.pylint.d'))))))) """ left = self.items[0] right = self.items[1] if isinstance(left, Cluster): first = left.topology() else: first = left if isinstance(right, Cluster): second = right.topology() else: second = right return first, second def getlevel(self, threshold): """ Retrieve all clusters up to a specific level threshold. This level-threshold represents the maximum distance between two clusters. So the lower you set this threshold, the more clusters you will receive and the higher you set it, you will receive less but bigger clusters. :param threshold: The level threshold: .. note:: It is debatable whether the value passed into this method should really be as strongly linked to the real cluster-levels as it is right now. The end-user will not know the range of this value unless s/he first inspects the top-level cluster. So instead you might argue that a value ranging from 0 to 1 might be a more useful approach. """ left = self.items[0] right = self.items[1] # if this object itself is below the threshold value we only need to # return it's contents as a list if self.level <= threshold: return [fullyflatten(self.items)] # if this cluster's level is higher than the threshold we will # investgate it's left and right part. Their level could be below the # threshold if isinstance(left, Cluster) and left.level <= threshold: if isinstance(right, Cluster): return [fullyflatten(left.items)] + right.getlevel(threshold) else: return [fullyflatten(left.items)] + [[right]] elif isinstance(right, Cluster) and right.level <= threshold: if isinstance(left, Cluster): return left.getlevel(threshold) + [fullyflatten(right.items)] else: return [[left]] + [fullyflatten(right.items)] # Alright. We covered the cases where one of the clusters was below # the threshold value. Now we'll deal with the clusters that are above # by recursively applying the previous cases. if isinstance(left, Cluster) and isinstance(right, Cluster): return left.getlevel(threshold) + right.getlevel(threshold) elif isinstance(left, Cluster): return left.getlevel(threshold) + [[right]] elif isinstance(right, Cluster): return [[left]] + right.getlevel(threshold) else: return [[left], [right]] PK’…H>¦µúúcluster/matrix.py# # This is part of "python-cluster". A library to group similar items together. # Copyright (C) 2006 Michel Albert # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by the # Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License # for more details. # You should have received a copy of the GNU Lesser General Public License # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # import logging from multiprocessing import Process, Queue, current_process logger = logging.getLogger(__name__) class Matrix(object): """ Object representation of the item-item matrix. """ def __init__(self, data, combinfunc, symmetric=False, diagonal=None): """ Takes a list of data and generates a 2D-matrix using the supplied combination function to calculate the values. :param data: the list of items. :param combinfunc: the function that is used to calculate teh value in a cell. It has to cope with two arguments. :param symmetric: Whether it will be a symmetric matrix along the diagonal. For example, if the list contains integers, and the combination function is ``abs(x-y)``, then the matrix will be symmetric. :param diagonal: The value to be put into the diagonal. For some functions, the diagonal will stay constant. An example could be the function ``x-y``. Then each diagonal cell will be ``0``. If this value is set to None, then the diagonal will be calculated. """ self.data = data self.combinfunc = combinfunc self.symmetric = symmetric self.diagonal = diagonal def worker(self): """ Multiprocessing task function run by worker processes """ tasks_completed = 0 for task in iter(self.task_queue.get, 'STOP'): col_index, item, item2 = task if not hasattr(item, '__iter__') or isinstance(item, tuple): item = [item] if not hasattr(item2, '__iter__') or isinstance(item2, tuple): item2 = [item2] result = (col_index, self.combinfunc(item, item2)) self.done_queue.put(result) tasks_completed += 1 logger.info("Worker %s performed %s tasks", current_process().name, tasks_completed) def genmatrix(self, num_processes=1): """ Actually generate the matrix :param num_processes: If you want to use multiprocessing to split up the work and run ``combinfunc()`` in parallel, specify ``num_processes > 1`` and this number of workers will be spun up, the work is split up amongst them evenly. """ use_multiprocessing = num_processes > 1 if use_multiprocessing: self.task_queue = Queue() self.done_queue = Queue() self.matrix = [] logger.info("Generating matrix for %s items - O(n^2)", len(self.data)) if use_multiprocessing: logger.info("Using multiprocessing on %s processes!", num_processes) if use_multiprocessing: logger.info("Spinning up %s workers", num_processes) processes = [Process(target=self.worker) for i in range(num_processes)] [process.start() for process in processes] for row_index, item in enumerate(self.data): logger.debug("Generating row %s/%s (%0.2f%%)", row_index, len(self.data), 100.0 * row_index / len(self.data)) row = {} if use_multiprocessing: num_tasks_queued = num_tasks_completed = 0 for col_index, item2 in enumerate(self.data): if self.diagonal is not None and col_index == row_index: # This is a cell on the diagonal row[col_index] = self.diagonal elif self.symmetric and col_index < row_index: # The matrix is symmetric and we are "in the lower left # triangle" - fill this in after (in case of multiprocessing) pass # Otherwise, this cell is not on the diagonal and we do indeed # need to call combinfunc() elif use_multiprocessing: # Add that thing to the task queue! self.task_queue.put((col_index, item, item2)) num_tasks_queued += 1 # Start grabbing the results as we go, so as not to stuff all of # the worker args into memory at once (as Queue.get() is a # blocking operation) if num_tasks_queued > num_processes: col_index, result = self.done_queue.get() row[col_index] = result num_tasks_completed += 1 else: # Otherwise do it here, in line if not hasattr(item, '__iter__') or isinstance(item, tuple): item = [item] if not hasattr(item2, '__iter__') or isinstance(item2, tuple): item2 = [item2] row[col_index] = self.combinfunc(item, item2) if self.symmetric: # One more iteration to get symmetric lower left triangle for col_index, item2 in enumerate(self.data): if col_index >= row_index: break # post-process symmetric "lower left triangle" row[col_index] = self.matrix[col_index][row_index] if use_multiprocessing: # Grab the remaining worker task results while num_tasks_completed < num_tasks_queued: col_index, result = self.done_queue.get() row[col_index] = result num_tasks_completed += 1 row_indexed = [row[index] for index in range(len(self.data))] self.matrix.append(row_indexed) if use_multiprocessing: logger.info("Stopping/joining %s workers", num_processes) [self.task_queue.put('STOP') for i in range(num_processes)] [process.join() for process in processes] logger.info("Matrix generated") def __str__(self): """ Returns a 2-dimensional list of data as text-string which can be displayed to the user. """ # determine maximum length maxlen = 0 colcount = len(self.data[0]) for col in self.data: for cell in col: maxlen = max(len(str(cell)), maxlen) format = " %%%is |" % maxlen format = "|" + format * colcount rows = [format % tuple(row) for row in self.data] return "\n".join(rows) PK’…Hð¦o‘v v cluster/linkage.pyfrom __future__ import division from functools import wraps def cached(fun): """ memoizing decorator for linkage functions. Parameters have been hardcoded (no ``*args``, ``**kwargs`` magic), because, the way this is coded (interchangingly using sets and frozensets) is true for this specific case. For other cases that is not necessarily guaranteed. """ _cache = {} @wraps(fun) def newfun(a, b, distance_function): frozen_a = frozenset(a) frozen_b = frozenset(b) if (frozen_a, frozen_b) not in _cache: result = fun(a, b, distance_function) _cache[(frozen_a, frozen_b)] = result return _cache[(frozen_a, frozen_b)] return newfun @cached def single(a, b, distance_function): """ Given two collections ``a`` and ``b``, this will return the distance of the points which are closest together. ``distance_function`` is used to determine the distance between two elements. Example:: >>> single([1, 2], [3, 4], lambda x, y: abs(x-y)) 1 # (distance between 2 and 3) """ left_a, right_a = min(a), max(a) left_b, right_b = min(b), max(b) result = min(distance_function(left_a, right_b), distance_function(left_b, right_a)) return result @cached def complete(a, b, distance_function): """ Given two collections ``a`` and ``b``, this will return the distance of the points which are farthest apart. ``distance_function`` is used to determine the distance between two elements. Example:: >>> single([1, 2], [3, 4], lambda x, y: abs(x-y)) 3 # (distance between 1 and 4) """ left_a, right_a = min(a), max(a) left_b, right_b = min(b), max(b) result = max(distance_function(left_a, right_b), distance_function(left_b, right_a)) return result @cached def average(a, b, distance_function): """ Given two collections ``a`` and ``b``, this will return the mean of all distances. ``distance_function`` is used to determine the distance between two elements. Example:: >>> single([1, 2], [3, 100], lambda x, y: abs(x-y)) 26 """ distances = [distance_function(x, y) for x in a for y in b] return sum(distances) / len(distances) @cached def uclus(a, b, distance_function): """ Given two collections ``a`` and ``b``, this will return the *median* of all distances. ``distance_function`` is used to determine the distance between two elements. Example:: >>> single([1, 2], [3, 100], lambda x, y: abs(x-y)) 2.5 """ distances = sorted([distance_function(x, y) for x in a for y in b]) midpoint, rest = len(distances) // 2, len(distances) % 2 if not rest: return sum(distances[midpoint-1:midpoint+1]) / 2 else: return distances[midpoint] PK’…Hë°!jKKcluster/__init__.py# # This is part of "python-cluster". A library to group similar items together. # Copyright (C) 2006 Michel Albert # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by the # Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License # for more details. # You should have received a copy of the GNU Lesser General Public License # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # from pkg_resources import resource_string from .method.hierarchical import HierarchicalClustering from .method.kmeans import KMeansClustering from .util import ClusteringError __version__ = resource_string('cluster', 'version.txt').decode('ascii').strip() PK’…HPU ###cluster/util.py# # This is part of "python-cluster". A library to group similar items together. # Copyright (C) 2006 Michel Albert # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by the # Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License # for more details. # You should have received a copy of the GNU Lesser General Public License # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # from __future__ import print_function import logging logger = logging.getLogger(__name__) class ClusteringError(Exception): pass def flatten(L): """ Flattens a list. Example: >>> flatten([a,b,[c,d,[e,f]]]) [a,b,c,d,e,f] """ if not isinstance(L, list): return [L] if L == []: return L return flatten(L[0]) + flatten(L[1:]) def fullyflatten(container): """ Completely flattens out a cluster and returns a one-dimensional set containing the cluster's items. This is useful in cases where some items of the cluster are clusters in their own right and you only want the items. :param container: the container to flatten. """ flattened_items = [] for item in container: if hasattr(item, 'items'): flattened_items = flattened_items + fullyflatten(item.items) else: flattened_items.append(item) return flattened_items def median(numbers): """ Return the median of the list of numbers. see: http://mail.python.org/pipermail/python-list/2004-December/294990.html """ # Sort the list and take the middle element. n = len(numbers) copy = sorted(numbers) if n & 1: # There is an odd number of elements return copy[n // 2] else: return (copy[n // 2 - 1] + copy[n // 2]) / 2.0 def mean(numbers): """ Returns the arithmetic mean of a numeric list. see: http://mail.python.org/pipermail/python-list/2004-December/294990.html """ return float(sum(numbers)) / float(len(numbers)) def minkowski_distance(x, y, p=2): """ Calculates the minkowski distance between two points. :param x: the first point :param y: the second point :param p: the order of the minkowski algorithm. If *p=1* it is equal to the manhatten distance, if *p=2* it is equal to the euclidian distance. The higher the order, the closer it converges to the Chebyshev distance, which has *p=infinity*. """ from math import pow assert len(y) == len(x) assert len(x) >= 1 sum = 0 for i in range(len(x)): sum += abs(x[i] - y[i]) ** p return pow(sum, 1.0 / float(p)) def magnitude(a): "calculates the magnitude of a vecor" from math import sqrt sum = 0 for coord in a: sum += coord ** 2 return sqrt(sum) def dotproduct(a, b): "Calculates the dotproduct between two vecors" assert(len(a) == len(b)) out = 0 for i in range(len(a)): out += a[i] * b[i] return out def centroid(data, method=median): "returns the central vector of a list of vectors" out = [] for i in range(len(data[0])): out.append(method([x[i] for x in data])) return tuple(out) PK’…H9¶š9— — cluster/method/base.py# # This is part of "python-cluster". A library to group similar items together. # Copyright (C) 2006 Michel Albert # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by the # Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License # for more details. # You should have received a copy of the GNU Lesser General Public License # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # class BaseClusterMethod(object): """ The base class of all clustering methods. :param input: a list of objects :distance_function: a function returning the distance - or opposite of similarity ``(distance = -similarity)`` - of two items from the input. In other words, the closer the two items are related, the smaller this value needs to be. With 0 meaning they are exactly the same. .. note:: The distance function should always return the absolute distance between two given items of the list. Say:: distance(input[1], input[4]) = distance(input[4], input[1]) This is very important for the clustering algorithm to work! Naturally, the data returned by the distance function MUST be a comparable datatype, so you can perform arithmetic comparisons on them (``<`` or ``>``)! The simplest examples would be floats or ints. But as long as they are comparable, it's ok. """ def __init__(self, input, distance_function, progress_callback=None): self.distance = distance_function self._input = input # the original input self._data = input[:] # clone the input so we can work with it # without distroying the original data. self.progress_callback = progress_callback def topo(self): """ Returns the structure (topology) of the cluster. See :py:meth:`~cluster.cluster.Cluster.topology` for more information. """ return self.data[0].topology() @property def data(self): """ Returns the data that is currently in process. """ return self._data @property def raw_data(self): """ Returns the raw data (data without being clustered). """ return self._input PK’…H`­\^ÖÖcluster/method/kmeans.py# # This is part of "python-cluster". A library to group similar items together. # Copyright (C) 2006 Michel Albert # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by the # Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License # for more details. # You should have received a copy of the GNU Lesser General Public License # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # from cluster.util import ClusteringError, centroid, minkowski_distance class KMeansClustering(object): """ Implementation of the kmeans clustering method as explained in a tutorial_ by *matteucc*. .. _tutorial: http://www.elet.polimi.it/upload/matteucc/Clustering/tutorial_html/kmeans.html Example: >>> from cluster import KMeansClustering >>> cl = KMeansClustering([(1,1), (2,1), (5,3), ...]) >>> clusters = cl.getclusters(2) :param data: A list of tuples or integers. :param distance: A function determining the distance between two items. Default (if ``None`` is passed): It assumes the tuples contain numeric values and appiles a generalised form of the euclidian-distance algorithm on them. :param equality: A function to test equality of items. By default the standard python equality operator (``==``) is applied. :raises ValueError: if the list contains heterogeneous items or if the distance between items cannot be determined. """ def __init__(self, data, distance=None, equality=None): self.__clusters = [] self.__data = data self.distance = distance self.__initial_length = len(data) self.equality = equality # test if each item is of same dimensions if len(data) > 1 and isinstance(data[0], tuple): control_length = len(data[0]) for item in data[1:]: if len(item) != control_length: raise ValueError("Each item in the data list must have " "the same amount of dimensions. Item " "%r was out of line!" % (item,)) # now check if we need and have a distance function if (len(data) > 1 and not isinstance(data[0], tuple) and distance is None): raise ValueError("You supplied non-standard items but no " "distance function! We cannot continue!") # we now know that we have tuples, and assume therefore that it's # items are numeric elif distance is None: self.distance = minkowski_distance def getclusters(self, count): """ Generates *count* clusters. :param count: The amount of clusters that should be generated. count must be greater than ``1``. :raises ClusteringError: if *count* is out of bounds. """ # only proceed if we got sensible input if count <= 1: raise ClusteringError("When clustering, you need to ask for at " "least two clusters! " "You asked for %d" % count) # return the data straight away if there is nothing to cluster if (self.__data == [] or len(self.__data) == 1 or count == self.__initial_length): return self.__data # It makes no sense to ask for more clusters than data-items available if count > self.__initial_length: raise ClusteringError( "Unable to generate more clusters than " "items available. You supplied %d items, and asked for " "%d clusters." % (self.__initial_length, count)) self.initialise_clusters(self.__data, count) items_moved = True # tells us if any item moved between the clusters, # as we initialised the clusters, we assume that # is the case while items_moved is True: items_moved = False for cluster in self.__clusters: for item in cluster: res = self.assign_item(item, cluster) if items_moved is False: items_moved = res return self.__clusters def assign_item(self, item, origin): """ Assigns an item from a given cluster to the closest located cluster. :param item: the item to be moved. :param origin: the originating cluster. """ closest_cluster = origin for cluster in self.__clusters: if self.distance(item, centroid(cluster)) < self.distance( item, centroid(closest_cluster)): closest_cluster = cluster if id(closest_cluster) != id(origin): self.move_item(item, origin, closest_cluster) return True else: return False def move_item(self, item, origin, destination): """ Moves an item from one cluster to anoter cluster. :param item: the item to be moved. :param origin: the originating cluster. :param destination: the target cluster. """ if self.equality: item_index = 0 for i, element in enumerate(origin): if self.equality(element, item): item_index = i break else: item_index = origin.index(item) destination.append(origin.pop(item_index)) def initialise_clusters(self, input_, clustercount): """ Initialises the clusters by distributing the items from the data. evenly across n clusters :param input_: the data set (a list of tuples). :param clustercount: the amount of clusters (n). """ # initialise the clusters with empty lists self.__clusters = [] for _ in range(clustercount): self.__clusters.append([]) # distribute the items into the clusters count = 0 for item in input_: self.__clusters[count % clustercount].append(item) count += 1 PK’…HìòsÜ!Ü!cluster/method/hierarchical.py# # This is part of "python-cluster". A library to group similar items together. # Copyright (C) 2006 Michel Albert # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by the # Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License # for more details. # You should have received a copy of the GNU Lesser General Public License # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # from functools import partial import logging from cluster.cluster import Cluster from cluster.matrix import Matrix from cluster.method.base import BaseClusterMethod from cluster.linkage import single, complete, average, uclus logger = logging.getLogger(__name__) class HierarchicalClustering(BaseClusterMethod): """ Implementation of the hierarchical clustering method as explained in a tutorial_ by *matteucc*. Object prerequisites: * Items must be sortable (See `issue #11`_) * Items must be hashable. .. _issue #11: https://github.com/exhuma/python-cluster/issues/11 .. _tutorial: http://www.elet.polimi.it/upload/matteucc/Clustering/tutorial_html/hierarchical.html Example: >>> from cluster import HierarchicalClustering >>> # or: from cluster import * >>> cl = HierarchicalClustering([123,334,345,242,234,1,3], lambda x,y: float(abs(x-y))) >>> cl.getlevel(90) [[345, 334], [234, 242], [123], [3, 1]] Note that all of the returned clusters are more than 90 (``getlevel(90)``) apart. See :py:class:`~cluster.method.base.BaseClusterMethod` for more details. :param data: The collection of items to be clustered. :param distance_function: A function which takes two elements of ``data`` and returns a distance between both elements. :param linkage: The method used to determine the distance between two clusters. See :py:meth:`~.HierarchicalClustering.set_linkage_method` for possible values. :param num_processes: If you want to use multiprocessing to split up the work and run ``genmatrix()`` in parallel, specify num_processes > 1 and this number of workers will be spun up, the work split up amongst them evenly. :param progress_callback: A function to be called on each iteration to publish the progress. The function is called with two integer arguments which represent the total number of elements in the cluster, and the remaining elements to be clustered. """ def __init__(self, data, distance_function, linkage=None, num_processes=1, progress_callback=None): if not linkage: linkage = single logger.info("Initializing HierarchicalClustering object with linkage " "method %s", linkage) BaseClusterMethod.__init__(self, sorted(data), distance_function) self.set_linkage_method(linkage) self.num_processes = num_processes self.progress_callback = progress_callback self.__cluster_created = False def publish_progress(self, total, current): """ If a progress function was supplied, this will call that function with the total number of elements, and the remaining number of elements. :param total: The total number of elements. :param remaining: The remaining number of elements. """ if self.progress_callback: self.progress_callback(total, current) def set_linkage_method(self, method): """ Sets the method to determine the distance between two clusters. :param method: The method to use. It can be one of ``'single'``, ``'complete'``, ``'average'`` or ``'uclus'``, or a callable. The callable should take two collections as parameters and return a distance value between both collections. """ if method == 'single': self.linkage = single elif method == 'complete': self.linkage = complete elif method == 'average': self.linkage = average elif method == 'uclus': self.linkage = uclus elif hasattr(method, '__call__'): self.linkage = method else: raise ValueError('distance method must be one of single, ' 'complete, average of uclus') def cluster(self, matrix=None, level=None, sequence=None): """ Perform hierarchical clustering. :param matrix: The 2D list that is currently under processing. The matrix contains the distances of each item with each other :param level: The current level of clustering :param sequence: The sequence number of the clustering """ logger.info("Performing cluster()") if matrix is None: # create level 0, first iteration (sequence) level = 0 sequence = 0 matrix = [] # if the matrix only has two rows left, we are done linkage = partial(self.linkage, distance_function=self.distance) initial_element_count = len(self._data) while len(matrix) > 2 or matrix == []: item_item_matrix = Matrix(self._data, linkage, True, 0) item_item_matrix.genmatrix(self.num_processes) matrix = item_item_matrix.matrix smallestpair = None mindistance = None rowindex = 0 # keep track of where we are in the matrix # find the minimum distance for row in matrix: cellindex = 0 # keep track of where we are in the matrix for cell in row: # if we are not on the diagonal (which is always 0) # and if this cell represents a new minimum... cell_lt_mdist = cell < mindistance if mindistance else False if ((rowindex != cellindex) and (cell_lt_mdist or smallestpair is None)): smallestpair = (rowindex, cellindex) mindistance = cell cellindex += 1 rowindex += 1 sequence += 1 level = matrix[smallestpair[1]][smallestpair[0]] cluster = Cluster(level, self._data[smallestpair[0]], self._data[smallestpair[1]]) # maintain the data, by combining the the two most similar items # in the list we use the min and max functions to ensure the # integrity of the data. imagine: if we first remove the item # with the smaller index, all the rest of the items shift down by # one. So the next index will be wrong. We could simply adjust the # value of the second "remove" call, but we don't know the order # in which they come. The max and min approach clarifies that self._data.remove(self._data[max(smallestpair[0], smallestpair[1])]) # remove item 1 self._data.remove(self._data[min(smallestpair[0], smallestpair[1])]) # remove item 2 self._data.append(cluster) # append item 1 and 2 combined self.publish_progress(initial_element_count, len(self._data)) # all the data is in one single cluster. We return that and stop self.__cluster_created = True logger.info("Call to cluster() is complete") return def getlevel(self, threshold): """ Returns all clusters with a maximum distance of *threshold* in between each other :param threshold: the maximum distance between clusters. See :py:meth:`~cluster.cluster.Cluster.getlevel` """ # if it's not worth clustering, just return the data if len(self._input) <= 1: return self._input # initialize the cluster if not yet done if not self.__cluster_created: self.cluster() return self._data[0].getlevel(threshold) PK’…H¸ØÑ¡HHcluster/method/__init__.py# # This is part of "python-cluster". A library to group similar items together. # Copyright (C) 2006 Michel Albert # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published by the # Free Software Foundation; either version 2.1 of the License, or (at your # option) any later version. # This library is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License # for more details. # You should have received a copy of the GNU Lesser General Public License # along with this library; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # PKÆ’…H«wößää'cluster-1.3.2.dist-info/DESCRIPTION.rstDESCRIPTION =========== .. image:: https://readthedocs.org/projects/python-cluster/badge/?version=latest :target: http://python-cluster.readthedocs.org :alt: Documentation Status python-cluster is a "simple" package that allows to create several groups (clusters) of objects from a list. It's meant to be flexible and able to cluster any object. To ensure this kind of flexibility, you need not only to supply the list of objects, but also a function that calculates the similarity between two of those objects. For simple datatypes, like integers, this can be as simple as a subtraction, but more complex calculations are possible. Right now, it is possible to generate the clusters using a hierarchical clustering and the popular K-Means algorithm. For the hierarchical algorithm there are different "linkage" (single, complete, average and uclus) methods available. Algorithms are based on the document found at http://www.elet.polimi.it/upload/matteucc/Clustering/tutorial_html/ .. note:: The above site is no longer avaialble, but you can still view it in the internet archive at: https://web.archive.org/web/20070912040206/http://home.dei.polimi.it//matteucc/Clustering/tutorial_html/ USAGE ===== A simple python program could look like this:: >>> from cluster import HierarchicalClustering >>> data = [12,34,23,32,46,96,13] >>> cl = HierarchicalClustering(data, lambda x,y: abs(x-y)) >>> cl.getlevel(10) # get clusters of items closer than 10 [96, 46, [12, 13, 23, 34, 32]] >>> cl.getlevel(5) # get clusters of items closer than 5 [96, 46, [12, 13], 23, [34, 32]] Note, that when you retrieve a set of clusters, it immediately starts the clustering process, which is quite complex. If you intend to create clusters from a large dataset, consider doing that in a separate thread. For K-Means clustering it would look like this:: >>> from cluster import KMeansClustering >>> cl = KMeansClustering([(1,1), (2,1), (5,3), ...]) >>> clusters = cl.getclusters(2) The parameter passed to getclusters is the count of clusters generated. .. image:: https://readthedocs.org/projects/python-cluster/badge/?version=latest :target: http://python-cluster.readthedocs.org :alt: Documentation Status PKÆ’…H˵NÁzz%cluster-1.3.2.dist-info/metadata.json{"classifiers": ["Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Intended Audience :: Education", "Intended Audience :: Other Audience", "Intended Audience :: Science/Research", "License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Information Analysis"], "extensions": {"python.details": {"contacts": [{"email": "michel@albert.lu", "name": "Michel Albert", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://github.com/exhuma/python-cluster"}}}, "generator": "bdist_wheel (0.26.0)", "license": "LGPL", "metadata_version": "2.0", "name": "cluster", "summary": "UNKNOWN", "version": "1.3.2"}PKÆ’…H§”PÅ%cluster-1.3.2.dist-info/top_level.txtcluster PKÆ’…Hìndªnncluster-1.3.2.dist-info/WHEELWheel-Version: 1.0 Generator: bdist_wheel (0.26.0) Root-Is-Purelib: true Tag: py2-none-any Tag: py3-none-any PKÆ’…Hòi+ö ö cluster-1.3.2.dist-info/METADATAMetadata-Version: 2.0 Name: cluster Version: 1.3.2 Summary: UNKNOWN Home-page: https://github.com/exhuma/python-cluster Author: Michel Albert Author-email: michel@albert.lu License: LGPL Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Education Classifier: Intended Audience :: Other Audience Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2) Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2 Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Scientific/Engineering :: Information Analysis DESCRIPTION =========== .. image:: https://readthedocs.org/projects/python-cluster/badge/?version=latest :target: http://python-cluster.readthedocs.org :alt: Documentation Status python-cluster is a "simple" package that allows to create several groups (clusters) of objects from a list. It's meant to be flexible and able to cluster any object. To ensure this kind of flexibility, you need not only to supply the list of objects, but also a function that calculates the similarity between two of those objects. For simple datatypes, like integers, this can be as simple as a subtraction, but more complex calculations are possible. Right now, it is possible to generate the clusters using a hierarchical clustering and the popular K-Means algorithm. For the hierarchical algorithm there are different "linkage" (single, complete, average and uclus) methods available. Algorithms are based on the document found at http://www.elet.polimi.it/upload/matteucc/Clustering/tutorial_html/ .. note:: The above site is no longer avaialble, but you can still view it in the internet archive at: https://web.archive.org/web/20070912040206/http://home.dei.polimi.it//matteucc/Clustering/tutorial_html/ USAGE ===== A simple python program could look like this:: >>> from cluster import HierarchicalClustering >>> data = [12,34,23,32,46,96,13] >>> cl = HierarchicalClustering(data, lambda x,y: abs(x-y)) >>> cl.getlevel(10) # get clusters of items closer than 10 [96, 46, [12, 13, 23, 34, 32]] >>> cl.getlevel(5) # get clusters of items closer than 5 [96, 46, [12, 13], 23, [34, 32]] Note, that when you retrieve a set of clusters, it immediately starts the clustering process, which is quite complex. If you intend to create clusters from a large dataset, consider doing that in a separate thread. For K-Means clustering it would look like this:: >>> from cluster import KMeansClustering >>> cl = KMeansClustering([(1,1), (2,1), (5,3), ...]) >>> clusters = cl.getclusters(2) The parameter passed to getclusters is the count of clusters generated. .. image:: https://readthedocs.org/projects/python-cluster/badge/?version=latest :target: http://python-cluster.readthedocs.org :alt: Documentation Status PKÆ’…HV—_F³³cluster-1.3.2.dist-info/RECORDcluster/__init__.py,sha256=thDG1xnK99PGZ4A1b0WhiTL7n28UvnI38IPUuNHp7tc,1099 cluster/cluster.py,sha256=aa4tbD5sopwU2RVMMjsFtHby3V5QIw-O3sUEsrw2o8U,6291 cluster/linkage.py,sha256=Z2t-pFE6wuPqmxFJFaSfssF-Jf-LynPc12jPNiYL58Q,2934 cluster/matrix.py,sha256=eB2Qhfo18W9wgUI8EOd7yqK1cUwKZscRNlDkNOqk3sY,7418 cluster/util.py,sha256=lhZlXmxH18WWGgXFoktv_pI5gHC576eLzshZmnMhYVs,3619 cluster/method/__init__.py,sha256=udGUNnXd1Zvh1TYRU_1EbuW76ZwjCIsRY5om1eCm85Y,840 cluster/method/base.py,sha256=OBjWt_ZDB-SsHmCIrEjEvW_SBtNqc3ijvxzyGmhfv8o,2711 cluster/method/hierarchical.py,sha256=yd3BX4fRotryAnOKlkl3WrWfDs0L4ash1tYmxC5s0tM,8668 cluster/method/kmeans.py,sha256=nhr6RRraLjf35VG45H_mr1RGqH6nPCsGYgf5oXKwFsg,6614 cluster-1.3.2.dist-info/DESCRIPTION.rst,sha256=u4sINojT1KkY7o9Mr63VmEAF7wctl_cBPdVaHXcjFOA,2276 cluster-1.3.2.dist-info/METADATA,sha256=M7IDxf1l8rWxCBvwNr37DxRmVYJgkOrjap7ZL4WrUkE,3062 cluster-1.3.2.dist-info/RECORD,, cluster-1.3.2.dist-info/WHEEL,sha256=GrqQvamwgBV4nLoJe0vhYRSWzWsx7xjlt74FT0SWYfE,110 cluster-1.3.2.dist-info/metadata.json,sha256=BYs_QZI3NK8C0afnrG7zKTybAl2ebcQK8Iw8etEwuAY,890 cluster-1.3.2.dist-info/top_level.txt,sha256=TiVjC3oWnynlod6m-26PCB6EWHOYizQYuAsvf7Waj_k,8 PK’…H¶$•¹““cluster/cluster.pyPK’…H>¦µúúÃcluster/matrix.pyPK’…Hð¦o‘v v ì5cluster/linkage.pyPK’…Hë°!jKK’Acluster/__init__.pyPK’…HPU ###Fcluster/util.pyPK’…H9¶š9— — ^Tcluster/method/base.pyPK’…H`­\^ÖÖ)_cluster/method/kmeans.pyPK’…HìòsÜ!Ü!5ycluster/method/hierarchical.pyPK’…H¸ØÑ¡HHM›cluster/method/__init__.pyPKÆ’…H«wößää'Ížcluster-1.3.2.dist-info/DESCRIPTION.rstPKÆ’…H˵NÁzz%ö§cluster-1.3.2.dist-info/metadata.jsonPKÆ’…H§”PÅ%³«cluster-1.3.2.dist-info/top_level.txtPKÆ’…Hìndªnnþ«cluster-1.3.2.dist-info/WHEELPKÆ’…Hòi+ö ö §¬cluster-1.3.2.dist-info/METADATAPKÆ’…HV—_F³³Û¸cluster-1.3.2.dist-info/RECORDPK;ʽ