# Source code for schrodinger.analysis.cluster

"""
Provides a class for clustering a set of values - for example, 3D coordinates.

@copyright: Schrodinger, LLC. All rights reserved.
"""

from collections import OrderedDict

import numpy
from sklearn.cluster import KMeans


class ClusterValues(object):
    """
    Cluster a set of values with KMeans and give access to the results.

    The fitted KMeans object is stored in `kmeans_instance` and the cluster
    label assigned to each input value in `clust_memberships`.
    """

    def __init__(self, values, n_clusters=8, **kmeans_args):
        """
        Cluster the specified list of values (e.g. coordinates) into the
        given number of clusters.

        NOTE: This clustering algorithm is an inherently random process, so
        results from different runs may not be consistent.

        :type values: List or numpy array of values to cluster. Each item can
            be a float or a list of floats (e.g. 3D coordinates).
        :param values: Values to cluster (will be cast into a numpy array)

        :type n_clusters: int
        :param n_clusters: Number of clusters to generate.

        Other arguments are passed directly to KMeans.

        :raise ValueError: If `values` is None or contains no items.
        """
        # Cast to a numpy array *before* testing for emptiness. Evaluating
        # the truth value of the raw input breaks for numpy arrays: it is
        # ambiguous for multi-element arrays and wrongly rejects a
        # one-element array holding zero.
        values_array = None if values is None else numpy.asarray(values)
        if values_array is None or values_array.size == 0:
            raise ValueError("Empty value list specified")

        # Keep the caller's original object so that getClusteredValues()
        # returns the items exactly as they were passed in.
        self._input_values = values

        if values_array.ndim == 1:
            # Input are scalar values (ints/floats); turn them into a
            # single-column 2D array (one row per value) before fitting.
            values_array = values_array.reshape(-1, 1)

        self.kmeans_instance = KMeans(n_clusters=n_clusters, **kmeans_args)
        self.clust_memberships = self.kmeans_instance.fit_predict(values_array)

    def getClusterMemberships(self):
        """
        Returns a list corresponding to which cluster each value was
        assigned. The length of the list is equal to the number of the input
        values. Each value ranges from 0 to (number of output clusters-1).

        Used by the unit test to verify that the clustering works correctly.
        """
        # Convert the numpy array into a python list to simplify the tests.
        return list(self.clust_memberships)

    def getClusteredValues(self):
        """
        Return a list of clustered values. Outer list represents clusters,
        each item (cluster) will consist of one or more input values.

        Clusters are listed in the order in which they are first encountered
        while walking the input values, so the position of a cluster in the
        returned list is not necessarily its cluster label.
        """
        clustered_data = OrderedDict()
        for index, cluster in enumerate(self.clust_memberships):
            clustered_data.setdefault(cluster, []).append(
                self._input_values[index])
        return list(clustered_data.values())

    def getClusterCenters(self):
        """
        Return a numpy array of cluster centroids.

        When the centroids have a single column (scalar input values) a flat
        1D array is returned; otherwise the 2D array is returned unchanged.
        """
        centers_array = self.kmeans_instance.cluster_centers_
        if centers_array.shape[1] == 1:
            # Clustering scalar values; flatten the array:
            return centers_array.flatten()
        # Clustering 2+ dimensional values
        return centers_array