# Source code for schrodinger.analysis.cluster
"""
Provides a class for clustering a set of values - for example, 3D coordinates.
@copyright: Schrodinger, LLC. All rights reserved.
"""
from collections import OrderedDict
import numpy
from sklearn.cluster import KMeans
class ClusterValues(object):
    """
    Group a set of values (scalars or coordinate-like rows) with KMeans.

    The clustering is performed once, at construction time; the accessor
    methods only report on the stored result.
    """

    def __init__(self, values, n_clusters=8, **kmeans_args):
        """
        Cluster the specified list of values (e.g. coordinates) into the given
        number of clusters. NOTE: This clustering algorithm is an inherently
        random process, so results from different runs may not be consistent.

        :type values: List or numpy array of values to cluster. Each item can
                be a float or a list of floats (e.g. 3D coordinates).
        :param values: Values to cluster (will be cast into a numpy array)

        :type n_clusters: int
        :param n_clusters: Number of clusters to generate.

        Other arguments are passed directly to KMeans.

        :raise ValueError: If `values` is None or contains no items.
        """
        if values is None:
            raise ValueError("Empty value list specified")

        # Cast to a numpy array (unless already an array). Emptiness is
        # checked on the array rather than via `not values`, because the
        # truth value of a multi-element numpy array is ambiguous and would
        # itself raise.
        values_array = numpy.asarray(values)
        if values_array.size == 0:
            raise ValueError("Empty value list specified")

        # Keep the caller's original object so that getClusteredValues() can
        # hand back the input items themselves, not the casted copies.
        self._input_values = values

        if values_array.ndim == 1:
            # Input items are scalars (ints/floats); KMeans is given a
            # 2D array, so turn the values into a single column.
            values_array = values_array.reshape(values_array.size, 1)

        self.kmeans_instance = KMeans(n_clusters=n_clusters, **kmeans_args)
        self.clust_memberships = self.kmeans_instance.fit_predict(values_array)

    def getClusterMemberships(self):
        """
        Returns a list corresponding to which cluster each value was assigned.
        The length of the list is equal to the number of the input values.
        Each value ranges from 0 to (number of output clusters-1).

        Used by the unit test to verify that the clustering works correctly.
        """
        # Convert the numpy array into a python list to simplify the tests.
        return list(self.clust_memberships)

    def getClusteredValues(self):
        """
        Return a list of clustered values.  Outer list represents clusters,
        each item (cluster) will consist of one or more input values.

        Clusters appear in the order in which their first member occurs in
        the input; within a cluster, values keep their input order.
        """
        clustered_data = OrderedDict()
        for index, cluster in enumerate(self.clust_memberships):
            # setdefault creates the cluster's list on first sight.
            clustered_data.setdefault(cluster, []).append(
                self._input_values[index])
        return list(clustered_data.values())

    def getClusterCenters(self):
        """
        Return a numpy array of cluster centroids.

        When the clustered values are one-dimensional, a flat array with one
        entry per cluster is returned; otherwise the 2D array of centroids
        stored on the KMeans instance is returned as is.
        """
        centers_array = self.kmeans_instance.cluster_centers_
        if centers_array.shape[1] == 1:
            # Clustering scalar values; flatten the single-column array:
            return centers_array.flatten()
        # Clustering 2+ dimensional values
        return centers_array