Source code for schrodinger.analysis.cluster
"""
Provides a class for clustering a set of values - for example, 3D coordinates.
@copyright: Schrodinger, LLC. All rights reserved.
"""
from collections import OrderedDict
import numpy
from sklearn.cluster import KMeans
class ClusterValues(object):
    """
    Cluster a collection of values with KMeans and expose the memberships,
    the grouped input values and the cluster centroids.
    """

    def __init__(self, values, n_clusters=8, **kmeans_args):
        """
        Cluster the specified list of values (e.g. coordinates) into the given
        number of clusters. NOTE: This clustering algorithm is an inherently
        random process, so results from different runs may not be consistent
        (unless e.g. a ``random_state`` is passed through ``kmeans_args``).

        :type values: List or numpy array of values to cluster. Each item can
            be a float or a list of floats (e.g. 3D coordinates).
        :param values: Values to cluster (will be cast into a numpy array)

        :type n_clusters: int
        :param n_clusters: Number of clusters to generate.

        Other arguments are passed directly to KMeans.

        :raise ValueError: If ``values`` is None or contains no items.
        """
        if values is None:
            raise ValueError("Empty value list specified")

        # Cast to a numpy array (unless already an array). The emptiness check
        # is done on the array's size rather than on the truthiness of
        # ``values``: ``not values`` raises for multi-element numpy arrays,
        # which are a documented input type.
        values_array = numpy.asarray(values)
        if values_array.size == 0:
            raise ValueError("Empty value list specified")

        # Keep the caller's original object so that getClusteredValues() hands
        # back the input items themselves rather than rows of the cast array.
        self._input_values = values

        if values_array.ndim == 1:
            # If input are scalar values (ints/floats), re-shape the array
            # into a single-feature column, one row per value:
            values_array = values_array.reshape(values_array.size, 1)

        self.kmeans_instance = KMeans(n_clusters=n_clusters, **kmeans_args)
        self.clust_memberships = self.kmeans_instance.fit_predict(values_array)

    def getClusterMemberships(self):
        """
        Returns a list corresponding to which cluster each value was assigned.
        The length of the list is equal to the number of the input values.
        Each value ranges from 0 to (number of output clusters-1).

        Used by the unit test to verify that the clustering works correctly.
        """
        # Convert the numpy array into a python list to simplify the tests.
        return list(self.clust_memberships)

    def getClusteredValues(self):
        """
        Return a list of clustered values. Outer list represents clusters,
        each item (cluster) will consist of one or more input values.

        Clusters are listed in the order in which they are first encountered
        while walking the input values, which is not necessarily the order of
        the cluster indices (and therefore not necessarily the row order of
        getClusterCenters()).
        """
        clustered_data = OrderedDict()
        for index, cluster in enumerate(self.clust_memberships):
            # setdefault creates the cluster's list on first sight, preserving
            # first-appearance ordering of the clusters.
            clustered_data.setdefault(cluster, []).append(
                self._input_values[index])
        return list(clustered_data.values())

    def getClusterCenters(self):
        """
        Return a numpy array of cluster centroids.

        For scalar input values the result is a flat 1D array with one
        centroid per cluster; otherwise it is the 2D array stored by the
        KMeans instance (one row per cluster).
        """
        centers_array = self.kmeans_instance.cluster_centers_
        if centers_array.shape[1] == 1:
            # Clustering scalar values; flatten the array:
            return centers_array.flatten()
        # Clustering 2+ dimensional values
        return centers_array