"""
Support for Canvas fingerprint similarity operations.
There are classes to perform similarity calculations and to support command
line interfaces for similarity options.
Copyright Schrodinger, LLC. All rights reserved.
"""
# Contributors: Quentin McDonald
from collections import OrderedDict
from textwrap import dedent
from schrodinger.infra import canvas
class CanvasSimilarityNotImplemented(Exception):
    """
    Raised when the selected similarity metric has no implementation.
    """

    def __init__(self, *args):
        """
        Create the exception.

        :param args: Positional arguments (typically a single message
            string) forwarded unchanged to `Exception`.
        """
        super(CanvasSimilarityNotImplemented, self).__init__(*args)
############# Canvas classes begin here ##################################
class CanvasFingerprintSimilarity(object):
    """
    A class which encapsulates the Canvas fingerprint similarity tools.

    It records the names of the available similarity metrics, tracks which
    metric is currently selected and evaluates that metric for a pair of
    fingerprints. Every metric is a thin wrapper which delegates to the
    correspondingly named method of the first fingerprint object.
    """

    # Full metric names. Sorted once here so that instances never have to
    # mutate this list, which is shared by every instance of the class.
    SIMILARITY_METRICS = sorted([
        "Tanimoto", "Modified Tanimoto", "Hamming", "Soergel", "McConnaughey",
        "Dice", "Cosine", "Simpson", "Petke", "Kulczynski", "Euclidean",
        "Tversky", "Buser", "Variance", "Size", "Shape", "Pattern Difference",
        "Hamann", "Matching", "Pearson", "Rogers Tanimoto", "Yule", "Dixon",
        "MinMax"
    ])

    def __init__(self, logger):
        """
        Initialize the similarity class.

        :param logger: Object whose `debug` method receives diagnostic
            messages (see `debug`).
        """
        self._logger = logger

        # Create a mapping between "short" names (as might be used
        # in a command line application) and the full metric names.
        # Iterating a sorted view keeps the short names ordered even if a
        # subclass overrides SIMILARITY_METRICS with an unsorted list.
        self.SHORT_SIMILARITY_METRICS = []
        self.SHORT_TO_LONG_NAMES = {}
        for metric in sorted(self.SIMILARITY_METRICS):
            # Lower-case and convert spaces to underscores
            short = metric.lower().replace(" ", "_")
            self.SHORT_SIMILARITY_METRICS.append(short)
            self.SHORT_TO_LONG_NAMES[short] = metric

        # Initialize the Tversky alpha and beta
        self._alpha = 0.5
        self._beta = 0.5

        # Dispatch table which associates each metric name with the
        # bound method that implements it.
        self._metric_funcs = OrderedDict([
            ("Tanimoto", self.simTanimoto),
            ("Tversky", self.simTversky),
            ("Soergel", self.simSoergel),
            ("McConnaughey", self.simMcConnaughey),
            ("Dice", self.simDice),
            ("Cosine", self.simCosine),
            ("Simpson", self.simSimpson),
            ("Petke", self.simPetke),
            ("Kulczynski", self.simKulczynski),
            ("Buser", self.simBuser),
            ("Hamann", self.simHamann),
            ("Matching", self.simMatching),
            ("Pearson", self.simPearson),
            ("Rogers Tanimoto", self.simRogersTanimoto),
            ("Yule", self.simYule),
            ("Euclidean", self.simEuclidean),
            ("Hamming", self.simHamming),
            ("Modified Tanimoto", self.simModifiedTanimoto),
            ("Pattern Difference", self.simPatternDifference),
            ("Shape", self.simShape),
            ("Size", self.simSize),
            ("Variance", self.simVariance),
            ("Dixon", self.simDixon),
            ("MinMax", self.simMinMax),
        ])

        # Default is Tanimoto. This must come after the name tables and
        # the dispatch table are built because setMetric() reads both.
        self.setMetric("Tanimoto")

        # Table which associates metric names with the enums used in
        # pairwise distance matrix generation.
        self._metric_style = OrderedDict([
            ("Tanimoto", canvas.ChmMetricStyle_tanimoto),
            ("Modified Tanimoto", canvas.ChmMetricStyle_modifiedTanimoto),
            ("Hamming", canvas.ChmMetricStyle_hamming),
            ("Soergel", canvas.ChmMetricStyle_soergel),
            ("McConnaughey", canvas.ChmMetricStyle_mcConnaughey),
            ("Dice", canvas.ChmMetricStyle_dice),
            ("Cosine", canvas.ChmMetricStyle_cosine),
            ("Simpson", canvas.ChmMetricStyle_simpson),
            ("Petke", canvas.ChmMetricStyle_petke),
            ("Kulczynski", canvas.ChmMetricStyle_kulczynski),
            ("Euclidean", canvas.ChmMetricStyle_euclidean),
            ("Tversky", canvas.ChmMetricStyle_tversky),
            ("Buser", canvas.ChmMetricStyle_buser),
            ("Variance", canvas.ChmMetricStyle_variance),
            ("Size", canvas.ChmMetricStyle_size),
            ("Shape", canvas.ChmMetricStyle_shape),
            ("Pattern Difference", canvas.ChmMetricStyle_patternDifference),
            ("Hamann", canvas.ChmMetricStyle_hamann),
            ("Matching", canvas.ChmMetricStyle_matching),
            ("Pearson", canvas.ChmMetricStyle_pearson),
            ("Rogers Tanimoto", canvas.ChmMetricStyle_rogersTanimoto),
            ("Yule", canvas.ChmMetricStyle_yule),
            ("Dixon", canvas.ChmMetricStyle_dixon),
            ("MinMax", canvas.ChmMetricStyle_cminmax),
        ])

    def debug(self, output):
        """
        Wrapper for debug logging, just to simplify logging.

        :param output: Message passed to the logger's `debug` method.
        """
        self._logger.debug(output)

    def getDescription(self):
        """
        Return a string summarizing the current similarity settings.

        At present this is just the full name of the current metric.
        """
        return "%s" % (self._current_metric)

    def setMetric(self, metric_name):
        """
        Set the current metric based on the metric name.

        :param metric_name: Either a full metric name (an entry of
            `SIMILARITY_METRICS`) or its short command line form (an
            entry of `SHORT_SIMILARITY_METRICS`).
        :raises ValueError: If the name is not a known metric. The
            current metric is left unchanged in that case.
        """
        # Convert to the long name if necessary:
        name = self.SHORT_TO_LONG_NAMES.get(metric_name, metric_name)
        if name not in self.SIMILARITY_METRICS:
            raise ValueError("Unknown similarity metric name: %s" %
                             metric_name)

        self.debug("FPSim - setting metric to %s" % name)
        self._current_metric = name
        # None when the metric is known but has no implementation;
        # calculateSimilarity() reports that case.
        self._current_metric_func = self._metric_funcs.get(name)

    def getMetric(self):
        """
        Return the full name of the currently set metric.
        """
        return self._current_metric

    def calculateSimilarity(self, fp1, fp2):
        """
        Calculate the similarity between the two fingerprints and return
        the value. The similarity is calculated using the similarity
        method which is current for this object (as set by setMetric()).

        :param fp1: First fingerprint; the metric method is invoked on it.
        :param fp2: Second fingerprint, passed to that method.
        :raises CanvasSimilarityNotImplemented: If the current metric has
            no entry in the dispatch table.
        """
        if not self._current_metric_func:
            raise CanvasSimilarityNotImplemented(
                "No implementation currently available for %s" %
                (self._current_metric))
        return self._current_metric_func(fp1, fp2)

    def setAlpha(self, alpha):
        """
        Set the value of alpha as used in the Tversky similarity.
        """
        self._alpha = alpha

    def setBeta(self, beta):
        """
        Set the value of beta as used in the Tversky similarity.
        """
        self._beta = beta

    def getAlpha(self):
        """
        Get the value of alpha as used in the Tversky similarity.
        """
        return self._alpha

    def getBeta(self):
        """
        Get the value of beta as used in the Tversky similarity.
        """
        return self._beta

    def getMetricStyle(self):
        """
        Return a value corresponding to the current metric style. This
        is used in difference matrix construction as part of clustering.

        The value is the `canvas.ChmMetricStyle_*` entry registered for
        the current metric in `__init__`.
        """
        return self._metric_style[self._current_metric]

    def _getABC(self, fp1, fp2):
        """
        Most similarity methods use three quantities calculated from
        the input fingerprints. This private method calculates these
        so as to avoid duplicated code everywhere.

        :return: Tuple of floats `(a, b, c)` where `a` is `fp1.count()`,
            `b` is `fp2.count()` and `c` is `fp1.countCommonOn(fp2)`.
        """
        a = float(fp1.count())
        b = float(fp2.count())
        # Converted so that all three values really are floats, as
        # documented.
        c = float(fp1.countCommonOn(fp2))
        return (a, b, c)

    # Similarity implementations follow. Each one delegates to a method
    # of the first fingerprint. NOTE(review): the wrappers which call a
    # `dist*` method presumably return a distance rather than a
    # similarity - confirm against the Canvas fingerprint API.

    def simHamming(self, fp1, fp2):
        """Return `fp1.distHamming(fp2)`."""
        return fp1.distHamming(fp2)

    def simModifiedTanimoto(self, fp1, fp2):
        """Return `fp1.simModifiedTanimoto(fp2)`."""
        return fp1.simModifiedTanimoto(fp2)

    def simPatternDifference(self, fp1, fp2):
        """Return `fp1.distPatternDifference(fp2)`."""
        return fp1.distPatternDifference(fp2)

    def simShape(self, fp1, fp2):
        """Return `fp1.distShape(fp2)`."""
        return fp1.distShape(fp2)

    def simSize(self, fp1, fp2):
        """Return `fp1.distSize(fp2)`."""
        return fp1.distSize(fp2)

    def simVariance(self, fp1, fp2):
        """Return `fp1.distVariance(fp2)`."""
        return fp1.distVariance(fp2)

    def simEuclidean(self, fp1, fp2):
        """Return `fp1.distEuclidean(fp2)`."""
        return fp1.distEuclidean(fp2)

    def simTanimoto(self, fp1, fp2):
        """Return `fp1.simTanimoto(fp2)`."""
        return fp1.simTanimoto(fp2)

    def simTversky(self, fp1, fp2):
        """
        Return `fp1.simTversky(fp2, alpha, beta)` using the alpha and
        beta values currently stored on this object.
        """
        return fp1.simTversky(fp2, self._alpha, self._beta)

    def simSoergel(self, fp1, fp2):
        """Return `fp1.distSoergel(fp2)`."""
        return fp1.distSoergel(fp2)

    def simMcConnaughey(self, fp1, fp2):
        """Return `fp1.simMcConnaughey(fp2)`."""
        return fp1.simMcConnaughey(fp2)

    def simDice(self, fp1, fp2):
        """Return `fp1.simDice(fp2)`."""
        return fp1.simDice(fp2)

    def simCosine(self, fp1, fp2):
        """Return `fp1.simCosine(fp2)`."""
        return fp1.simCosine(fp2)

    def simSimpson(self, fp1, fp2):
        """Return `fp1.simSimpson(fp2)`."""
        return fp1.simSimpson(fp2)

    def simPetke(self, fp1, fp2):
        """Return `fp1.simPetke(fp2)`."""
        return fp1.simPetke(fp2)

    def simKulczynski(self, fp1, fp2):
        """Return `fp1.simKulczynski(fp2)`."""
        return fp1.simKulczynski(fp2)

    def simBuser(self, fp1, fp2):
        """Return `fp1.simBuser(fp2)`."""
        return fp1.simBuser(fp2)

    def simHamann(self, fp1, fp2):
        """Return `fp1.simHamann(fp2)`."""
        return fp1.simHamann(fp2)

    def simMatching(self, fp1, fp2):
        """Return `fp1.simMatching(fp2)`."""
        return fp1.simMatching(fp2)

    def simPearson(self, fp1, fp2):
        """Return `fp1.simPearson(fp2)`."""
        return fp1.simPearson(fp2)

    def simRogersTanimoto(self, fp1, fp2):
        """Return `fp1.simRogersTanimoto(fp2)`."""
        return fp1.simRogersTanimoto(fp2)

    def simYule(self, fp1, fp2):
        """Return `fp1.simYule(fp2)`."""
        return fp1.simYule(fp2)

    def simDixon(self, fp1, fp2):
        """Return `fp1.distDixon(fp2)`."""
        return fp1.distDixon(fp2)

    def simMinMax(self, fp1, fp2):
        """Return `fp1.simMinMax(fp2)`."""
        return fp1.simMinMax(fp2)
############# Command line specific classes start here:
class CanvasFingerprintSimilarityCLI(CanvasFingerprintSimilarity):
    """
    A subclass of the CanvasFingerprintSimilarity class which supports
    operations from the command line. In particular the parsing and
    applying of options and the printing of a description of the
    available similarity metrics.
    """

    def __init__(self, logger):
        """
        Initialize the command line similarity class.

        :param logger: Logger object handed to the base class.
        """
        super(CanvasFingerprintSimilarityCLI, self).__init__(logger)

    def addOptions(self, parser):
        """
        Add options for similarity type, alpha and beta.

        :param parser: Parser object providing an argparse-style
            `add_argument` method.
        """
        parser.add_argument("-sim_type",
                            action="store",
                            type=str,
                            default="tanimoto",
                            choices=self.SHORT_SIMILARITY_METRICS,
                            metavar="type",
                            help="Similarity metric to be used")
        # type=float lets the parser reject non-numeric values with a
        # usage error instead of failing later in parseOptions().
        parser.add_argument("-sim_alpha",
                            action="store",
                            type=float,
                            metavar="<alpha>",
                            default=0.5,
                            help="Alpha for Tversky similarity (default = 0.5)")
        parser.add_argument("-sim_beta",
                            action="store",
                            type=float,
                            metavar="<beta>",
                            default=0.5,
                            help="Beta for Tversky similarity (default = 0.5)")

    def parseOptions(self, options):
        """
        Examine the options and set the internal state to reflect
        them.

        :param options: Parsed options object with `sim_type`,
            `sim_alpha` and `sim_beta` attributes.
        """
        self.setMetric(options.sim_type)
        # float() is retained so option objects carrying string values
        # (for example from a parser not set up by addOptions) still work.
        self.setAlpha(float(options.sim_alpha))
        self.setBeta(float(options.sim_beta))

    def getSimilarityMetricDescription(self):
        """
        Return a string which contains a description of the available
        similarity metrics: a header line followed by one line per
        metric giving its full name and, in parentheses, its short name.
        """
        pieces = ["\nAvailable similarity metrics are : \n"]
        pieces.extend(" %s (%s)\n" % (self.SHORT_TO_LONG_NAMES[short], short)
                      for short in self.SHORT_SIMILARITY_METRICS)
        return dedent("".join(pieces))