"""
This module provides an API for performing GPU-based fingerprint similarity
queries against compound databases on FPsim servers. Two variants of the API are
provided: a query function and the FPsimSearcher class.
The module-level query function can be used to query either all or a specific
subset of the public databases available on a single FPsim server.
The FPsimSearcher class allows multiple servers to be specified, each with
either all public databases or a specific subset. The entire collection of
databases may then be queried in a single call.
The server url should be the base url without the endpoint extension (e.g.
http://fpsim.foo.com/, *not* http://fpsim.foo.com/similarity_search)
"""
import collections
import functools
import urllib
from typing import Union
import pandas
import requests
from schrodinger.infra import licensing
# Sentinel used as the default for ``url`` and ``dbnames`` arguments; it is
# detected with an identity check (``is DEFAULT``) throughout this module.
DEFAULT = 'FPSIM_DEFAULT'
# Default value of the ``max_results`` query argument.
DEFAULT_MAX_RESULTS = 100
# Default value of the ``similarity_cutoff`` query argument.
DEFAULT_SIMILARITY_CUTOFF = 0.5
# Delimiter on which a corporate id string from the server is split.
CORP_ID_SPLITTER = ';:;'
# Marker compared against the first field of a single-row response to detect
# a failed search.
SEARCH_ERROR_STRING = 'SERVER_ERROR_ON_SEARCH'
#===============================================================================
# Functional API
#===============================================================================
def query(smiles,
          url=DEFAULT,
          max_results=DEFAULT_MAX_RESULTS,
          similarity_cutoff=DEFAULT_SIMILARITY_CUTOFF,
          dbnames=DEFAULT):
    """
    Perform an FPSim query for a given smiles on a specific server.

    :param smiles: the query smiles to search against
    :param url: the base FPsim server URL
    :param max_results: max number of matches to return
    :param similarity_cutoff: minimum similarity cutoff for matches, given
        either as a fraction (0 - 1.0) or as a percentage (values above 1 are
        divided by 100 before being sent to the server)
    :param dbnames: names of databases to search. A single name may be passed
        as a plain string. Default: all public databases
    :return: a pandas.DataFrame of similar smiles with corresponding corporate
        ids and similarity scores.
    :raises ServerError: if the server response is not OK
    :raises QueryError: if the server answers with 'Server error'
    :raises SearchError: if the server reports an error for the search itself
    """
    if dbnames is DEFAULT:
        dbnames = get_public_databases(url)
    elif isinstance(dbnames, str):
        # A bare string would otherwise be iterated character by character,
        # producing one bogus single-letter database name per character.
        dbnames = [dbnames]
    else:
        # Materialize once: the names are traversed twice below, which would
        # exhaust a one-shot iterable on the first pass.
        dbnames = list(dbnames)

    # One (empty) key per database name.
    dbkeys = [''] * len(dbnames)
    similarity_cutoff = _convert_cutoff_to_fraction(similarity_cutoff)
    data = {
        'smiles': smiles,
        'return_count': str(max_results),
        'similarity_cutoff': str(similarity_cutoff),
        'dbnames': ','.join(dbnames),
        'dbkeys': ','.join(dbkeys)
    }

    response = _raw_query(url, data)
    if not response.ok:
        raise ServerError(f'Bad server response: {url}')

    json_results = response.json()
    if json_results == 'Server error':
        raise QueryError(f'Problem with query: {data}')
    # A failed search comes back as a single row whose first field is the
    # error marker.
    if (len(json_results) == 1 and json_results[0][0] == SEARCH_ERROR_STRING):
        raise SearchError(f'Problem with search: {smiles}')
    return _json_to_pandas(json_results)
@functools.lru_cache()
def get_public_databases(url=DEFAULT):
    """
    Retrieves a list of database names for the public databases on an FPsim
    server.

    Results are cached per url, so repeated calls return the same list object;
    callers should treat the returned list as read-only.

    :param url: the base FPsim server url. By default this will be the
        Schrodinger FPsim server.
    :return: a list of database names, which may be used as the dbnames
        parameter to a query. A trailing '.fsim' extension is removed from
        each name.
    :raises requests.exceptions.Timeout: when an explicit url is given and the
        get request, which should be instantaneous (up to network delays),
        takes longer than 3s. No timeout is applied for the default server.
    :raises ServerError: if the server response is not OK
    """
    if url is DEFAULT:
        # Also performs license check
        url = licensing.gpusim_current_url(
            licensing.SimilarityEndpoint.DATABASES)
        timeout = None
    else:
        # NOTE(review): the return value is ignored here - confirm that
        # licenseExists enforces the license on its own.
        licensing.licenseExists(licensing.GPUSIMILARITY)
        url = urllib.parse.urljoin(url, 'dbnames')
        timeout = 3

    response = requests.get(url, verify=True, timeout=timeout)
    if not response.ok:
        raise ServerError(f"Can't fetch db list from {url}")

    # Drop the file extension by slicing. str.rstrip() would be wrong here: it
    # strips any trailing run of the characters '.', 'f', 's', 'i', 'm', so a
    # name such as 'chem_sim.fsim' would be truncated to 'chem_'.
    suffix = '.fsim'
    return [
        name[:-len(suffix)] if name.endswith(suffix) else name
        for name in response.json()
    ]
#===============================================================================
# Searcher API
#===============================================================================
class FPsimSearcher:
    """
    A search context for FPsim queries. Specify a collection of data sources on
    the object to perform multiple queries on the same servers and databases.
    """

    def __init__(self, url=DEFAULT, dbnames=DEFAULT):
        """
        Create a searcher, optionally seeded with one source.

        :param url: the base FPsim server url of the initial source. Pass None
            to create a searcher with no sources.
        :param dbnames: the database name(s) to add for that server.
        """
        # Maps each server url to the set of database names to search on it.
        self._server_dbs = collections.defaultdict(set)
        if url is not None:
            self.addSource(url, dbnames)

    def addSource(self, url=DEFAULT, dbnames=DEFAULT):
        """
        Add databases to the searcher specified by URL and database name(s).

        :param url: the base FPsim server url. By default this will be the
            Schrodinger FPsim server.
        :param dbnames: the database name to add for the specified server. This
            may be a single name or a list of names.
        """
        if dbnames is DEFAULT:
            dbnames = get_public_databases(url)
        if isinstance(dbnames, str):
            # Wrap a single name in a one-element set; set(dbnames) would
            # instead split the name into its individual characters.
            dbnames = {dbnames}
        if url:
            self._server_dbs[url].update(dbnames)

    def getSources(self):
        """
        Gets all the sources that have been added to this searcher.

        :return: a dictionary of {url:dbnames}. The default-server placeholder
            is replaced by the actual default url.
        """
        sources = {}
        for url, dbnames in self._server_dbs.items():
            if url is DEFAULT:
                url = _get_default_url()
            sources[url] = dbnames
        return sources

    def query(self,
              smiles,
              max_results=DEFAULT_MAX_RESULTS,
              similarity_cutoff=DEFAULT_SIMILARITY_CUTOFF):
        """
        Perform a query on all the sources in this searcher. Each server will be
        queried in the order it was added via addSource until max_results
        results have been found. The results from each server are simply
        concatenated, with no de-duplication of results.

        See module function query() for parameter docs.
        """
        results = _json_to_pandas([])
        for url, dbnames in self._server_dbs.items():
            df = query(smiles,
                       url,
                       max_results=max_results,
                       similarity_cutoff=similarity_cutoff,
                       dbnames=dbnames)
            results = pandas.concat([results, df])
            # Only ask the remaining servers for what is still missing.
            max_results = max_results - len(df)
            if max_results < 1:
                break
        return results
#===============================================================================
# Exception classes
#===============================================================================
class ServerError(RuntimeError):
    """Raised when an FPsim server returns a non-OK response."""
class QueryError(RuntimeError):
    """Raised when the server answers a query with 'Server error'."""
class SearchError(RuntimeError):
    """Raised when the server reports an error for the search itself."""
#===============================================================================
# Utility functions
#===============================================================================
def _json_to_pandas(json_results):
    """
    Build the results table from the decoded JSON rows of a server response.

    :param json_results: rows to load into the columns 'corp_ids', 'smiles'
        and 'similarity'
    :return: a pandas.DataFrame in which each 'corp_ids' entry has been split
        into a list of ids
    """
    column_names = ['corp_ids', 'smiles', 'similarity']
    frame = pandas.DataFrame(json_results, columns=column_names)
    # Each row arrives with its corporate ids packed into one string.
    split_ids = frame['corp_ids'].apply(_parse_corp_id_str)
    frame['corp_ids'] = split_ids
    return frame
def _parse_corp_id_str(corp_id_str):
    """
    Split a packed corporate id string into its individual ids.

    :param corp_id_str: ids joined by CORP_ID_SPLITTER
    :return: list of the separate id strings
    """
    corp_ids = corp_id_str.split(CORP_ID_SPLITTER)
    return corp_ids
def _get_default_url():
    """
    Return the url that the licensing module reports for the FASTSIM
    similarity endpoint.
    """
    endpoint = licensing.SimilarityEndpoint.FASTSIM
    return licensing.gpusim_current_url(endpoint)
def _raw_query(url, data):
    """
    POST the query data to the similarity search endpoint.

    :param url: the base FPsim server url, or DEFAULT for the default server
    :param data: the form data to send with the request
    :return: the response object from requests.post
    """
    if url is not DEFAULT:
        # Explicit servers get the endpoint name joined onto their base url.
        endpoint_url = urllib.parse.urljoin(url, 'similarity_search_json')
    else:
        endpoint_url = _get_default_url()
    return requests.post(endpoint_url, data, verify=True)
def _convert_cutoff_to_fraction(cutoff: Union[int, float]) -> float:
"""
Return a similarity cutoff between 0 and 1. Converts a percentage
cutoff to a fraction. A fractional cutoff is returned untouched if
already in the range [0.0, 1.0].
This function is required in order to always pass a fractional similarity
cutoff argument into the FPSim search while preserving compatibility with
the previous convention of accepting a percentage.
:param cutoff: minimum similarity cutoff for matches. Assumed to be a number
greater than or equal to 0.
:return: A similarity cutoff between 0 and 1.
"""
# Account for floating point noise
epsilon = 1.e-15
if 0 <= cutoff <= 1.0 + epsilon:
return min(cutoff, 1.0)
return min(cutoff, 100) / 100