"""
Utilities for working with SMARTS patterns
Copyright Schrodinger, LLC. All rights reserved.
"""
import string
import warnings
from collections import namedtuple
from rdkit import Chem
from schrodinger.application.matsci import msutils
from schrodinger.structutils import analyze
SMARTSGroupData = namedtuple('SMARTSGroupData',
['number', 'name', 'pattern', 'indexes'],
defaults=[None, None])
GROUP_STRUCT_PROP_BASE = 's_matsci_smarts_group_%s'
GROUP_STRUCT_COUNT = 'i_matsci_smarts_group_matches_%s'
# see MATSCI-9181 - keep these for backwards compatibility
GROUP_NAME_PROP = 's_matsci_smarts_group_name'
GROUP_ATOM_INDEX_PROP = 'i_matsci_smarts_group_atom_index'
GROUP_NUMBER_PROP = 'i_matsci_smarts_group_number'
GROUP_NAMES_PROP = 's_matsci_smarts_group_name'
GROUP_ATOM_INDEXES_PROP = 's_matsci_smarts_group_atom_index'
GROUP_NUMBERS_PROP = 's_matsci_smarts_group_number'
ATOM_PROP_DELIM = ','
VALID_NAME_PUNCTUATION = '_-()[]'
VALID_NAME_CHARS = {
x for x in string.ascii_letters + string.digits + VALID_NAME_PUNCTUATION
}
VALID_NAME_DESC_TEXT = ('letters, numbers and the special characters %s' %
VALID_NAME_PUNCTUATION)
[docs]def defines_integer_atom_props(st):
"""
Return True if the given structure has SMARTS-related integer atom
properties defined. These properties were originally used when each
atom was forced to belong to a single SMARTS group. That condition
has been relaxed but these properties are kept for backwards
compatibility.
:type st: schrodinger.structure.Structure
:param st: the structure
:rtype: bool
:return: True if such properties are defined
"""
return msutils.has_atom_property(st, GROUP_ATOM_INDEX_PROP) or \
msutils.has_atom_property(st, GROUP_NUMBER_PROP)
[docs]def get_group_names(atom):
"""
Return a list of group names for the given atom.
:type atom: schrodinger.structure._StructureAtom
:param atom: the atom
:rtype: list
:return: group names
"""
tokens = atom.property.get(GROUP_NAMES_PROP)
if tokens:
return tokens.split(ATOM_PROP_DELIM)
return []
[docs]def get_group_atom_indices(atom):
"""
Return a list of group atom indices for the given atom.
:type atom: schrodinger.structure._StructureAtom
:param atom: the atom
:rtype: list
:return: group atom indices
"""
index = atom.property.get(GROUP_ATOM_INDEX_PROP)
if index:
return [index]
tokens = atom.property.get(GROUP_ATOM_INDEXES_PROP)
if tokens:
return list(map(int, tokens.split(ATOM_PROP_DELIM)))
return []
[docs]def get_group_numbers(atom):
"""
Return a list of group numbers for the given atom.
:type atom: schrodinger.structure._StructureAtom
:param atom: the atom
:rtype: list
:return: group numbers
"""
number = atom.property.get(GROUP_NUMBER_PROP)
if number:
return [number]
tokens = atom.property.get(GROUP_NUMBERS_PROP)
if tokens:
return list(map(int, tokens.split(ATOM_PROP_DELIM)))
return []
[docs]def append_property(atom, key, value):
"""
Append the given property to the atom.
:type atom: schrodinger.structure._StructureAtom
:param atom: the atom
:type key: str
:param key: the property key
:type value: str
:param value: the property value
"""
old_value = atom.property.get(key)
if old_value:
value = f'{old_value}{ATOM_PROP_DELIM}{value}'
atom.property[key] = str(value)
[docs]def validate_name(name):
"""
Make sure name has the correct set of characters
:type name: str
:param name: The string to check
:rtype: bool
:return: True if name has no invalid characters, False if any characters are
invalid
"""
return all(x in VALID_NAME_CHARS for x in name)
[docs]class SMARTSGroupError(Exception):
""" Class for exceptions related to SMARTS group finding """
[docs]def delete_group_properties(struct):
"""
Delete all SMARTS group properties (structure and atom) from the structure
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to delete properties from
"""
# Delete any structure properties that give group names or count
sbase = GROUP_STRUCT_PROP_BASE % ""
cbase = GROUP_STRUCT_COUNT % ""
tuple_base = (sbase, cbase)
for prop in list(struct.property):
if prop.startswith(tuple_base):
del struct.property[prop]
# Delete any atom-based properties
for atom in struct.atom:
for prop in [
GROUP_NAMES_PROP, GROUP_ATOM_INDEXES_PROP, GROUP_NUMBERS_PROP,
GROUP_ATOM_INDEX_PROP, GROUP_NUMBER_PROP
]:
if prop in atom.property:
del atom.property[prop]
[docs]def find_group_data(struct):
"""
Find an SMARTS group data on the structure
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to find groups on
:rtype: dict
:return: A dictionary. Keys are smarts group numbers, values are
`SMARTSGroupData` named tuples for the SMARTS group with that number
:raise `SMARTSGroupError`: If something in the data is not consistent
"""
groups = {}
for atom in struct.atom:
# validate
gindices = get_group_numbers(atom)
if not gindices:
continue
names = get_group_names(atom)
if not names:
raise SMARTSGroupError('Atom has at least a single '
'group index but no group names.')
if len(gindices) != len(names):
raise SMARTSGroupError('Atom belongs to %s groups but '
'has %s group names.' %
(len(gindices), len(names)))
for name in names:
pattern = struct.property.get(GROUP_STRUCT_PROP_BASE % name)
if not pattern:
raise SMARTSGroupError('There is no pattern specified '
'for group name %s.' % name)
# build groups
for gindex, name in zip(gindices, names):
if gindex in groups:
# New atom for existing group
group = groups[gindex]
if name != group.name:
raise SMARTSGroupError('At least two names are used for '
'group number %d: %s and %s' %
(gindex, name, group.name))
groups[gindex].indexes.append(atom.index)
else:
# New group
pattern = struct.property.get(GROUP_STRUCT_PROP_BASE % name)
inds = [atom.index]
groups[gindex] = SMARTSGroupData(number=gindex,
name=name,
pattern=pattern,
indexes=inds)
# Test that all groups with the same name have the same number of atoms
numat = {}
for group in groups.values():
total = numat.setdefault(group.name, len(group.indexes))
if len(group.indexes) != total:
raise SMARTSGroupError('Group %s has an inconsistent number of '
'atoms. Found groups with this name '
'containing %d and %d atoms.' %
(group.name, len(group.indexes), total))
return groups
[docs]def get_rdkit_atoms(smarts):
"""
Return a collection of rdkit atoms for the given
SMARTS. The return value has the length of a potential
match group, for example for 'cc' this length is 2, for
'[$([NH]([CH2])[CH2])]C' it is 2, for [n-0X2].[n-0X2] it
is 2, etc., even though there might be any number of
matches if the pattern was matched.
:type smarts: str
:param smarts: the SMARTS pattern
:raise RuntimeError: if rdkit has a problem with the SMARTS
:rtype: rdkit.Chem.rdchem._ROAtomSeq
:return: the rdkit atoms
"""
# recursive and component SMARTS supported
amol = Chem.MolFromSmarts(smarts)
if amol is None:
msg = ('Rdkit fails for the given SMARTS of '
'{smarts}.').format(smarts=smarts)
raise RuntimeError(msg)
return amol.GetAtoms()
[docs]def is_smarts_bonding_pair(smarts):
"""
Return True if the given SMARTS would match a bonding pair,
False otherwise.
:type smarts: str
:param smarts: the SMARTS pattern
:rtype: bool
:return: True if the SMARTS would match a bonding pair,
False otherwise
"""
# the smarts could be a component pattern like [n-0X2].[n-0X2]
# which is a non-bonding pair
atoms = get_rdkit_atoms(smarts)
if len(atoms) != 2:
return False
bonds = atoms[0].GetBonds()
return bool(bonds)
[docs]class SMARTSGroup(object):
"""
Handles matching and record-keeping for a SMARTS patter
"""
[docs] def __init__(self, name, pattern, logger=None):
"""
Create a SMARTSGroup object
:type name: str
:param name: The name of this SMARTS group
:type pattern: str
:param pattern: The SMARTS pattern for this group
:raise ValueError: If name has invalid characters
:raise ValueError: If the SMARTS is invalid
"""
if not validate_name(name):
raise ValueError(
'The SMARTS group name, %s, should only contain %s' %
(name, VALID_NAME_DESC_TEXT))
with warnings.catch_warnings():
warnings.filterwarnings(action="ignore",
category=DeprecationWarning,
message=r"analyze\.validate_smarts")
valid, msg = analyze.validate_smarts(pattern)
if not valid:
raise ValueError('The SMARTS pattern %s is not valid. Error was: '
'\n%s' % (pattern, msg))
self.name = name
self.pattern = pattern
self.sprop = GROUP_STRUCT_PROP_BASE % self.name
self.cprop = GROUP_STRUCT_COUNT % self.name
self.last_number = 0
self.logger = logger
[docs] def nextNumber(self, numbers_used):
"""
Get the next unused group number
:type numbers_used: set
:param numbers_used: Each member is a number that has already been used
for a group and is unavailable. The number returned by this function
is added to the numbers_used set.
:rtype: int
:return: The lowest available number. This number will have been added
to the numbers_used set.
"""
while True:
self.last_number += 1
if self.last_number not in numbers_used:
numbers_used.add(self.last_number)
return self.last_number
[docs] def orderedMatches(self, struct, backbone_atoms):
"""
Evaluate the smarts pattern matches, where matches are ordered to follow
network sequence. Consider backbone atoms matches first in the sequence
and the side chian matches are then ordered according to atom index.
:param `schrodinger.structure.Structure` struct: The structure to
delete properties from
:param dict backbone_atoms: dictionary with key as molecule number and
backbone atoms index ordered in a list
:return list(list): List of list containing the smart pattern matches
"""
sorted_matches = []
# Find matches in for the smart pattern
with warnings.catch_warnings():
warnings.filterwarnings(action="ignore",
category=DeprecationWarning,
message=r"analyze\.evaluate_smarts")
matches = analyze.evaluate_smarts_by_molecule(struct,
self.pattern,
canvas=False,
matches_by_mol=True,
unique_sets=True)
for mol_num in sorted(matches):
# get matches and backbone atoms for the molecule
mol_matches = matches[mol_num]
mol_back_atoms = backbone_atoms.get(mol_num, [])
backbone_positions = {y: x for x, y in enumerate(mol_back_atoms)}
# List to hold positions for matches where at least one atom is in
# in the backbone
pos_backbone_matches = []
# List to hold positions for matches where there is no match in the
# backbone of the molecule
index_sidechain_matches = []
for match in mol_matches:
if not match:
continue
# Loop through atom indexes of the smarts match and store their
# postion in the backbone
match_bb_positions = [
backbone_positions[x]
for x in match
if x in backbone_positions
]
if match_bb_positions:
pos_backbone_matches.append(
(min(match_bb_positions), match))
else:
# In case none of the match atoms are in backbone, use the
# lowest atom index of the match
index_sidechain_matches.append((min(match), match))
for value_match in [pos_backbone_matches, index_sidechain_matches]:
sorted_matches.extend([y for x, y in sorted(value_match)])
return sorted_matches
[docs] def match(self,
struct,
numbers_used,
backbone_atoms,
allow_partial_overlap=False):
"""
Find all the SMARTS groups matching the SMARTS pattern and mark them
with the appropriate properties
:param `schrodinger.structure.Structure` struct: The structure to
delete properties from
:param set numbers_used: A set of all the group numbers that have been
used and are unavailable
:param dict backbone_atoms: dictionary with key as molecule number and
backbone atoms index ordered in a list
:param bool allow_partial_overlap: whether atoms can belong to multiple
groups
"""
self.last_number = 0
if self.logger:
self.logger('Matching Group: name = %s, pattern = %s' %
(self.name, self.pattern))
matches = self.orderedMatches(struct, backbone_atoms)
msg = '%d matches found' % len(matches)
valid_matches = 0
for match in matches:
# some backwards compatibility is supported but not here
# when allowing partial overlaps
incompatible = False
smarts_group_data_dict = {}
for idx in match:
atom = struct.atom[idx]
if allow_partial_overlap and (
atom.property.get(GROUP_ATOM_INDEX_PROP) or
atom.property.get(GROUP_NUMBER_PROP)):
incompatible = True
break
for group_name, group_number in zip(get_group_names(atom),
get_group_numbers(atom)):
smarts_group_data = SMARTSGroupData(name=group_name,
number=group_number)
smarts_group_data_dict.setdefault(smarts_group_data,
[]).append(idx)
if incompatible:
continue
# if not allowing partial overlaps then none of the atoms
# in the match are allowed to have already been matched
if not allow_partial_overlap and smarts_group_data_dict:
continue
# if allowing partial overlap and (1) all atoms have already been
# matched to the same SMARTS group or (2) all atoms except a single
# H atom have already been matched to the same SMARTS group, then
# skip the match, the former allows creating a group from some atoms
# already matched to some group and other atoms already matched to
# some other group but prevents creating a group that is a sub-group
# of another group, the latter prevents matching a terminal methyl 3
# times as is the case for a polymer head or tail monomer with a
# terminating hydrogen
if allow_partial_overlap:
skip = False
for idxs in smarts_group_data_dict.values():
if len(idxs) == len(match):
skip = True
elif len(idxs) + 1 == len(match):
idx = list(set(match).difference(idxs))[0]
if struct.atom[idx].atomic_number == 1:
skip = True
if skip:
break
if skip:
continue
valid_matches += 1
number = self.nextNumber(numbers_used)
for match_index, atom_index in enumerate(match, 1):
atom = struct.atom[atom_index]
append_property(atom, GROUP_NAMES_PROP, self.name)
append_property(atom, GROUP_ATOM_INDEXES_PROP, match_index)
append_property(atom, GROUP_NUMBERS_PROP, number)
if valid_matches:
# last_number will be > 0 if there were any valid matches
struct.property[self.sprop] = self.pattern
struct.property[self.cprop] = valid_matches
if matches:
msg += ', %d were tagged as new particles' % valid_matches
if self.logger:
self.logger(msg)