"""
Utilities for working with SMARTS patterns
Copyright Schrodinger, LLC. All rights reserved.
"""
import string
import warnings
from collections import defaultdict
from collections import namedtuple
from rdkit import Chem
from schrodinger.application.matsci import msprops
from schrodinger.application.matsci import msutils
from schrodinger.structutils import analyze
SMARTSGroupData = namedtuple('SMARTSGroupData',
['number', 'name', 'pattern', 'indexes'],
defaults=[None, None])
GROUP_STRUCT_PROP_BASE = 's_matsci_smarts_group_%s'
GROUP_STRUCT_COUNT = 'i_matsci_smarts_group_matches_%s'
# see MATSCI-9181 - keep these for backwards compatibility
GROUP_NAME_PROP = 's_matsci_smarts_group_name'
GROUP_ATOM_INDEX_PROP = 'i_matsci_smarts_group_atom_index'
GROUP_NUMBER_PROP = 'i_matsci_smarts_group_number'
GROUP_NAMES_PROP = 's_matsci_smarts_group_name'
GROUP_ATOM_INDEXES_PROP = 's_matsci_smarts_group_atom_index'
GROUP_NUMBERS_PROP = 's_matsci_smarts_group_number'
ATOM_PROP_DELIM = ','
VALID_NAME_PUNCTUATION = '_-()[]'
VALID_NAME_CHARS = {
x for x in string.ascii_letters + string.digits + VALID_NAME_PUNCTUATION
}
VALID_NAME_DESC_TEXT = ('letters, numbers and the special characters %s' %
VALID_NAME_PUNCTUATION)
[docs]def defines_integer_atom_props(st):
"""
Return True if the given structure has SMARTS-related integer atom
properties defined. These properties were originally used when each
atom was forced to belong to a single SMARTS group. That condition
has been relaxed but these properties are kept for backwards
compatibility.
:type st: schrodinger.structure.Structure
:param st: the structure
:rtype: bool
:return: True if such properties are defined
"""
return msutils.has_atom_property(st, GROUP_ATOM_INDEX_PROP) or \
msutils.has_atom_property(st, GROUP_NUMBER_PROP)
[docs]def get_group_names(atom):
"""
Return a list of group names for the given atom.
:type atom: schrodinger.structure._StructureAtom
:param atom: the atom
:rtype: list
:return: group names
"""
tokens = atom.property.get(GROUP_NAMES_PROP)
if tokens:
return tokens.split(ATOM_PROP_DELIM)
return []
[docs]def get_group_atom_indices(atom):
"""
Return a list of group atom indices for the given atom.
:type atom: schrodinger.structure._StructureAtom
:param atom: the atom
:rtype: list
:return: group atom indices
"""
index = atom.property.get(GROUP_ATOM_INDEX_PROP)
if index:
return [index]
tokens = atom.property.get(GROUP_ATOM_INDEXES_PROP)
if tokens:
return list(map(int, tokens.split(ATOM_PROP_DELIM)))
return []
[docs]def get_group_numbers(atom):
"""
Return a list of group numbers for the given atom.
:type atom: schrodinger.structure._StructureAtom
:param atom: the atom
:rtype: list
:return: group numbers
"""
number = atom.property.get(GROUP_NUMBER_PROP)
if number:
return [number]
tokens = atom.property.get(GROUP_NUMBERS_PROP)
if tokens:
return list(map(int, tokens.split(ATOM_PROP_DELIM)))
return []
[docs]def append_property(atom, key, value):
"""
Append the given property to the atom.
:type atom: schrodinger.structure._StructureAtom
:param atom: the atom
:type key: str
:param key: the property key
:type value: str
:param value: the property value
"""
old_value = atom.property.get(key)
if old_value:
value = f'{old_value}{ATOM_PROP_DELIM}{value}'
atom.property[key] = str(value)
[docs]def validate_name(name):
"""
Make sure name has the correct set of characters
:type name: str
:param name: The string to check
:rtype: bool
:return: True if name has no invalid characters, False if any characters are
invalid
"""
return all(x in VALID_NAME_CHARS for x in name)
[docs]class SMARTSGroupError(Exception):
""" Class for exceptions related to SMARTS group finding """
[docs]def delete_group_properties(struct):
"""
Delete all SMARTS group properties (structure and atom) from the structure
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to delete properties from
"""
# Delete any structure properties that give group names or count
sbase = GROUP_STRUCT_PROP_BASE % ""
cbase = GROUP_STRUCT_COUNT % ""
tuple_base = (sbase, cbase)
for prop in list(struct.property):
if prop.startswith(tuple_base):
del struct.property[prop]
# Delete any atom-based properties
for atom in struct.atom:
for prop in [
GROUP_NAMES_PROP, GROUP_ATOM_INDEXES_PROP, GROUP_NUMBERS_PROP,
GROUP_ATOM_INDEX_PROP, GROUP_NUMBER_PROP
]:
if prop in atom.property:
del atom.property[prop]
[docs]def find_group_data(struct):
"""
Find an SMARTS group data on the structure
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to find groups on
:rtype: dict
:return: A dictionary. Keys are smarts group numbers, values are
`SMARTSGroupData` named tuples for the SMARTS group with that number
:raise `SMARTSGroupError`: If something in the data is not consistent
"""
groups = {}
for atom in struct.atom:
# validate
gindices = get_group_numbers(atom)
if not gindices:
continue
names = get_group_names(atom)
if not names:
raise SMARTSGroupError('Atom has at least a single '
'group index but no group names.')
if len(gindices) != len(names):
raise SMARTSGroupError('Atom belongs to %s groups but '
'has %s group names.' %
(len(gindices), len(names)))
for name in names:
pattern = struct.property.get(GROUP_STRUCT_PROP_BASE % name)
if not pattern:
raise SMARTSGroupError('There is no pattern specified '
'for group name %s.' % name)
# build groups
for gindex, name in zip(gindices, names):
if gindex in groups:
# New atom for existing group
group = groups[gindex]
if name != group.name:
raise SMARTSGroupError('At least two names are used for '
'group number %d: %s and %s' %
(gindex, name, group.name))
groups[gindex].indexes.append(atom.index)
else:
# New group
pattern = struct.property.get(GROUP_STRUCT_PROP_BASE % name)
inds = [atom.index]
groups[gindex] = SMARTSGroupData(number=gindex,
name=name,
pattern=pattern,
indexes=inds)
# Test that all groups with the same name have the same number of atoms
numat = {}
for group in groups.values():
total = numat.setdefault(group.name, len(group.indexes))
if len(group.indexes) != total:
raise SMARTSGroupError('Group %s has an inconsistent number of '
'atoms. Found groups with this name '
'containing %d and %d atoms.' %
(group.name, len(group.indexes), total))
return groups
[docs]def get_rdkit_atoms(smarts):
"""
Return a collection of rdkit atoms for the given
SMARTS. The return value has the length of a potential
match group, for example for 'cc' this length is 2, for
'[$([NH]([CH2])[CH2])]C' it is 2, for [n-0X2].[n-0X2] it
is 2, etc., even though there might be any number of
matches if the pattern was matched.
:type smarts: str
:param smarts: the SMARTS pattern
:raise RuntimeError: if rdkit has a problem with the SMARTS
:rtype: rdkit.Chem.rdchem._ROAtomSeq
:return: the rdkit atoms
"""
# recursive and component SMARTS supported
amol = Chem.MolFromSmarts(smarts)
if amol is None:
msg = ('Rdkit fails for the given SMARTS of '
'{smarts}.').format(smarts=smarts)
raise RuntimeError(msg)
return amol.GetAtoms()
[docs]def is_smarts_bonding_pair(smarts):
"""
Return True if the given SMARTS would match a bonding pair,
False otherwise.
:type smarts: str
:param smarts: the SMARTS pattern
:rtype: bool
:return: True if the SMARTS would match a bonding pair,
False otherwise
"""
# the smarts could be a component pattern like [n-0X2].[n-0X2]
# which is a non-bonding pair
atoms = get_rdkit_atoms(smarts)
if len(atoms) != 2:
return False
bonds = atoms[0].GetBonds()
return bool(bonds)
[docs]class SMARTSGroup(object):
"""
Handles matching and record-keeping for a SMARTS patter
"""
[docs] def __init__(self, name, pattern, logger=None):
"""
Create a SMARTSGroup object
:type name: str
:param name: The name of this SMARTS group
:type pattern: str
:param pattern: The SMARTS pattern for this group
:raise ValueError: If name has invalid characters
:raise ValueError: If the SMARTS is invalid
"""
if not validate_name(name):
raise ValueError(
'The SMARTS group name, %s, should only contain %s' %
(name, VALID_NAME_DESC_TEXT))
with warnings.catch_warnings():
warnings.filterwarnings(action="ignore",
category=DeprecationWarning,
message=r"analyze\.validate_smarts")
valid, msg = analyze.validate_smarts(pattern)
if not valid:
raise ValueError('The SMARTS pattern %s is not valid. Error was: '
'\n%s' % (pattern, msg))
self.name = name
self.pattern = pattern
self.sprop = GROUP_STRUCT_PROP_BASE % self.name
self.cprop = GROUP_STRUCT_COUNT % self.name
self.last_number = 0
self.logger = logger
[docs] def nextNumber(self, numbers_used):
"""
Get the next unused group number
:type numbers_used: set
:param numbers_used: Each member is a number that has already been used
for a group and is unavailable. The number returned by this function
is added to the numbers_used set.
:rtype: int
:return: The lowest available number. This number will have been added
to the numbers_used set.
"""
while True:
self.last_number += 1
if self.last_number not in numbers_used:
numbers_used.add(self.last_number)
return self.last_number
[docs] def prioritizeBackbone(self, matches, backbone_atoms):
"""
Prioritize matches that are in backbone of the molecule
:param matches: List of list containing the smart pattern matches
:type matches: list
:type backbone_atoms: dict
:param backbone_atoms: dictionary with key as molecule number and
backbone atoms index ordered in a list
:returns: List of list containing the smart pattern matches where the
matches in the backbone appears first
:rtype: list
"""
sorted_matches = []
for mol_num in sorted(matches):
# get matches and backbone atoms for the molecule
mol_matches = matches[mol_num]
mol_back_atoms = backbone_atoms.get(mol_num, [])
backbone_positions = {y: x for x, y in enumerate(mol_back_atoms)}
# List to hold positions for matches where at least one atom is in
# in the backbone
pos_backbone_matches = []
# List to hold positions for matches where there is no match in the
# backbone of the molecule
index_sidechain_matches = []
for match in mol_matches:
if not match:
continue
# Loop through atom indexes of the smarts match and store their
# postion in the backbone
match_bb_positions = [
backbone_positions[x]
for x in match
if x in backbone_positions
]
if match_bb_positions:
pos_backbone_matches.append(
(min(match_bb_positions), match))
else:
# In case none of the match atoms are in backbone, use the
# lowest atom index of the match
index_sidechain_matches.append((min(match), match))
for value_match in [pos_backbone_matches, index_sidechain_matches]:
sorted_matches.extend([y for x, y in sorted(value_match)])
return sorted_matches
def _getUniqueMonomersInMatches(self, struct, matches):
"""
Split the matches into dictionaries depending on the number of unique
monomer constituting the match.
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to find SMARTS pattern match in
:param matches: List of list containing the smart pattern matches
:type matches: list
:returns: Dictionary where the values are list of matches and the key is
the number of unique monomers to which the match belongs to.
:rtype: dict
"""
monomers_idx_matches = defaultdict(list)
for match in matches:
# Find the monomer number of atom ids in the match
monomer_numbers = set()
for idx in match:
monomer_num = struct.atom[idx].property.get(
msprops.MONOMER_NUMBER)
if monomer_num is not None:
monomer_numbers.add(monomer_num)
monomers_idx_matches[len(monomer_numbers)].append(match)
return dict(monomers_idx_matches)
[docs] def prioritizeSameMonomers(self, struct, matches):
"""
Prioritize matches that belong to same (or least) number of unique
monomers.
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to find SMARTS pattern match in
:param matches: List of list containing the smart pattern matches
:type matches: list
:returns: List of list containing the smart pattern matches where the
matches that belong to same (or least) number of unique monomers
appear first
:rtype: list
"""
monomers_idx_matches = self._getUniqueMonomersInMatches(struct, matches)
sorted_matches = []
for num_monomers in sorted(monomers_idx_matches):
sorted_matches.extend(monomers_idx_matches[num_monomers])
return sorted_matches
[docs] def orderedMatches(self, struct, backbone_atoms):
"""
Evaluate the smarts pattern matches, where matches are ordered to follow
network sequence. Consider backbone atoms matches first in the sequence
and the side chain matches are then ordered according to atom index.
:param `schrodinger.structure.Structure` struct: The structure to
delete properties from
:param dict backbone_atoms: dictionary with key as molecule number and
backbone atoms index ordered in a list
:return list(list): List of list containing the smart pattern matches
"""
# Find matches in for the smart pattern
with warnings.catch_warnings():
warnings.filterwarnings(action="ignore",
category=DeprecationWarning,
message=r"analyze\.evaluate_smarts")
matches = analyze.evaluate_smarts_by_molecule(struct,
self.pattern,
canvas=False,
matches_by_mol=True,
unique_sets=True)
backbone_sorted_matches = self.prioritizeBackbone(
matches, backbone_atoms)
monomer_prioritized_matches = self.prioritizeSameMonomers(
struct, backbone_sorted_matches)
return monomer_prioritized_matches
[docs] def getSmartsDictData(self, struct, match, allow_partial_overlap):
"""
Gets the smarts dictionary data for the passed match
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to which the SMARTS match belongs to
:param match: The list of atom indices that belong to the SMARTS match
:type match: list
:param allow_partial_overlap: Whether partial overlap is allowed
:type allow_partial_overlap: bool
:return: Dictionary where the key is a SMARTSGroupData and the value
is the list of atom indices that belong to the group
:rtype: dict
:raises ValueError: If overlapping is not supported due to backwards
incompatibility
"""
smarts_group_data_dict = defaultdict(list)
for idx in match:
atom = struct.atom[idx]
if allow_partial_overlap and (
atom.property.get(GROUP_ATOM_INDEX_PROP) or
atom.property.get(GROUP_NUMBER_PROP)):
raise ValueError(
'Backwards compatibility is supported but not here when '
'allowing partial overlaps.')
for group_name, group_number in zip(get_group_names(atom),
get_group_numbers(atom)):
smarts_group_data = SMARTSGroupData(name=group_name,
number=group_number)
smarts_group_data_dict[smarts_group_data].append(idx)
return smarts_group_data_dict
[docs] def isAlreadyMatched(self, match, smarts_group_matches):
"""
Check if atoms have already been matched to the same SMARTS pattern.
This allows creating a group from some atoms already matched to some
group and other atoms already matched to some other group but prevents
creating a group that is a sub-group of another group
:param match: The list of atom indices that belong to the SMARTS match
:type match: list
:param smarts_group_matches: Dictionary where the key is a
unique SMARTS pattern and the value is the list of atom indices that
belong to the group
:type smarts_group_matches: dict
:rtype: bool
:return: Whether the match has already been matched to same SMARTS
"""
for idxs in smarts_group_matches.values():
if set(match) == set(idxs):
return True
return False
[docs] def onlyDifferByAHydrogen(self, struct, match, smarts_group_matches):
"""
Check all atoms except a single H atom have already been matched to the
same SMARTS group. This prevents matching a terminal methyl 3
times as is the case for a polymer head or tail monomer with a
terminating hydrogen
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to which the SMARTS match belongs to
:param match: The list of atom indices that belong to the SMARTS match
:type match: list
:param smarts_group_matches: Dictionary where the key is a
unique SMARTS pattern and the value is the list of atom indices that
belong to the group
:type smarts_group_matches: dict
:rtype: bool
:return: Whether all atoms except a single H atom have already been
matched
"""
for idxs in smarts_group_matches.values():
if len(idxs) + 1 == len(match):
diff_idxs = list(set(match).difference(idxs))
if (len(diff_idxs) == 1 and
struct.atom[diff_idxs[0]].atomic_number == 1):
return True
return False
[docs] def isOverlapAllowedInMatch(self, struct, match, smarts_group_data_dict):
"""
Determines if overlap is allowed for the current match.
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to which the SMARTS match belongs to
:param match: The list of atom indices that belong to the SMARTS match
:type match: list
:param smarts_group_data_dict: Dictionary where the key is a
SMARTSGroupData and the value is the list of atom indices that
belong to the group
:type smarts_group_data_dict: dict
:rtype: bool
:return: Whether overlapping is allowed for the current match
"""
# Combine groups with same name to avoid multiple overlapping of same
# smarts type
smarts_group_matches = defaultdict(list)
for smarts_data, overlap_atom_ids in smarts_group_data_dict.items():
smarts_group_matches[smarts_data.name].extend(overlap_atom_ids)
for match_dict in [smarts_group_matches, smarts_group_data_dict]:
if (self.isAlreadyMatched(match, match_dict) or
self.onlyDifferByAHydrogen(struct, match, match_dict)):
return False
return True
[docs] def addMatchToAtoms(self, struct, numbers_used, match):
"""
Adds match information to structure atom property
:type struct: `schrodinger.structure.Structure`
:param struct: The structure to which the SMARTS match belongs to
:type numbers_used: set
:param numbers_used: Each member is a number that has already been used
for a group and is unavailable. The number returned by this function
is added to the numbers_used set.
:param match: The list of atom indices from the match
:type match: list
"""
number = self.nextNumber(numbers_used)
for match_index, atom_index in enumerate(match, 1):
atom = struct.atom[atom_index]
append_property(atom, GROUP_NAMES_PROP, self.name)
append_property(atom, GROUP_ATOM_INDEXES_PROP, match_index)
append_property(atom, GROUP_NUMBERS_PROP, number)
[docs] def log(self, msg):
"""
Log message if logger is present
:param msg: The message to log
:type msg: str
"""
if self.logger:
self.logger(msg)
[docs] def match(self,
struct,
numbers_used,
backbone_atoms,
allow_partial_overlap=False):
"""
Find all the SMARTS groups matching the SMARTS pattern and mark them
with the appropriate properties
:param `schrodinger.structure.Structure` struct: The structure to
find SMARTS pattern match in
:param set numbers_used: A set of all the group numbers that have been
used and are unavailable
:param dict backbone_atoms: dictionary with key as molecule number and
backbone atoms index ordered in a list
:param bool allow_partial_overlap: whether atoms can belong to multiple
groups
"""
self.last_number, valid_matches = 0, 0
self.log(
f'Matching Group: name = {self.name}, pattern = {self.pattern}')
matches = self.orderedMatches(struct, backbone_atoms)
for match in matches:
try:
smarts_group_data_dict = self.getSmartsDictData(
struct, match, allow_partial_overlap)
except ValueError:
continue
if allow_partial_overlap and not self.isOverlapAllowedInMatch(
struct, match, smarts_group_data_dict):
continue
elif not allow_partial_overlap and smarts_group_data_dict:
# if not allowing partial overlaps then none of the atoms
# in the match are allowed to have already been matched
continue
valid_matches += 1
self.addMatchToAtoms(struct, numbers_used, match)
if valid_matches:
# last_number will be > 0 if there were any valid matches
struct.property[self.sprop] = self.pattern
struct.property[self.cprop] = valid_matches
msg = '%d matches found' % len(matches)
if matches:
msg += ', %d were tagged as new particles' % valid_matches
self.log(msg)