Source code for schrodinger.application.matsci.smartsutils

"""
Utilities for working with SMARTS patterns

Copyright Schrodinger, LLC. All rights reserved.
"""

import string
import warnings
from collections import namedtuple

from rdkit import Chem

from schrodinger.application.matsci import msutils
from schrodinger.structutils import analyze

SMARTSGroupData = namedtuple('SMARTSGroupData',
                             ['number', 'name', 'pattern', 'indexes'],
                             defaults=[None, None])

GROUP_STRUCT_PROP_BASE = 's_matsci_smarts_group_%s'
GROUP_STRUCT_COUNT = 'i_matsci_smarts_group_matches_%s'

# see MATSCI-9181 - keep these for backwards compatibility
GROUP_NAME_PROP = 's_matsci_smarts_group_name'
GROUP_ATOM_INDEX_PROP = 'i_matsci_smarts_group_atom_index'
GROUP_NUMBER_PROP = 'i_matsci_smarts_group_number'

GROUP_NAMES_PROP = 's_matsci_smarts_group_name'
GROUP_ATOM_INDEXES_PROP = 's_matsci_smarts_group_atom_index'
GROUP_NUMBERS_PROP = 's_matsci_smarts_group_number'

ATOM_PROP_DELIM = ','

VALID_NAME_PUNCTUATION = '_-()[]'
VALID_NAME_CHARS = {
    x for x in string.ascii_letters + string.digits + VALID_NAME_PUNCTUATION
}
VALID_NAME_DESC_TEXT = ('letters, numbers and the special characters %s' %
                        VALID_NAME_PUNCTUATION)


[docs]def defines_integer_atom_props(st): """ Return True if the given structure has SMARTS-related integer atom properties defined. These properties were originally used when each atom was forced to belong to a single SMARTS group. That condition has been relaxed but these properties are kept for backwards compatibility. :type st: schrodinger.structure.Structure :param st: the structure :rtype: bool :return: True if such properties are defined """ return msutils.has_atom_property(st, GROUP_ATOM_INDEX_PROP) or \ msutils.has_atom_property(st, GROUP_NUMBER_PROP)
[docs]def get_group_names(atom): """ Return a list of group names for the given atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :rtype: list :return: group names """ tokens = atom.property.get(GROUP_NAMES_PROP) if tokens: return tokens.split(ATOM_PROP_DELIM) return []
[docs]def get_group_atom_indices(atom): """ Return a list of group atom indices for the given atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :rtype: list :return: group atom indices """ index = atom.property.get(GROUP_ATOM_INDEX_PROP) if index: return [index] tokens = atom.property.get(GROUP_ATOM_INDEXES_PROP) if tokens: return list(map(int, tokens.split(ATOM_PROP_DELIM))) return []
[docs]def get_group_numbers(atom): """ Return a list of group numbers for the given atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :rtype: list :return: group numbers """ number = atom.property.get(GROUP_NUMBER_PROP) if number: return [number] tokens = atom.property.get(GROUP_NUMBERS_PROP) if tokens: return list(map(int, tokens.split(ATOM_PROP_DELIM))) return []
[docs]def append_property(atom, key, value): """ Append the given property to the atom. :type atom: schrodinger.structure._StructureAtom :param atom: the atom :type key: str :param key: the property key :type value: str :param value: the property value """ old_value = atom.property.get(key) if old_value: value = f'{old_value}{ATOM_PROP_DELIM}{value}' atom.property[key] = str(value)
[docs]def validate_name(name): """ Make sure name has the correct set of characters :type name: str :param name: The string to check :rtype: bool :return: True if name has no invalid characters, False if any characters are invalid """ return all(x in VALID_NAME_CHARS for x in name)
[docs]class SMARTSGroupError(Exception): """ Class for exceptions related to SMARTS group finding """
[docs]def delete_group_properties(struct): """ Delete all SMARTS group properties (structure and atom) from the structure :type struct: `schrodinger.structure.Structure` :param struct: The structure to delete properties from """ # Delete any structure properties that give group names or count sbase = GROUP_STRUCT_PROP_BASE % "" cbase = GROUP_STRUCT_COUNT % "" tuple_base = (sbase, cbase) for prop in list(struct.property): if prop.startswith(tuple_base): del struct.property[prop] # Delete any atom-based properties for atom in struct.atom: for prop in [ GROUP_NAMES_PROP, GROUP_ATOM_INDEXES_PROP, GROUP_NUMBERS_PROP, GROUP_ATOM_INDEX_PROP, GROUP_NUMBER_PROP ]: if prop in atom.property: del atom.property[prop]
[docs]def find_group_data(struct): """ Find an SMARTS group data on the structure :type struct: `schrodinger.structure.Structure` :param struct: The structure to find groups on :rtype: dict :return: A dictionary. Keys are smarts group numbers, values are `SMARTSGroupData` named tuples for the SMARTS group with that number :raise `SMARTSGroupError`: If something in the data is not consistent """ groups = {} for atom in struct.atom: # validate gindices = get_group_numbers(atom) if not gindices: continue names = get_group_names(atom) if not names: raise SMARTSGroupError('Atom has at least a single ' 'group index but no group names.') if len(gindices) != len(names): raise SMARTSGroupError('Atom belongs to %s groups but ' 'has %s group names.' % (len(gindices), len(names))) for name in names: pattern = struct.property.get(GROUP_STRUCT_PROP_BASE % name) if not pattern: raise SMARTSGroupError('There is no pattern specified ' 'for group name %s.' % name) # build groups for gindex, name in zip(gindices, names): if gindex in groups: # New atom for existing group group = groups[gindex] if name != group.name: raise SMARTSGroupError('At least two names are used for ' 'group number %d: %s and %s' % (gindex, name, group.name)) groups[gindex].indexes.append(atom.index) else: # New group pattern = struct.property.get(GROUP_STRUCT_PROP_BASE % name) inds = [atom.index] groups[gindex] = SMARTSGroupData(number=gindex, name=name, pattern=pattern, indexes=inds) # Test that all groups with the same name have the same number of atoms numat = {} for group in groups.values(): total = numat.setdefault(group.name, len(group.indexes)) if len(group.indexes) != total: raise SMARTSGroupError('Group %s has an inconsistent number of ' 'atoms. Found groups with this name ' 'containing %d and %d atoms.' % (group.name, len(group.indexes), total)) return groups
[docs]def get_rdkit_atoms(smarts): """ Return a collection of rdkit atoms for the given SMARTS. The return value has the length of a potential match group, for example for 'cc' this length is 2, for '[$([NH]([CH2])[CH2])]C' it is 2, for [n-0X2].[n-0X2] it is 2, etc., even though there might be any number of matches if the pattern was matched. :type smarts: str :param smarts: the SMARTS pattern :raise RuntimeError: if rdkit has a problem with the SMARTS :rtype: rdkit.Chem.rdchem._ROAtomSeq :return: the rdkit atoms """ # recursive and component SMARTS supported amol = Chem.MolFromSmarts(smarts) if amol is None: msg = ('Rdkit fails for the given SMARTS of ' '{smarts}.').format(smarts=smarts) raise RuntimeError(msg) return amol.GetAtoms()
[docs]def is_smarts_bonding_pair(smarts): """ Return True if the given SMARTS would match a bonding pair, False otherwise. :type smarts: str :param smarts: the SMARTS pattern :rtype: bool :return: True if the SMARTS would match a bonding pair, False otherwise """ # the smarts could be a component pattern like [n-0X2].[n-0X2] # which is a non-bonding pair atoms = get_rdkit_atoms(smarts) if len(atoms) != 2: return False bonds = atoms[0].GetBonds() return bool(bonds)
[docs]class SMARTSGroup(object): """ Handles matching and record-keeping for a SMARTS patter """
[docs] def __init__(self, name, pattern, logger=None): """ Create a SMARTSGroup object :type name: str :param name: The name of this SMARTS group :type pattern: str :param pattern: The SMARTS pattern for this group :raise ValueError: If name has invalid characters :raise ValueError: If the SMARTS is invalid """ if not validate_name(name): raise ValueError( 'The SMARTS group name, %s, should only contain %s' % (name, VALID_NAME_DESC_TEXT)) with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", category=DeprecationWarning, message=r"analyze\.validate_smarts") valid, msg = analyze.validate_smarts(pattern) if not valid: raise ValueError('The SMARTS pattern %s is not valid. Error was: ' '\n%s' % (pattern, msg)) self.name = name self.pattern = pattern self.sprop = GROUP_STRUCT_PROP_BASE % self.name self.cprop = GROUP_STRUCT_COUNT % self.name self.last_number = 0 self.logger = logger
[docs] def nextNumber(self, numbers_used): """ Get the next unused group number :type numbers_used: set :param numbers_used: Each member is a number that has already been used for a group and is unavailable. The number returned by this function is added to the numbers_used set. :rtype: int :return: The lowest available number. This number will have been added to the numbers_used set. """ while True: self.last_number += 1 if self.last_number not in numbers_used: numbers_used.add(self.last_number) return self.last_number
[docs] def orderedMatches(self, struct, backbone_atoms): """ Evaluate the smarts pattern matches, where matches are ordered to follow network sequence. Consider backbone atoms matches first in the sequence and the side chian matches are then ordered according to atom index. :param `schrodinger.structure.Structure` struct: The structure to delete properties from :param dict backbone_atoms: dictionary with key as molecule number and backbone atoms index ordered in a list :return list(list): List of list containing the smart pattern matches """ sorted_matches = [] # Find matches in for the smart pattern with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", category=DeprecationWarning, message=r"analyze\.evaluate_smarts") matches = analyze.evaluate_smarts_by_molecule(struct, self.pattern, canvas=False, matches_by_mol=True, unique_sets=True) for mol_num in sorted(matches): # get matches and backbone atoms for the molecule mol_matches = matches[mol_num] mol_back_atoms = backbone_atoms.get(mol_num, []) backbone_positions = {y: x for x, y in enumerate(mol_back_atoms)} # List to hold positions for matches where at least one atom is in # in the backbone pos_backbone_matches = [] # List to hold positions for matches where there is no match in the # backbone of the molecule index_sidechain_matches = [] for match in mol_matches: if not match: continue # Loop through atom indexes of the smarts match and store their # postion in the backbone match_bb_positions = [ backbone_positions[x] for x in match if x in backbone_positions ] if match_bb_positions: pos_backbone_matches.append( (min(match_bb_positions), match)) else: # In case none of the match atoms are in backbone, use the # lowest atom index of the match index_sidechain_matches.append((min(match), match)) for value_match in [pos_backbone_matches, index_sidechain_matches]: sorted_matches.extend([y for x, y in sorted(value_match)]) return sorted_matches
[docs] def match(self, struct, numbers_used, backbone_atoms, allow_partial_overlap=False): """ Find all the SMARTS groups matching the SMARTS pattern and mark them with the appropriate properties :param `schrodinger.structure.Structure` struct: The structure to delete properties from :param set numbers_used: A set of all the group numbers that have been used and are unavailable :param dict backbone_atoms: dictionary with key as molecule number and backbone atoms index ordered in a list :param bool allow_partial_overlap: whether atoms can belong to multiple groups """ self.last_number = 0 if self.logger: self.logger('Matching Group: name = %s, pattern = %s' % (self.name, self.pattern)) matches = self.orderedMatches(struct, backbone_atoms) msg = '%d matches found' % len(matches) valid_matches = 0 for match in matches: # some backwards compatibility is supported but not here # when allowing partial overlaps incompatible = False smarts_group_data_dict = {} for idx in match: atom = struct.atom[idx] if allow_partial_overlap and ( atom.property.get(GROUP_ATOM_INDEX_PROP) or atom.property.get(GROUP_NUMBER_PROP)): incompatible = True break for group_name, group_number in zip(get_group_names(atom), get_group_numbers(atom)): smarts_group_data = SMARTSGroupData(name=group_name, number=group_number) smarts_group_data_dict.setdefault(smarts_group_data, []).append(idx) if incompatible: continue # if not allowing partial overlaps then none of the atoms # in the match are allowed to have already been matched if not allow_partial_overlap and smarts_group_data_dict: continue # if allowing partial overlap and (1) all atoms have already been # matched to the same SMARTS group or (2) all atoms except a single # H atom have already been matched to the same SMARTS group, then # skip the match, the former allows creating a group from some atoms # already matched to some group and other atoms already matched to # some other group but prevents creating a group that is a sub-group # of another group, the latter prevents matching a terminal methyl 3 # times as is the case for a polymer head or tail monomer with a # terminating hydrogen if allow_partial_overlap: skip = False for idxs in smarts_group_data_dict.values(): if len(idxs) == len(match): skip = True elif len(idxs) + 1 == len(match): idx = list(set(match).difference(idxs))[0] if struct.atom[idx].atomic_number == 1: skip = True if skip: break if skip: continue valid_matches += 1 number = self.nextNumber(numbers_used) for match_index, atom_index in enumerate(match, 1): atom = struct.atom[atom_index] append_property(atom, GROUP_NAMES_PROP, self.name) append_property(atom, GROUP_ATOM_INDEXES_PROP, match_index) append_property(atom, GROUP_NUMBERS_PROP, number) if valid_matches: # last_number will be > 0 if there were any valid matches struct.property[self.sprop] = self.pattern struct.property[self.cprop] = valid_matches if matches: msg += ', %d were tagged as new particles' % valid_matches if self.logger: self.logger(msg)