'''
Implements "link node"/"repearing unit" enumeration (see ENUM-253).
'''
import collections
import rdkit.Chem
from schrodinger.utils import log
from . import common
logger = log.get_output_logger(__name__)
#------------------------------------------------------------------------------#
#
# from https://chemaxon.com/marvin-archive/5.8.2/marvin/help/formats/mrv-doc-old.html
#
# <molecule
# id="sg1"
# role="SruSgroup"
# title="name"
# molID="m1"
# atomRefs="a1 a2 ... "
# correspondence="b1 b2 ... "
# bondList="b1 b2 ... "
# connect="hh|ht|eu ">
#
# is a "SRU S-group"; `name` is something like "1-2,5,100500"
# for `correspondence` see MDL M CRS, for `bondlist` see MDL M SBL,
# for `connect` see MDL M SCN
#
# * this module uses only role/title/atomRefs and connect (I could not
# get the remaining attributes populated by MarvinSketcher anyway)
#
# * assume that connect=="hh" implies "head-to-head" connections, other
# choices ["(ht)" and "eu" (either/unknown)] imply "head-to-tail"
#
# * require exactly two "crossing" bonds
LinknodeSgroup = collections.namedtuple(
'LinknodeSgroup', [
'atoms', # set of atom indices
'repeats', # list of possible repeat counts
'connect' # heat-to-tail vs heat-to-head linking
]) # yapf: disable
#------------------------------------------------------------------------------#
def _is_linknode_sgroup(g):
'''
:param g: MRV S-group as dictionary.
:type g: dict
'''
return g.get('role', '') == 'SruSgroup'
#------------------------------------------------------------------------------#
def _drop_linknode_sgroups(mol):
'''
Remove linknode S-groups from the S-groups associated with `mol`.
'''
common.set_sgroups(
mol, [g for g in common.get_sgroups(mol) if not _is_linknode_sgroup(g)])
#------------------------------------------------------------------------------#
def _parse_repeats(text, max_variations=1 << 10):
'''
Parses strings like '1-4,10,100-500' into lists of corresponding integers.
:param text: String to parse.
:type text: str
:param maxlen: Greatest acceptable number of the integers.
:type maxlen: int
:return: List of integers.
:rtype: list(int)
'''
repeats = set()
for chunk in text.split(','):
try:
toks = list(map(int, chunk.split('-')))
except ValueError:
logger.warning('unexpected number of repeats: %s', text)
continue
if len(toks) == 1:
lo, hi = toks[0], toks[0]
elif len(toks) == 2:
lo, hi = toks[0], toks[1]
else:
logger.warning('unexpected format of repeats: %s', text)
continue
for r in range(lo, hi + 1):
if len(repeats) >= max_variations:
logger.warning('ignoring all but %d repeat variations',
max_variations)
break
if r > 0:
repeats.add(r)
return sorted(repeats)
#------------------------------------------------------------------------------#
def _collect_linknodes(mol):
'''
Collects "linknode" specs from the molecule.
:param mol: Molecule to be considered.
:type mol: rdkit.Chem.Mol
:return: List of "linknodes".
:rtype: list(LinknodeSgroup)
'''
a2idx = common.get_atom_id_map(mol)
outcome = []
for g in common.get_sgroups(mol):
if _is_linknode_sgroup(g):
try:
atom_ids = set(g['atomRefs'].split())
connect = g['connect']
title = g['title']
except KeyError:
logger.warning('incomplete linknode S-group: %s', g)
continue
try:
atoms = set(a2idx[a] for a in atom_ids)
except KeyError:
logger.warning('undefined atom id(s) in linknode S-group: %s',
g)
continue
repeats = _parse_repeats(title)
if atoms and repeats:
outcome.append(
LinknodeSgroup(atoms=atoms,
repeats=repeats,
connect=connect))
return outcome
#------------------------------------------------------------------------------#
def _is_connected_subgraph(mol, atom_indices_set):
'''
Determines whether `atoms` form connected subgraph in `mol`.
:param mol: Molecule.
:type mol: rdkit.Chem.Mol
:param atom_indices_set: Set atom indices in `mol`.
:type atom_indices_set: set(int)
'''
for i in atom_indices_set:
atom = mol.GetAtomWithIdx(i)
for neighbor in atom.GetNeighbors():
if neighbor.GetIdx() in atom_indices_set:
break
else:
return len(atom_indices_set) == 1
return True
#------------------------------------------------------------------------------#
def _get_crossing_bonds(mol, atom_indices_set):
'''
Returns list of bonds that have one atom from `atom_indices_set`.
:param mol: Molecule.
:type mol: rdkit.Chem.Mol
:param atom_indices_set: Set of atom indices in `mol`.
:type atom_indices_set: set(int)
:return: List of bonds.
:rtype: list(rdkit.Chem.Bond)
'''
outcome = []
for bond in mol.GetBonds():
atom1_is_in = bond.GetBeginAtomIdx() in atom_indices_set
atom2_is_in = bond.GetEndAtomIdx() in atom_indices_set
if atom1_is_in != atom2_is_in:
outcome.append(bond)
return outcome
#------------------------------------------------------------------------------#
def _validate_linknodes(mol, linknodes):
'''
Validates the `linknodes`.
:param mol: Molecule.
:type mol: rdkit.Chem.Mol
:param nodes: List of "linknodes" to validate.
:type nodes: list(LinknodeSgroup)
:return: Validation success and error message.
:rtype: (bool, str)
'''
num_atoms = mol.GetNumAtoms()
for node in linknodes:
if not node.atoms:
return False, 'no atoms'
for i in node.atoms:
if not (i >= 0 and i < num_atoms):
return False, 'invalid atom indices'
if not _is_connected_subgraph(mol, node.atoms):
return False, 'atoms do not form connected subgraph'
if len(_get_crossing_bonds(mol, node.atoms)) != 2:
return False, 'unexpected number of crossing bonds'
if not all(r > 0 for r in node.repeats):
return False, 'non-positive repeat count(s)'
num_nodes = len(linknodes)
for i in range(num_nodes):
for j in range(i + 1, num_nodes):
if linknodes[i].atoms & linknodes[j].atoms:
return False, 'link nodes must not overlap'
return True, ''
#------------------------------------------------------------------------------#
def _clone_atom(src):
'''
Strives to create a copy of the `src` atom.
:param src: Atom to be copied.
:type src: rdkit.Chem.Atom
:return: Copy of the `src` atom.
:rtype: rdkit.Chem.Atom
'''
clone = rdkit.Chem.Atom(src)
clone.ClearProp(common.CML_ID_PROP)
return clone
#------------------------------------------------------------------------------#
def _clone_bond(rwmol, bond_idx, beg_atom_idx, end_atom_idx):
'''
Strives to create a bond in `rwmol` between
`beg_atom_idx` and `end_atom_idx` that is otherwise identical
to the one with index `bond_idx`.
:param rwmol: Molecule to be modified.
:type rwmol: rdkit.Chem.RWMol
:param bond_idx: Index of the bond to be copied.
:type bond_idx: int
:param beg_atom_idx: Index of the "begin" atom for the new bond.
:type beg_atom_idx: int
:param end_atom_idx: Index of the "end" atom for the new bond.
:type end_atom_idx: int
'''
original_bond = rwmol.GetBondWithIdx(bond_idx)
num_bonds = rwmol.AddBond(beg_atom_idx, end_atom_idx)
rwmol.ReplaceBond(num_bonds - 1, original_bond)
#------------------------------------------------------------------------------#
def _replicate(rwmol, atom_indices, head_to_head=False):
'''
Replicate atoms with indices from `atom_indices` once. Assumes that
the "repeating unit" defined by the `atom_indices` forms connected
subgraph in `mol` and had exactly two "crossing" bonds.
:param rwmol: Molecule to be modified.
:type rwmol: rdkit.Chem.RWMol
:param atom_indices: Indices of the atoms to be "replicated".
:type atom_indices: set(int)
:param head_to_head: Replicate "head-to-head" instead of "head-to-tail".
:type head_to_head: bool
'''
# atoms
o2n = dict() # "orig" -> "new" atom index map
for idx in atom_indices:
o2n[idx] = rwmol.AddAtom(_clone_atom(rwmol.GetAtomWithIdx(idx)))
# bonds
BondInfo = collections.namedtuple('BondInfo',
['idx', 'beg_atom_idx', 'end_atom_idx'])
bonds_to_copy = [] # beg_atom_idx and end_atom_idx in atom_indices
crossing_bonds = [] # beg_atom_idx in atom_indices, end_atom_idx is not
for bond in rwmol.GetBonds():
beg_atom_idx = bond.GetBeginAtomIdx()
beg_atom_cloned = beg_atom_idx in o2n
end_atom_idx = bond.GetEndAtomIdx()
end_atom_cloned = end_atom_idx in o2n
if beg_atom_cloned and end_atom_cloned:
bonds_to_copy.append(
BondInfo(bond.GetIdx(), beg_atom_idx, end_atom_idx))
elif beg_atom_cloned and not end_atom_cloned:
crossing_bonds.append(
BondInfo(bond.GetIdx(), beg_atom_idx, end_atom_idx))
elif end_atom_cloned and not beg_atom_cloned:
crossing_bonds.append(
BondInfo(bond.GetIdx(), end_atom_idx, beg_atom_idx))
for bond_info in bonds_to_copy:
_clone_bond(rwmol, bond_info.idx, o2n[bond_info.beg_atom_idx],
o2n[bond_info.end_atom_idx])
# head/tail atoms of the "replica"
new_ht_atoms = [o2n[bond_info.beg_atom_idx] for bond_info in crossing_bonds]
if head_to_head:
new_ht_atoms.reverse()
#
# [new_head]...[new_tail] \
# => X-[head]...[tail]-[new_head]...[new_tail]-Y
# X-[head]...[tail]-Y /
#
# here: X-[head] is crossing_bonds[0] (with beg_atom_idx == head),
# [tail]-Y is crossing_bonds[1] (with beg_atom_idx == tail)
#
# create [tail]-[new_head]
_clone_bond(rwmol, crossing_bonds[1].idx, crossing_bonds[1].beg_atom_idx,
new_ht_atoms[0])
# create [new_tail]-Y
_clone_bond(rwmol, crossing_bonds[0].idx, new_ht_atoms[1],
crossing_bonds[1].end_atom_idx)
# drop [tail]-Y
rwmol.RemoveBond(crossing_bonds[1].beg_atom_idx,
crossing_bonds[1].end_atom_idx)
#------------------------------------------------------------------------------#
def _apply_linknode(rwmol, atom_indices, num_repeats, head_to_head=False):
'''
Repeats link node defined by `atom_indices` `num_repeats - 1` times.
:param rwmol: R/W molecule.
:type rwmol: rdkit.Chem.RWMol
:param atom_indices: Set of atom indices in `rwmol` that define the
"linknode" (assumed to be validated: form connected subgraph in
`rwmol` and have exactly two crossing bonds).
:type atoms: set(int)
:param num_repeats: Number of repeats (positive integer).
:type num_repeats: int
:param head_to_head: Connect "head-to-head" vs "head-to-tail" otherwise.
:type head_to_head: bool
'''
for i in range(1, num_repeats):
_replicate(rwmol, atom_indices, head_to_head and
(num_repeats - i) % 2 == 1)
#------------------------------------------------------------------------------#
[docs]class LinknodeEnumerable(common.EnumerableMixin):
[docs] def __init__(self, mol, linknodes=None):
'''
:param mol: RDKit molecule.
:type mol: rdkit.Chem.Mol
:param linknodes: List of link nodes.
:type linknodes: list(LinknodeSgroup)
'''
if linknodes is None:
linknodes = _collect_linknodes(mol)
valid, msg = _validate_linknodes(mol, linknodes)
if not valid:
raise ValueError('LinknodeEnumerable: ' + msg)
self.linknodes = linknodes
self.mol = mol
[docs] def getExtents(self):
return [len(n.repeats) for n in self.linknodes]
[docs] def getRealization(self, idx):
'''
:param idx: "Index" of a realization.
:type idx: iterable over int
:return: RDKit molecule without "link nodes".
:rtype: rdkit.Chem.Mol
'''
if self.linknodes:
rwmol = rdkit.Chem.RWMol(self.mol)
_drop_linknode_sgroups(rwmol)
for (i, node) in zip(idx, self.linknodes):
_apply_linknode(rwmol,
node.atoms,
node.repeats[i],
head_to_head=(node.connect != '(ht)'))
return rwmol.GetMol()
else:
return self.mol
#------------------------------------------------------------------------------#