"""
This module contains the LigandAligner class, which uses a best practices
approach to flexibly align a set of ligand structures. The basic procedure
is as follows:
1. A Bemis-Murcko scaffold analysis is performed, and structures are clustered
according to their largest scaffold, where scaffolds are represented using
bond orders but not elemental types (i.e., fuzzy scaffolds).
2. Clusters are sorted by decreasing scaffold size, and the structures within
each cluster are sorted by decreasing size to establish a canonical order
for all subsequent work. The very first structure in the canonical list is
the primary reference, and subsequent structures in the list tend to be
aligned to reference structures that appear earlier in the list.
3. A scaffold-based distance matrix is computed, where the i, j entry is the
scaffold number (0, 1, etc.) of the largest scaffold that's shared by
structures i and j. Larger scaffolds have lower scaffold numbers, so the
scaffold number can be used as a distance. When constructing the matrix,
embedded matching is employed (e.g., imidazole matches benzimidazole) in
order to increase the size of the largest shared scaffold.
4. A minimum spanning tree is constructed from the distance matrix, with
structures being added to the tree using the canonical order described in
step 2. Each edge of the tree holds a structure, the reference to which it
is to be aligned, and the largest scaffold they share. The ordered edges of
this tree provide a prescription for sequentially aligning each structure
to a previously aligned reference with which it shares the largest possible
scaffold.
5. The ordered edges of the minimum spanning tree are traversed, with the
applicable alignment method (see AlignMethod below) being used to find the
best superposition of a given structure to its reference. Once a structure
is aligned to its reference, the aligned structure replaces the original
structure, and it is made available as a reference for subsequent structures
in the tree.
6. The structures for which core snapping was successful are post-processed
in an attempt to superimpose sidechains that are common to two or more
structures.
7. The aligned structures are returned in the order they were provided.
If a particular structure is designated as the reference to which all other
structures are to be aligned, steps 1-5 are appropriately modified or skipped.
Furthermore, if a SMARTS-based core is designated or MCS-based cores are
requested, Bemis-Murcko scaffold detection is disabled and the indicated core
is used in lieu of the largest shared Bemis-Murcko scaffold. In the case of
MCS, bond orders and elemental types must match, ring atoms may match only
other ring atoms, and complete rings must be matched.
If refinement is requested, additional conformers are generated for each
structure that was successfully snapped to its reference, with core atoms and
snapped sidechains being held fixed. The resulting conformers are used to
systematically increase the average in-place shape similarity over all pairs
of structures, whether snapped or not. Conformers are not generated for the
the primary reference, a user designated reference, or any structure that
was not snapped to its reference.
Copyright Schrodinger LLC, All Rights Reserved.
"""
import copy
import itertools
from enum import Enum
import numpy
from rdkit import Chem
from rdkit.Chem import rdFMCS
from schrodinger import structure
from schrodinger.infra import canvas
from schrodinger.infra import fast3d
from schrodinger.infra import mm
from schrodinger.infra import phase
from schrodinger.infra.canvas import ChmMmctAdaptor
from schrodinger.structutils import analyze
from schrodinger.thirdparty import rdkit_adapter
from schrodinger.utils import log
# Designation for using MCS-based cores:
MCS_CORES = "MCS"
# Conformational sampling method:
SAMPLING_METHODS = {
phase.CONF_SAMPLE_COARSE_NAME: "basic",
phase.CONF_SAMPLE_FINE_NAME: "default"
}
# Method used to align a particular structure to its reference:
AlignMethod = Enum("AlignMethod", "snap_core least_squares shape_based")
ALIGN_METHOD_DESCRIPTION = {
AlignMethod.snap_core: "snapped core",
AlignMethod.least_squares: "flexible least-squares core",
AlignMethod.shape_based: "flexible shape-based"
}
# Note that the preferred method is to snap the core onto the reference and
# conformationally sample the region outside the core. If the core cannot be
# snapped because of a stereo failure, open ring failure, etc., the entire
# structure is conformationally sampled and each conformer is aligned using
# least-squares superposition of the core atoms. If the structure shares no
# scaffolds with its reference, fully-flexible shape-based alignment is done.
# The best alignment, regardless of method, is the superposition that yields
# the highest all-atom shape similarity with atomic overlaps differentiated by
# elemental type.
# Aligned structures contain the following properties:
INPUT_STRUCTURE_NUMBER_PROP = "i_phase_Input_Structure_Number"
OUTPUT_STRUCTURE_NUMBER_PROP = "i_phase_Structure_Number"
REFERENCE_STRUCTURE_PROP = "i_phase_Reference_Structure"
SHAPE_SIM_PROP = "r_phase_Similarity_to_Reference"
ALIGN_METHOD_PROP = "s_phase_Alignment_Method"
ALIGN_REASON_PROP = "s_phase_Alignment_Reason"
CORE_SMARTS_PROP = "s_phase_Core_SMARTS"
# Atom coordinate tolerance for detecting common snapped cores:
ATOM_COORD_TOL = 1.0e-4
# Tolerance for detecting distorted bonds:
DISTORTED_BOND_TOL = 0.5
# Default maximum number of conformers to generate:
DEFAULT_MAX_CONFS = 1000
[docs]class LigandAlignerOptions:
"""
Holds options that control alignment of structures.
"""
[docs] def __init__(self):
# Whether to refine superpositons after the basic alignment procedure.
self.refine = False
# Whether to immediately fail with a RuntimeError if any structures
# contain multiple fragments or no rings. If False, those structures
# are quietly skipped.
self.fail_on_bad = False
# Whether to skip the post-processing step wherein chemically identical
# sidechains are snapped onto one another in pairs of structures that
# share a common snapped core.
self.ignore_sidechains = False
# Whether to align hydrogens on terminal rotatable heavy atoms (-CH3,
# -NH2, -OH, etc.) if the heavy atoms are part of the core. If False,
# those hydrogens will be aligned only if the conformational sampling
# yields a torsion angle that superimposes them.
self.align_terminal_hydrogens = True
# Conformational sampling method. Empirical evidence suggests that
# fast3d "basic" sampling, which corresponds to Phase coarse/rapid
# sampling, tends to yield the most consistent superpositions.
self.sampling_method = phase.CONF_SAMPLE_COARSE_NAME
# Maximum number of conformers to generate for each input structure.
# Must be > 0.
self.max_confs = DEFAULT_MAX_CONFS
# Whether to minimize conformers.
self.minimize_confs = False
# Whether to replace the primary reference structure with a sampled
# fast3d conformer that yields the best shape-based superposition to
# the input primary reference structure. This can sometimes result in
# better consensus alignments since the primary reference structure
# is then an actual fast3d conformer, and as such may produce better
# superpositions with fast3d conformers generated for other structures.
self.use_sampled_ref = False
# Tolerance for detection of close contacts in snapped structures. A
# tolerance of 0 turns off close contact detection.
self.close_contact_tol = phase.DEFAULT_CLOSE_CONTACT_TOL
[docs]class LigandAligner:
"""
Generates a consensus alignment for a set of ligands using the best
practices approach described at the top of this module.
"""
[docs] def __init__(self, options=None, logger=None):
"""
Constructor taking options that control the alignment behavior and
a logger for printed output.
:param options: Alignment options. See also setOptions.
:type options: LigandAlignerOptions
:param logger: Logger for printed output. Informative messages about
the alignment process are issued at the INFO level; additional
messages triggered by core snapping failures are issed at the
DEBUG level. See also setLogger.
:type logger: Logger
"""
self.setOptions(options or LigandAlignerOptions())
self.setLogger(logger or log.get_output_logger(__file__))
[docs] def align(self, sts, ref_index=None, core=""):
"""
Aligns the supplied structures using core snapping, flexible
least-squares core alignment, or flexible shape-based superposition.
See setOptions to modify the default alignment behavior.
:param sts: The structures to be aligned.
:type sts: list(structure.Structure)
:param ref_index: 1-based reference structure number. Overrides the
basic algorithm that chooses reference structures and
determines the order in which alignments are performed.
:type ref_index: int
:param core: If a reference structure is designated, core may be a
SMARTS string that matches the reference structure, or it may
be MCS_CORES.
:type core: str
:return: The aligned structures in the order they were provided, minus
any fragmented structures or structures lacking rings.
:rtype: list(structure.Structure)
"""
if core and not ref_index:
mesg = "Must designate a reference structure when specifying core"
raise RuntimeError(mesg)
# Initialize all data that will be created during the alignment
# workflow.
self._initAlignData(ref_index, core)
# Filter out bad structures, create ChmMol objects and, if
# applicable, rdkit.Mol objects for MCS.
sts_filtered, self._mols = self._setupAlign(sts)
if len(sts_filtered) < 2:
raise RuntimeError("Must supply 2 or more valid structures")
self._checkCore()
# Align the valid structures.
return self._align(sts_filtered)
[docs] def getLogger(self):
"""
Returns the logger used for output messages.
:return: The logger.
:rtype: Logger
"""
return self._logger
[docs] def getOptions(self):
"""
Returns a copy of the options that control the alignment process and
the level of printed output.
:return: The options.
:rtype: LigandAlignerOptions
"""
return copy.copy(self._options)
[docs] def setLogger(self, logger):
"""
Sets the logger to be used for printing messages during the
alignment process.
:param logger: The logger.
:type logger: Logger
"""
self._logger = logger
[docs] def setOptions(self, options):
"""
Sets options that control the alignment process and the level of
output printed.
:param options: The options.
:type options: LigandAlignerOptions
"""
if options.max_confs < 1:
raise ValueError("Maximum number of conformers must be > 0")
mode = SAMPLING_METHODS[options.sampling_method]
use_default_frag_lib = True
custom_frag = []
self._f3d = fast3d.Engine(mode, options.minimize_confs,
use_default_frag_lib, custom_frag)
self._f3d.setDesiredNumberOfConformers(options.max_confs)
self._options = copy.copy(options)
def _align(self, sts):
"""
Aligns the supplied structures using core snapping, flexible
least-squares core alignment, or flexible shape-based superposition.
Structures are modified directly.
:param sts: The structures to be aligned.
:type sts: list(structure.Structure)
:return: The aligned structures in the order they were provided.
:rtype: list(structure.Structure)
"""
if not self._core:
self._logger.info(f"\nFinding scaffolds for {len(sts)} structures")
finder = canvas.ChmScaffoldFinder(self._mols, False, False)
self._scaffold_count = finder.getScaffoldCount()
mesg = f"Total number of scaffolds = {self._scaffold_count}"
self._logger.info(mesg)
# Store fuzzy SMARTS for each scaffold.
for i in range(self._scaffold_count):
smarts = finder.getScaffold(i).getFuzzyScaffold(2)
# As per PHASE-2111, allow any aromatic nitrogen to match any
# other aromatic nitrogen to protect against inconsistent
# assignment of tautomeric states.
self._fuzzy_smarts.append(smarts.replace("[aH]", "[a]"))
elif self._core == MCS_CORES:
self._logger.info(f"\nFinding MCS cores for {len(sts)} structures")
self._findMcsCores(sts)
else:
self._fuzzy_smarts = [self._core]
if not self._ref_index:
# Create minimum spanning tree whose edges determine the order in
# which structures are aligned. The following prerequisite work is
# also done:
# 1. Clustering structures by largest scaffold.
# 2. Generation of a canonical structure order.
# 3. Construction of the scaffold-based distance matrix.
self._createMinimumSpanningTree()
else:
# Create a tree that traverses the input structures in the order
# provided, with the user-designated reference being first.
self._createLinearSpanningTree()
# Prepare primary reference structure.
prim_ref = self._mst_edges[0][0]
sts[prim_ref] = self._preparePrimaryRef(sts[prim_ref])
# Perform alignments in the order prescribed by the edges of the
# minimum/linear spanning tree.
self._logger.info("")
n = len(self._mst_edges)
for i, edge in enumerate(self._mst_edges):
stA = sts[edge[0]]
stB = sts[edge[1]]
smarts = ""
dAB = edge[2] # Index of largest shared scaffold or MCS
if dAB != -1:
smarts = self._fuzzy_smarts[dAB]
progress = f"({i + 1} of {n})"
mesg = f"Aligning {edge[1] + 1}-->{edge[0] + 1} {progress}"
if smarts:
mesg += f": Core SMARTS = {smarts}"
self._logger.info(mesg)
stB.property[REFERENCE_STRUCTURE_PROP] = edge[0] + 1
stB.property[CORE_SMARTS_PROP] = smarts
stB_aligned = self._getBestAlignment(stA, stB, smarts)
shape_sim = stB_aligned.property[SHAPE_SIM_PROP]
self._logger.info(" Shape similarity = %.6f" % shape_sim)
sts[edge[1]] = stB_aligned
if not self._options.ignore_sidechains:
sts = self._snapSideChains(sts)
if self._options.refine:
sts = self._refineAlignments(sts)
return sts
def _alignConformers(self,
stA,
stB,
stB_confs,
shape,
core_mapping=None,
append_confs=True):
"""
Finds the best superposition of the provided conformers of stB onto
stA using shape similarity. If core_mapping is provided, the B->A
mappings therein will be used to perform a least-squares alignment)
of the applicable core atoms. Otherwise, conformers will be aligned
using the standard PhpFastShape algorithm. Returns the highest shape
similarity found and the associated aligned conformer of stB.
:param stA: The fixed reference structure.
:type stA: structure.Structure
:param stB: The structure to be aligned onto stA.
:type stB: structure.Structure
:param stB_confs: fast3d conformers of stB. Each conformer is a tuple
whose second element contains the atomic coordinates as a
linear array of length 3 * num_atoms.
:type stB_confs: list(fast3d.Conformation)
:param shape: PhpFastShape object with stA as the reference.
:type shape: phase.PhpFastShape
:param core_mapping: Pairs of 1-based B->A core atom mappings.
:type core_mapping: list((int, int))
:param append_confs: If True, stB and stB_confs are screened.
Otherwise, only stB_confs are screened.
:type append_confs: bool
:return: Highest shape similarity and associated aligned structure.
:rtype: float, structure.Structure
"""
mappingA = None
mappingB = None
inplace = False
if core_mapping:
mappingB, mappingA = zip(*core_mapping)
inplace = True
result_best = [0.0]
stB_copy = stB.copy()
stB_best = None
first_conf = True
# Prepending [None] to stB_confs allows incoming stB coordinates to
# be used.
for stB_conf in [None] + stB_confs:
if stB_conf:
# Note that the shape of stB_conf[1] prevents it from being
# supplied directly to stB_copy.setXYZ.
mm.mmct_ct_set_all_xyz(stB_copy, stB_conf[1])
elif not append_confs:
continue
if core_mapping:
phase.align_least_squares(stA, mappingA, stB_copy, mappingB,
False)
if self._options.align_terminal_hydrogens:
phase.align_terminal_hydrogens(stA, stB_copy, core_mapping)
result = shape.computeShapeSimPy(stB_copy, first_conf, inplace)
first_conf = False
if result[0] > result_best[0]:
result_best = list(result)
stB_best = stB_copy.copy()
if not inplace:
shape.alignCtPy(stB_best.handle, result_best)
stB_best.property[SHAPE_SIM_PROP] = result_best[0]
return result_best[0], stB_best
def _alignLeastSquares(self, stA, stB, core_mappings):
"""
Generates conformers for stB, with stB->stA least-squares alignment
on the core atoms, and finds the conformer that yields the highest
shape similarity to the reference structure stA.
:param stA: The fixed reference structure.
:type stA: structure.Structure
:param stB: The structure to be aligned onto stA.
:type stB: structure.Structure
:param core_mappings: Lists of 1-based B-->A atom mappings that define
the various ways the core of stB is to be aligned to stA.
:type core_mappings: list(list((int, int)))
:return: Best alignment over all generated conformers of stB.
:rtype: structure.Structure
"""
shape = self._getShape(stA)
stB_confs = self._f3d.run(stB)
sim_best = 0.0
stB_aligned_best = None
core_mapping_best = None
for core_mapping in core_mappings:
sim, stB_aligned = self._alignConformers(stA, stB, stB_confs, shape,
core_mapping)
if sim > sim_best:
sim_best = sim
stB_aligned_best = stB_aligned
core_mapping_best = list(core_mapping)
st_number = stB.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1
self._least_squares_core_mappings[st_number] = core_mapping_best
return stB_aligned_best
def _alignShapes(self, stA, stB, append_confs=True):
"""
Generates conformers for stB and finds the conformer that yields
the best shape-based alignment to the reference structure stA.
:param stA: The fixed reference structure.
:type stA: structure.Structure
:param stB: The structure to be aligned onto stA.
:type stB: structure.Structure
:param append_confs: If True, stB and the generated conformers are
screened. Otherwise, only the conformers are screened.
:type append_confs: bool
:return: Best alignment over all generated conformers of stB.
:rtype: structure.Structure
"""
shape = self._getShape(stA)
return self._alignConformers(stA,
stB,
self._f3d.run(stB),
shape,
append_confs=append_confs)[1]
def _alignSnappedCore(self, stA, snappedB, core_mappings):
"""
Generates conformers for each snapped structure in snappedB, keeping
the core atoms frozen, and finds the conformer that yields the highest
shape similarity to the reference structure stA. If refinement is
turned on, this function also fills self._frozen_refinement_atoms with
a first approximation of the atoms to hold fixed during refinement. The
sets are typically expanded as rigid regions are grown when snapping
sidechains.
:param stA: The fixed reference structure.
:type stA: structure.Structure
:param snappedB: The snapped structures to be aligned onto stA.
:type snappedB: list(structure.Structure)
:param core_mappings: 1-based B->A core atom mappings for each
structure in snappedB.
:type core_mappings: list(list((int, int)))
:return: Best alignment over all generated conformers of snappedB.
:rtype: structure.Structure
"""
shape = self._getShape(stA)
frozen_prop = fast3d.Engine.FROZEN_ATOMS_PROPERTY_NAME
# Note: The presence of frozen_prop signals that the core has been
# snapped and should be held frozen. The core still needs to be
# realigned, however, because fast3d doesn't keep the conformers
# in the original frame of reference.
sim_best = 0.0
stB_aligned_best = None
core_mapping_best = None
for stB, core_mapping in zip(snappedB, core_mappings):
frozen_atoms = self._getFrozenAtoms(stB, core_mapping)
frozen_atoms_string = " ".join(str(i) for i in frozen_atoms)
stB_copy = stB.copy()
stB_copy.property[frozen_prop] = frozen_atoms_string
stB_confs = self._f3d.run(stB_copy)
sim, stB_aligned = self._alignConformers(stA, stB_copy, stB_confs,
shape, core_mapping)
if sim > sim_best:
sim_best = sim
stB_aligned_best = stB_aligned
core_mapping_best = list(core_mapping)
st_number = stB.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1
if not self._options.ignore_sidechains:
# Store best mapping for use in common core construction when
# snapping identical sidechains.
self._snapped_core_mappings[st_number] = core_mapping_best
if self._options.refine:
self._frozen_refinement_atoms[st_number] = set(
self._getFrozenAtoms(stB, core_mapping_best))
return stB_aligned_best
def _badCore(self, st1, st2, core1, core2):
"""
Returns True if core1 is too small or either core is disconnected.
:param st1: First structure.
:type st1: structure.Structure
:param st2: Second structure.
:type st2: structure.Structure
:param core1: 1-based core atom numbers in first structure.
:type core1: list(int)
:param core2: 1-based core atom numbers in second structure.
:type core2: list(int)
:return: True if core is bad.
:rtype: bool
"""
if len(core1) < 3:
return True
if not phase.is_connected_subgraph(st1.handle, core1):
return True
if not phase.is_connected_subgraph(st2.handle, core2):
return True
return False
def _checkCore(self):
"""
Raises a RuntimeError if a user-supplied core SMARTS doesn't match
the user-designated reference structure.
"""
smarts = self._core
if smarts and smarts != MCS_CORES:
mol = self._mols[self._ref_index - 1]
query = canvas.ChmQuery(smarts)
matcher = canvas.ChmQueryMatcher(True)
if not matcher.hasMatch(query, mol):
mesg = f'SMARTS "{smarts}" does not match reference structure'
raise RuntimeError(mesg)
def _clusterByScaffold(self):
"""
Clusters structures according to their largest scaffold.
"""
self._logger.info("\nClustering structures by largest scaffold")
self._storeFuzzyScaffoldsInMol()
already_clustered = set()
nmol = len(self._mols)
for iscaff in range(self._scaffold_count):
source_structures = []
for jmol in range(nmol):
if jmol not in already_clustered:
if iscaff in self._scaffolds_in_mol[jmol]:
source_structures.append(jmol)
already_clustered.add(jmol)
if source_structures:
self._clusters[iscaff] = source_structures
icluster = 0
for iscaff, source_structures in self._clusters.items():
icluster += 1
smarts = self._fuzzy_smarts[iscaff]
ss_string = ", ".join(str(i + 1) for i in source_structures)
self._logger.info(f"Cluster {icluster}:")
self._logger.info(f" Scaffold SMARTS = {smarts}")
self._logger.info(f" Structures: {ss_string}")
def _createLinearSpanningTree(self):
"""
Creates a surrogate for the minimum spanning tree when a reference
structure has been designated. That structure will appear in every
edge of the tree, and the order of the edges will follow the input
order of the other ligands.
"""
nmol = len(self._mols)
ref_pos = self._ref_index - 1
self._canonical_order = [ref_pos]
for i in range(nmol):
if i != ref_pos:
self._canonical_order.append(i)
if self._core:
if self._core == MCS_CORES:
dmatrix = self._getMcsMatrix()
self._scaffold_count = nmol
else:
dmatrix = self._getSmartsMatrix()
self._scaffold_count = 1
else:
self._storeFuzzyScaffoldsInMol()
dmatrix = self._getScaffoldMatrix()
for i in range(1, nmol):
ipos = self._canonical_order[i]
dij = dmatrix[ipos][ref_pos]
if dij == self._scaffold_count:
# No shared scaffold/SMARTS.
dij = -1
self._mst_edges.append([ref_pos, ipos, dij])
def _createMinimumSpanningTree(self):
"""
Creates a minimum spanning tree using Prim's algorithm.
The tree consists of edges [i, j, dij], where i and j are 0-based
structure numbers, and dij is the 0-based scaffold number of the
largest scaffold shared by the two structures. If they share no
scaffolds, dij will be set to -1.
"""
# Cluster compounds by the largest scaffold they contain.
self._clusterByScaffold()
# Sort structures by cluster, and then by decreasing structure
# size within each cluster. Clusters are already ordered by
# decreasing size of the associated scaffold.
size_ranks1 = [-mol.getHeavyAtomCount() for mol in self._mols]
size_ranks2 = [-mol.getAtomCount() for mol in self._mols]
size_ranks = list(zip(size_ranks1, size_ranks2))
ranks = []
for iscaff, source_structures in self._clusters.items():
for imol in source_structures:
ranks.append((iscaff, size_ranks[imol], imol))
self._canonical_order = [row[2] for row in sorted(ranks)]
dmatrix = self._getScaffoldMatrix()
nmol = len(self._mols)
in_mst = nmol * [0]
# Start with primary reference structure and build edges of mst by
# iteratively adding the closest structure not already in the mst.
primary_ref = self._canonical_order[0]
in_mst[primary_ref] = 1
mst_size = 1
big_dist = self._scaffold_count + 1
while mst_size < nmol:
dmin = big_dist
imin = None
jmin = None
# Always follow canonical order when adding edges so that larger
# structures with larger scaffolds tend to appear earlier in tree.
for i in self._canonical_order:
if in_mst[i]:
for j in self._canonical_order:
if not in_mst[j] and dmatrix[i][j] < dmin:
dmin = dmatrix[i][j]
imin = i
jmin = j
in_mst[jmin] = 1
mst_size += 1
self._mst_edges.append([imin, jmin, dmin])
# Correct edges that correspond to no shared scaffold. These are the
# edges for which flexible shape-based alignment must be done.
for edge in self._mst_edges:
if edge[2] == self._scaffold_count:
edge[0] = primary_ref
edge[2] = -1
def _filterMatches(self, matches, filter):
"""
Returns the rows in matches that contain the exact same atom numbers
as in filter.
:param matches: Lists of 1-based atom numbers.
:type matches: list(list(int))
:param filter: List of 1-based atom numbers.
:type filter: list(int)
:return: Matches containing the same atom numbers as in filter.
:rtype: list(list(int))
"""
filter_set = set(filter)
n = len(filter_set)
filtered_matches = []
for match in matches:
if len(filter_set.intersection(match)) == n:
filtered_matches.append(match)
return filtered_matches
def _findMcsCores(self, sts):
"""
Finds the MCS between the reference structure and each of the
supplied structures and stores the smarts in self._fuzzy_smarts,
in the same order as sts. An empty string is stored for the
reference structure as well as for any structure that fails
to yield an MCS of at least 3 atoms.
:param sts: The structures to be aligned.
:type sts: list(structure.Structure)
"""
ref_pos = self._ref_index - 1
rdkit_mol_ref = rdkit_adapter.to_rdkit(sts[ref_pos], implicitH=True)
chm_adaptor = canvas.ChmMmctAdaptor()
stereo = canvas.ChmMmctAdaptor.StereoFromAnnotation_Safe
chm_mol_ref = chm_adaptor.create(sts[ref_pos], stereo)
chm_matcher = canvas.ChmQueryMatcher(True)
for i, st in enumerate(sts):
if i == ref_pos:
self._fuzzy_smarts.append("")
else:
rdkit_mol = rdkit_adapter.to_rdkit(st, implicitH=True)
smarts = rdFMCS.FindMCS(
[rdkit_mol, rdkit_mol_ref],
ringMatchesRingOnly=True,
completeRingsOnly=True,
atomCompare=Chem.rdFMCS.AtomCompare.CompareElements,
bondCompare=Chem.rdFMCS.BondCompare.CompareOrder,
ringCompare=Chem.rdFMCS.RingCompare.StrictRingFusion,
timeout=10).smartsString
# Ensure that the RDKit SMARTS is legal and matches at least
# 3 atoms in the equivalent ChmMol, so that we know it's safe
# to use the SMARTS for subsequent Canvas-based work.
chm_mol = chm_adaptor.create(st, stereo)
match_size = 0
try:
chm_query = canvas.ChmQuery(smarts)
for match in chm_matcher.match(chm_query, chm_mol):
match_size = match.getMatchedAtoms().size()
break
except:
pass
if match_size < 3:
smarts = ""
self._fuzzy_smarts.append(smarts)
def _getAlignCoreOptions(self, snapping_sidechains=False):
"""
Creates options for snapping core atoms. By default, close contacts,
open rings, and changes in stereochemistry are prohibited, but a
PhpException will not be thrown in the event of these failures, so that
mappings can be retrieved for least-squares alignment of the core. If
snapping_sidechains is True, open rings will be allowed, and the other
types of failures will trigger an exception. Open rings are allowed
when snapping sidechains because only the sidechain atoms will move,
so it doesn't matter if a given ring is not fully mapped.
:param snapping_sidechains: Whether sidechains are being snapped.
:type snapping_sidechains: bool
:return: Core snapping options.
:rtype: phase.PhpAlignCoreOptions
"""
options = phase.PhpAlignCoreOptions()
options.close_contact_tol = self._options.close_contact_tol
if not snapping_sidechains:
options.open_ring_policy = phase.PhpOpenRingPolicy_SKIP
options.stereo_change_action = phase.PhpStereoChangeAction_SAVE_MAPPINGS
options.close_contact_action = phase.PhpCloseContactAction_SAVE_MAPPINGS
options.open_ring_action = phase.PhpOpenRingAction_SAVE_MAPPINGS
return options
def _getAnchorAtoms(self, st, ring_atoms):
"""
Returns a set containing 1-based atom numbers for all atoms from
which a sidechain is allowed to be grown. This consists of ring
atoms bonded to exactly one exocyclic heavy atom.
:param st: Structure whose anchor atoms are sought.
:type st: structure.Structure
:param ring_atoms: 1-based set of all ring atom numbers in structure.
:type ring_atoms: set(int)
:return: Anchor atom numbers.
:rtype: set(int)
"""
anchor_atoms = set()
for ring_atom in ring_atoms:
num_exocyclic = 0
for neighbor in st.atom[ring_atom].bonded_atoms:
if neighbor.atomic_number != 1 and neighbor.index not in ring_atoms:
num_exocyclic += 1
if num_exocyclic == 1:
anchor_atoms.add(ring_atom)
return anchor_atoms
def _getBestAlignment(self, stA, stB, smarts=""):
"""
Finds the best flexible alignment of stB onto stB using one of the
following treatments of stB:
(1) Core snapping with flexible treatment beyond core.
(2) Full flexible treatment with least-squares alignment of core.
(3) Full flexible treatment with elemental shape-based alignment.
Method 1 is used if a SMARTS is provided and the core can be snapped
with no errors. Method 2 is used if core snapping results in errors.
Method 3 is used if no SMARTS is provided.
:param stA: The fixed reference structure
:type stA: structure.Structure
:param stB: The structure to be aligned onto stA
:type stB: structure.Structure
:param smarts: SMARTS string that represents the core
:type smarts: str
:return: Aligned version of stB.
:rtype: structure.Structure
"""
align_method = AlignMethod.snap_core
align_method_descr = ALIGN_METHOD_DESCRIPTION[align_method]
align_reason = ""
snappedB = []
core_mappings = []
if smarts:
align_core_options = self._getAlignCoreOptions()
try:
aligner = phase.PhpAlignCore(stA.handle, stB.handle, smarts,
align_core_options)
# We've asked that mappings be saved even if snapping fails.
core_mappings = aligner.getMappings()
# Check for failures.
if aligner.closeContactFailure():
align_reason = "creation of a close contact"
elif aligner.openRingFailure():
align_reason = "open ring in core mapping"
elif aligner.stereoFailure():
align_reason = "stereochemistry change"
if align_reason:
align_method = AlignMethod.least_squares
else:
# Get snapped core alignment for each mapping.
snappedB = [
structure.Structure(ct) for ct in aligner.getAlignedB()
]
# Keep only alignments without distorted bonds.
align_keep = self._removeDistoredStructures(stB, snappedB)
if align_keep:
snappedB = [snappedB[i] for i in align_keep]
core_mappings = [core_mappings[i] for i in align_keep]
else:
align_method = AlignMethod.least_squares
align_reason = "distorted bond in snapped structure"
except phase.PhpException as exc:
# An unknown type of failure. We don't have mappings, so we
# have to do flexible shape-based alignment.
self._logger.debug(f"Core snapping failed: {exc}")
align_method = AlignMethod.shape_based
align_reason = "unknown core snapping failure"
else:
# No core SMARTS available for this pair.
align_method = AlignMethod.shape_based
if not self._core:
align_reason = "no shared scaffolds"
elif self._core == MCS_CORES:
align_reason = "no MCS with reference"
else:
align_reason = "no SMARTS matches"
if align_reason:
align_method_descr = ALIGN_METHOD_DESCRIPTION[align_method]
mesg = "Performing {} alignment due to {}"
self._logger.info(mesg.format(align_method_descr, align_reason))
stB_aligned = None
if align_method == AlignMethod.snap_core:
stB_aligned = self._alignSnappedCore(stA, snappedB, core_mappings)
elif align_method == AlignMethod.least_squares:
stB_aligned = self._alignLeastSquares(stA, stB, core_mappings)
else:
stB_aligned = self._alignShapes(stA, stB)
stB_aligned.property[ALIGN_METHOD_PROP] = align_method_descr
stB_aligned.property[ALIGN_REASON_PROP] = align_reason
return stB_aligned
def _getFrozenAtoms(self, stB, core_mapping):
"""
Creates a 1-based list of atoms in stB to be held frozen while
generating conformers.
:param stB: Structure containing atoms to freeze.
:type stB: structure.Structure
:param core_mapping: Pairs of 1-based B->A core atom mappings.
:type core_mapping: list(int, int))
:return: List of frozen atoms.
:rtype: list(int)
"""
frozen_atoms = [atom_pair[0] for atom_pair in core_mapping]
if not self._options.minimize_confs:
# Expand list to include terminal atoms attached to frozen_atoms
# because they may end up with weird placement otherwise.
frozen_set = set(frozen_atoms)
n = len(frozen_atoms)
for i in range(n):
for neighbor in stB.atom[frozen_atoms[i]].bonded_atoms:
if neighbor.bond_total == 1 and neighbor not in frozen_set:
frozen_atoms.append(neighbor.index)
return frozen_atoms
def _getMcsMatrix(self):
"""
Surrogate for _getScaffoldMatrix when MCS-based cores are in use.
The [i][ref] and [ref][i] elements are set to i if structure i
yielded an MCS with the reference structure. Other elements are
set to -1.
:return: n by n matrix with values as described above.
:rtype: numpy.ndarray
"""
nmol = len(self._mols)
mcs_matrix = numpy.full((nmol, nmol), -1)
ref_pos = self._ref_index - 1
for i in range(nmol):
if self._fuzzy_smarts[i]:
mcs_matrix[i][ref_pos] = i
mcs_matrix[ref_pos][i] = i
return mcs_matrix
def _getScaffoldMatrix(self):
"""
Determines the largest scaffold shared by each pair of structures. This
can be thought of as a distance matrix because a value of 0 means that
the two structures share the largest identified scaffold and thus share
as much of their scaffold backbone as possible. Larger scaffold numbers
correspond to progressively smaller shared scaffolds and hence larger
distances. A distance of self._scaffold_count indicates the structures
share no scaffolds.
:return: n by n matrix with scaffold numbers in off-diagonal positions,
where n is the number of structures.
:rtype: numpy.ndarray
"""
nmol = len(self._mols)
nscaff = self._scaffold_count
scaffold_matrix = numpy.full((nmol, nmol), nscaff)
for i, scaffolds_i in enumerate(self._scaffolds_in_mol):
if scaffolds_i:
scaffold_matrix[i][i] = min(scaffolds_i)
for j in range(i):
scaffolds_j = self._scaffolds_in_mol[j]
shared_ij = scaffolds_i.intersection(scaffolds_j)
if shared_ij:
dij = min(shared_ij)
scaffold_matrix[i][j] = dij
scaffold_matrix[j][i] = dij
return scaffold_matrix
def _getSmartsMatrix(self):
"""
Surrogate for _getScaffoldMatrix when a core SMARTS has been supplied.
Off-diagonal elements are set to 0 if one of the structures is the
reference and the other structure matches the core SMARTS. Other
elements are set to 1.
:return: n by n matrix with 0/1 values as described above, where n is
the number of structures.
:rtype: numpy.ndarray
"""
nmol = len(self._mols)
smarts_matrix = numpy.ones((nmol, nmol), int)
query = canvas.ChmQuery(self._core)
matcher = canvas.ChmQueryMatcher(True)
ref_pos = self._ref_index - 1
for i, mol in enumerate(self._mols):
if i != ref_pos:
if matcher.hasMatch(query, mol):
smarts_matrix[i][ref_pos] = 0
smarts_matrix[ref_pos][i] = 0
return smarts_matrix
def _getShape(self, st_ref):
"""
Creates a PhpFastShape object to score alignments against the
provided reference structure.
:param st_ref: Reference structure.
:type st_ref: structure.Structure
:return: PhpFastShape object.
:rtype: phase.PhpFastShape
"""
atom_types = "element"
hydrogens = phase.HYDROGENS_ALL
weight_prop = ""
return phase.PhpFastShape(st_ref.handle, hydrogens, atom_types,
weight_prop)
def _getSideChain(self, anchor_atom, st, ring_atoms):
"""
Retrieves a sidechain connected to the indicated anchor atom from
a stored dictionary or grows it from scratch. The returned list of
sidechain atoms will be [-1] if a chain cannot be grown or if the
chain has been removed from the pool because it was already snapped.
:param anchor_atom: 1-based atom number from which the sidechain
originates. This atom is not included in the sidechain.
:type anchor_atom: int
:param st: The structure.
:type st: structure.Structure
:param ring_atoms: 1-based set of ring atom numbers in the structure.
:type ring_atoms: set(int)
:return: 1-based sidechain atom numbers followed by SMARTS.
:rtype: list(int), str
"""
st_number = st.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1
atoms = self._sidechain_atoms[st_number][anchor_atom]
smarts = self._sidechain_smarts[st_number][anchor_atom]
if not atoms:
atoms, smarts = self._growSideChain(st, anchor_atom, ring_atoms)
if not atoms:
atoms = [-1]
self._sidechain_atoms[st_number][anchor_atom] = atoms
self._sidechain_smarts[st_number][anchor_atom] = smarts
return atoms, smarts
def _growCore(self, st_i, st_j, core_i, core_j):
"""
Attempts to expand the provided lists of core atoms by adding pairs
of heavy atoms which have the same coordinates and which are bonded
to atoms already in the core. The lists of core atoms are modified
directly.
:param st_i: Structure associated with core_i.
:type st_i: structure.Structure
:param st_j: Structure associated with core_j.
:type st_j: structure.Structure
:param core_i: 1-based list of core atom numbers in st_i.
:type core_i: list(int)
:param core_j: 1-based list of core atom numbers in st_j.
:type core_j: list(int)
"""
excluded_i = (st_i.atom_total + 1) * [0]
for i in core_i:
excluded_i[i] = 1
excluded_j = (st_j.atom_total + 1) * [0]
for j in core_j:
excluded_j[j] = 1
while True:
# Look for atoms with the same coordinates bonded to current core.
len_prev = len(core_i)
for atom_pos in range(len_prev):
atom_i = core_i[atom_pos]
atom_j = core_j[atom_pos]
for bi in st_i.atom[atom_i].bonded_atoms:
if excluded_i[bi.index] or bi.atomic_number == 1:
continue
xyzi = (bi.x, bi.y, bi.z)
for bj in st_j.atom[atom_j].bonded_atoms:
if excluded_j[bj.index] or bj.atomic_number == 1:
continue
xyzj = (bj.x, bj.y, bj.z)
match = True
for coord_i, coord_j in zip(xyzi, xyzj):
if abs(coord_i - coord_j) > ATOM_COORD_TOL:
match = False
break
if match:
core_i.append(bi.index)
core_j.append(bj.index)
excluded_i[bi.index] = 1
excluded_j[bj.index] = 1
if len(core_i) == len_prev:
break
def _growSideChain(self, st, anchor_atom, ring_atoms):
"""
Attempts to grow a sidechain attached to anchor_atom, which is assumed
to be a ring atom in the core with exactly one attached exocyclic heavy
atom. Returns the list of heavy atoms in the sidechain followed by the
the corresponding SMARTS string. An empty list and string are returned
if the chain leads to another ring.
:param st: Structure for which the chain is to be grown.
:type st: structure.Structure
:param anchor_atom: 1-based atom number of chain attachment point.
:type anchor_atom: int
:param ring_atoms: 1-based set of all ring atom numbers in structure.
:type ring_atoms: set(int)
:return: 1-based atom numbers and SMARTS string for sidechain.
:rtype: list(int), str
"""
excluded = (st.atom_total + 1) * [0]
excluded[anchor_atom] = 1
sidechain_atoms = []
# Find first atom in sidechain.
for neighbor in st.atom[anchor_atom].bonded_atoms:
index = neighbor.index
if index not in ring_atoms and neighbor.atomic_number != 1:
sidechain_atoms.append(neighbor)
excluded[index] = 1
# Grow sidechain from first atom, traversing all branches.
while True:
len_prev = len(sidechain_atoms)
for i in range(len_prev):
atom = sidechain_atoms[i]
for neighbor in atom.bonded_atoms:
index = neighbor.index
if not excluded[index]:
if index in ring_atoms:
return [], ""
elif neighbor.atomic_number != 1:
sidechain_atoms.append(neighbor)
excluded[index] = 1
if len(sidechain_atoms) == len_prev:
break
smarts = analyze.generate_smarts_canvas(st, sidechain_atoms)
return [atom.index for atom in sidechain_atoms], smarts
def _initAlignData(self, ref_index, core):
"""
Initializes all member level data that gets regenerated with each call
to the align() function.
:param ref_index: 1-based reference structure number.
:type ref_index: int
:param core: A SMARTS string, MCS_CORES or empty.
:type core: str
"""
self._ref_index = ref_index
self._core = core
self._anchor_atom_sets = []
self._clusters = {}
self._fuzzy_smarts = []
self._mols = []
self._mst_edges = []
self._ring_atom_sets = []
self._scaffold_count = 0
self._scaffolds_in_mol = []
self._sidechain_atoms = []
self._sidechain_smarts = []
self._snapped_core_mappings = {}
self._least_squares_core_mappings = {}
self._ring_atom_sets = []
self._frozen_refinement_atoms = {}
def _initSideChains(self):
"""
Initializes dictionaries of sidechain atoms and SMARTS that are used
to avoid repetitive re-growing of sidechains from the same anchor
atoms.
"""
# Loop over anchor atoms in each structure.
for anchor_atoms in self._anchor_atom_sets:
sidechain_atoms = {}
sidechain_smarts = {}
for atom in anchor_atoms:
sidechain_atoms[atom] = []
sidechain_smarts[atom] = ""
self._sidechain_atoms.append(sidechain_atoms)
self._sidechain_smarts.append(sidechain_smarts)
def _matchCoreAtoms(self, st_i, st_j):
"""
Identifies the common core for a pair of structures from coordinates
of snapped atoms.
:param st_i: First structure in pair.
:type st_i: structure.Structure
:param st_j Second structure in pair.
:type st_j: structure.Structure
:return: Lists of 1-based core atom numbers for structures i and j.
:rtype: list(int), list(int)
"""
i = st_i.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1
j = st_j.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1
core_atoms_i = []
core_atoms_j = []
if self._core_atom_coords[i] and self._core_atom_coords[j]:
for row_i in self._core_atom_coords[i]:
for row_j in self._core_atom_coords[j]:
match = True
for k in range(1, 4):
if abs(row_i[k] - row_j[k]) > ATOM_COORD_TOL:
match = False
break
if match:
core_atoms_i.append(row_i[0])
core_atoms_j.append(row_j[0])
if core_atoms_i:
# Try to add coincident atoms that lie outside the defined
# cores.
self._growCore(st_i, st_j, core_atoms_i, core_atoms_j)
return core_atoms_i, core_atoms_j
def _matchSideChains(self, stA, stB, anchorA, anchorB):
"""
Returns all B-->A atom mappings for the sidechains attached to the
indicated anchor atoms. No mappings are returned if the side chains
don't exist or they are not equivalent.
:param stA: The fixed reference structure.
:type stA: structure.Structure
:param stB: The candidate structure to be aligned to the reference.
:type stB: structure.Structure
:param anchorA: 1-based atom number of anchor atom in stA.
:type anchorA: int
:param anchorB: 1-based atom number of anchor atom in stB.
:type anchorB: int
:return: lists of B-->A atom mappings.
:rtype: list(list((int, int)))
"""
ringsA = self._ring_atom_sets[stA.property[OUTPUT_STRUCTURE_NUMBER_PROP]
- 1]
atomsA, smartsA = self._getSideChain(anchorA, stA, ringsA)
if atomsA == [-1]:
return []
ringsB = self._ring_atom_sets[stB.property[OUTPUT_STRUCTURE_NUMBER_PROP]
- 1]
atomsB, smartsB = self._getSideChain(anchorB, stB, ringsB)
if atomsB == [-1] or len(atomsB) != len(atomsA):
return []
# At this point we know that the sidechains contain the same
# numbers of atoms, but we don't know that they're equivalent.
# And we can't assume the sidechains are the same/different simply
# because the SMARTS are the same/different. The only way to know
# for certain is to pick one SMARTS and match it against both
# sidechains.
matchesA = analyze.evaluate_smarts_canvas(stA,
smartsB,
uniqueFilter=False)
# Filter out matches that don't consist of the atoms in atomsA.
filteredA = self._filterMatches(matchesA, atomsA)
if not filteredA:
return []
# We only need one mapping to stB because we already requested
# all mappings to stA.
matchesB = analyze.evaluate_smarts_canvas(stB,
smartsB,
uniqueFilter=True)
filteredB = self._filterMatches(matchesB, atomsB)
core_mappings = []
firstA = atomsA[0]
firstB = atomsB[0]
# Keep only mappings for which firstA and firstB appear in the same
# position in their respective lists. These are the sidechain atoms
# that are bonded directly to the rings, so they must be mapped to
# each other.
for matchB in filteredB:
indexB = matchB.index(firstB)
for matchA in filteredA:
if matchA.index(firstA) == indexB:
core_mappings.append(list(zip(matchB, matchA)))
return core_mappings
def _preparePrimaryRef(self, st_ref):
"""
Prepares the primary reference structure, which includes finding the
most similar conformer if the option to use a primary reference
conformer is True.
:param st_ref: Input primary reference structure.
:type st_ref: structure.Structure
:return: The incoming reference structure or most similar conformer,
with the full set of CT-level properties.
:rtype: structure.Structure
"""
st_number = st_ref.property[OUTPUT_STRUCTURE_NUMBER_PROP]
self._logger.info(f"\nPrimary reference = structure {st_number}")
st_ref.property[SHAPE_SIM_PROP] = 1.0
align_method = ""
align_reason = ""
if self._options.use_sampled_ref:
mesg = "Replacing primary reference structure with most " + \
"similar conformer"
self._logger.info(mesg)
st_ref = self._alignShapes(st_ref, st_ref, append_confs=False)
align_method = ALIGN_METHOD_DESCRIPTION[AlignMethod.shape_based]
align_reason = "primary reference conformer requested"
sim_ref = st_ref.property[SHAPE_SIM_PROP]
self._logger.info("Conformer shape similarity = %.6f" % sim_ref)
st_ref.property[REFERENCE_STRUCTURE_PROP] = st_number
st_ref.property[ALIGN_METHOD_PROP] = align_method
st_ref.property[ALIGN_REASON_PROP] = align_reason
st_ref.property[CORE_SMARTS_PROP] = ""
return st_ref
def _refineAlignments(self, sts):
"""
Generates additional conformers for each structure that was snapped
to its reference, with core atoms and snapped sidechains held fixed,
and uses those conformers to systematically increase the average
in-place shape similarity over all pairs of structures.
:param sts: The aligned structures to be refined.
:type sts: list(structure.Structure)
:return: Refined alignments in the same order as the input structures.
:rtype: list(structure.Structure)
"""
refiner = phase.PhpAlignmentRefiner()
conf_options = phase.PhpConfOptions()
if self._options.sampling_method == phase.CONF_SAMPLE_FINE_NAME:
conf_options.setSamplingMethod(phase.CONF_SAMPLE_FINE)
conf_options.setMaxConfs(self._options.max_confs)
if self._options.minimize_confs:
conf_options.setForceField(phase.CONF_FF_OPLS_2005)
refiner.setConfOptions(conf_options)
refiner.setMaxIterations(len(sts)) # Number completed is usually small
if self._logger.getEffectiveLevel() == log.DEBUG:
refiner.setVerboseOutput(True)
frozen_atoms = []
refined_set = set()
for imol in range(len(sts)):
frozen = []
if imol in self._frozen_refinement_atoms:
frozen = list(self._frozen_refinement_atoms[imol])
refined_set.add(imol)
frozen_atoms.append(frozen)
self._logger.info("\nRefining alignments of snapped structures")
sts_refined = [
structure.Structure(ct) for ct in refiner.refine(sts, frozen_atoms)
]
# Need to update SHAPE_SIM_PROP.
self._updateSimilarities(sts_refined, refined_set)
return sts_refined
def _removeDistoredStructures(self,
stB,
snappedB,
bond_tol=DISTORTED_BOND_TOL):
"""
Identifies structures in snappedB with a bond length that differs by
more than bond_tol from the corresponding bond length in stB. Returns
a 0-based list of the structures that contain no such bonds.
:param stB: The original structure.
:type stB: structure.Structure
:param snappedB: List of snapped versions of stB which may contain
distored bonds.
:type snappedB: list(structure.Structure)
:return: 0-based list of structures with no distorted bonds.
:rtype: list(int)
"""
sts_to_keep = []
for i, st_snapped in enumerate(snappedB):
distorted_bond = False
for bond_ref, bond_snapped in zip(stB.bond, st_snapped.bond):
if abs(bond_ref.length - bond_snapped.length) > bond_tol:
distorted_bond = True
break
if not distorted_bond:
sts_to_keep.append(i)
return sts_to_keep
def _setupAlign(self, sts):
"""
Does setup for alignment of the provided structures, which includes
checking for multiple fragments and the absence of rings. Filters
out such structures if the option to skip bad structures is True.
Raises a RuntimeError otherwise.
:param sts: The structures to be aligned.
:type sts: list(structure.Structure)
:return: Copies of the incoming structures, minus any bad structures,
ChmMol objects created from those structures and, if MCS
cores are requested, rdkit.Mol objects from those structures
:rtype: list(structure.Structure), list(ChmMol)
"""
adaptor = ChmMmctAdaptor()
mols = []
keep = []
fragmented = []
no_rings = []
# Don't require rings to be present if a core has been specified.
require_rings = not self._core
for i, st in enumerate(sts):
if len(st.molecule) > 1:
fragmented.append(i)
else:
mol = adaptor.create(st,
ChmMmctAdaptor.StereoFromAnnotation_Safe)
if require_rings and not mol.getRingCount():
no_rings.append(i)
else:
mols.append(mol)
keep.append(i)
if self._options.fail_on_bad:
if fragmented:
fs = ", ".join(str(i + 1) for i in fragmented)
mesg = "Found multiple disconnected fragments in the " + \
"following structures: {}"
raise RuntimeError(mesg.format(fs))
if no_rings:
nr = ", ".join(str(i + 1) for i in no_rings)
mesg = f"Found no rings in the following structures: {nr}"
raise RuntimeError(mesg)
# Make copies of the valid structures and add properties that hold
# the input and output structure numbers. For example, if there are
# 10 input structures and structures 3 and 7 are removed, the 8
# returned structures will have input structure numbers 1, 2, 4, 5,
# 6, 8, 9, 10 and output structure numbers 1, 2, 3, 4, 5, 6, 7, 8.
sts_out = []
# Reference structure number may shift or it could be lost.
found_ref = False
for number_out, number_in in enumerate(keep):
in_plus_1 = number_in + 1
out_plus_1 = number_out + 1
if self._ref_index and in_plus_1 == self._ref_index:
self._ref_index = out_plus_1
found_ref = True
st_out = sts[number_in].copy()
st_out.property[INPUT_STRUCTURE_NUMBER_PROP] = in_plus_1
st_out.property[OUTPUT_STRUCTURE_NUMBER_PROP] = out_plus_1
sts_out.append(st_out)
if self._ref_index and not found_ref:
mesg = "Reference structure contains multiple fragments"
if require_rings:
mesg += " or no rings"
raise RuntimeError(mesg)
return sts_out, mols
def _snapAndScore(self, stA, stB, core_mappings):
"""
Snaps stB to stA according to the B-->A atom mappings in core_mappings
and identifies the snapped version of stB which yields the highest
shape similarity to stA. If all attempts to snap the core fail, the
original stB is returned. If refinement is turned on, the stB atoms
from the best mapping are added to self._frozen_refinement_atoms. The
stA atoms are added as long as stA isn't the primary or designated
reference structure.
:param stA: The fixed reference structure.
:type stA: structure.Structure
:param stB: The structure to be snapped onto stA.
:type stB: structure.Structure
:param core_mappings: Lists of 1-based B-->A atom mappings that define
the various ways the core of stB is to be snapped to stA.
:type core_mappings: list(list((int, int)))
:return: A boolean indicating whether stB was successfully snapped,
and the highest scoring snapped version of stB, or the
incoming stB if snapping failed.
:rtype: bool, structure.Structure
"""
shape = self._getShape(stA)
inplace = True
align_core_options = self._getAlignCoreOptions(True)
try:
aligner = phase.PhpAlignCore(stA.handle, stB.handle, core_mappings,
align_core_options)
stB_snapped = [
structure.Structure(ct) for ct in aligner.getAlignedB()
]
mappings = aligner.getMappings()
sim_best = 0.0
stB_snapped_best = None
mapping_best = None
first_conf = True
for st, mapping in zip(stB_snapped, mappings):
if self._options.align_terminal_hydrogens:
phase.align_terminal_hydrogens(stA, st, mapping)
result = shape.computeShapeSimPy(st.handle, first_conf, inplace)
first_conf = False
if result[0] > sim_best:
sim_best = result[0]
stB_snapped_best = st
mapping_best = mapping
if self._options.refine:
stA_number = stA.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1
stB_number = stB.property[OUTPUT_STRUCTURE_NUMBER_PROP] - 1
addA = stA_number != self._mst_edges[0][0]
for atomB, atomA in mapping_best:
self._frozen_refinement_atoms[stB_number].add(atomB)
if addA:
self._frozen_refinement_atoms[stA_number].add(atomA)
return True, stB_snapped_best
except phase.PhpException as exc:
iA = stA.property[OUTPUT_STRUCTURE_NUMBER_PROP]
iB = stB.property[OUTPUT_STRUCTURE_NUMBER_PROP]
self._logger.debug(
f"Sidechain snapping failed for {iB}-->{iA}: {exc}")
return False, stB
def _snapSideChains(self, sts_aligned):
"""
Attempts to improve snapped core alignments by expanding cores to
include sidechains that are common to two or more structures. In
this treatment we don't use fuzzy matching, i.e., the sidechains
must be chemically identical.
:param sts_aligned: All structures from the initial alignment phase.
:type sts_aligned: list(structure.Structure)
:return: Re-snapped structures. If a structure doesn't get re-snapped,
the incoming structure is returned.
:rtype: list(structure.Structure)
"""
self._logger.info("\nAttempting to superimpose identical sidechains")
# For each snapped structure, store the coordinates of each snapped
# atom. We will use this data to determine the common core for each
# pair of snapped structures, even if that pair was never directly
# snapped onto one another.
self._storeCoreAtomCoords(sts_aligned)
# Compile sets of ring atoms and anchor atoms, the latter being
# potential attachment points for sidechains.
self._storeRingsAndAnchors(sts_aligned)
# Avoid repetitive re-growing of sidechains by storing them in
# dictionaries keyed by anchor atom.
self._initSideChains()
# Do all applicable pairwise comparisons in canonical order and re-snap
# any common sidechains.
sts_resnapped = [st.copy() for st in sts_aligned]
resnapped_set = set()
n = len(sts_aligned)
for i in range(1, n):
imol = self._canonical_order[i]
anchor_atoms_i = self._anchor_atom_sets[imol]
for j in range(i):
st_i = sts_resnapped[imol]
jmol = self._canonical_order[j]
st_j = sts_resnapped[jmol]
core_atoms_i, core_atoms_j = self._matchCoreAtoms(st_i, st_j)
# Skip this pair if the core is too small or disconnected.
if self._badCore(st_i, st_j, core_atoms_i, core_atoms_j):
continue
core_mapping = list(zip(core_atoms_i, core_atoms_j))
anchor_atoms_j = self._anchor_atom_sets[jmol]
# Attempt to grow and snap sidechains attached to each anchor
# atom in the respective common cores.
for atom_i, atom_j in zip(core_atoms_i, core_atoms_j):
if atom_i in anchor_atoms_i and atom_j in anchor_atoms_j:
sidechain_mappings = self._matchSideChains(
st_j, st_i, atom_j, atom_i)
if not sidechain_mappings:
continue
new_core_mappings = []
for mapping in sidechain_mappings:
# It's possible that _matchCoreAtoms already added
# one or more atoms from this sidechain simply
# because they were already coincident, so don't
# add any atoms that are already in the core.
new_core = list(core_mapping)
for atom_pair in mapping:
if atom_pair not in core_mapping:
new_core.append(atom_pair)
new_core_mappings.append(new_core)
smarts = self._sidechain_smarts[imol][atom_i]
mesg = "Attempting to re-snap {}-->{} with " + \
"sidechain {} attached to atoms {} and " + \
"{}\nCore mappings = {}"
self._logger.debug(
mesg.format(imol + 1, jmol + 1, smarts, atom_i,
atom_j, new_core_mappings))
# If refinement is turned on, a successful call to
# self._snapAndScore adds the st_i sidechain atoms to
# self._frozen_refinement_atoms. The st_j sidechain
# atoms are added if it's not the primary reference.
success, snapped_i = self._snapAndScore(
st_j, st_i, new_core_mappings)
if success:
mesg = "Successfully re-snapped {}-->{}: " + \
"Sidechain = {}, anchor atoms = {}, {}"
self._logger.info(
mesg.format(imol + 1, jmol + 1, smarts, atom_i,
atom_j))
st_i = snapped_i
sts_resnapped[imol] = snapped_i
resnapped_set.add(imol)
# Don't snap this sidechain again.
self._sidechain_atoms[imol][atom_i] = [-1]
if resnapped_set:
self._updateSimilarities(sts_resnapped, resnapped_set)
return sts_resnapped
def _storeCoreAtomCoords(self, sts_aligned):
"""
For each snapped structure, this function stores the coordinates of
each snapped atom.
:param sts_aligned: Aligned structures.
:type sts_aligned: list(structure.Structure)
"""
self._core_atom_coords = len(sts_aligned) * [[]]
for i, st in enumerate(sts_aligned):
if i in self._snapped_core_mappings:
atom_coords = []
core_mapping = self._snapped_core_mappings[i]
core_atoms = [atom_pair[0] for atom_pair in core_mapping]
for core_atom in core_atoms:
atom = st.atom[core_atom]
atom_coords.append((core_atom, atom.x, atom.y, atom.z))
self._core_atom_coords[i] = atom_coords
# The primary reference was never snapped to anything else, so just
# store coordinates of all heavy atoms.
iref = self._canonical_order[0]
st_ref = sts_aligned[iref]
atom_coords = []
for atom in st_ref.atom:
if atom.atomic_number != 1:
atom_coords.append((atom.index, atom.x, atom.y, atom.z))
self._core_atom_coords[iref] = atom_coords
def _storeFuzzyScaffoldsInMol(self):
"""
Matches the fuzzy scaffold SMARTS against each structure and stores
the scaffold index for each SMARTS that yields a match.
"""
queries = [canvas.ChmQuery(smarts) for smarts in self._fuzzy_smarts]
matcher = canvas.ChmQueryMatcher(True)
for mol in self._mols:
scaffolds = set()
for i, query in enumerate(queries):
if matcher.hasMatch(query, mol):
scaffolds.add(i)
self._scaffolds_in_mol.append(scaffolds)
def _storeRingsAndAnchors(self, sts):
"""
Compiles sets of ring atoms and anchor atoms for the provided
structures.
:param sts: The structures.
:type sts: list(structure.Structure)
"""
for st in sts:
ring_atoms = set(itertools.chain(*st.find_rings()))
self._ring_atom_sets.append(ring_atoms)
self._anchor_atom_sets.append(self._getAnchorAtoms(st, ring_atoms))
def _updateSimilarities(self, sts, subset):
"""
For each structure number in subset, the in-place shape similarity
to the corresponding reference structure is recalculated to reflect
snapping of sidechains or refinement, and the property SHAPE_SIM_PROP
is updated. It's worth noting that shape similarity to the reference
doesn't always increase because snapping of sidechains and refinement
aren't solely focused on increasing the similarity to the reference.
They are instead concerned with increasing the similarity between any
number of pairs of structures.
:param sts: All structures in the analysis. The SHAPE_SIM_PROP is
overwritten for the affected subset.
:type sts: list(structure.Structure)
:param subset: 0-based set of structure numbers to operate on.
:type subset: set(int)
"""
mesg = "\nRecalculating shape similarities to reference structure"
self._logger.info(mesg)
inplace = True
first_conf = True
ref_prev = None
shape = None
for i in sorted(subset):
stB = sts[i]
ref = stB.property[REFERENCE_STRUCTURE_PROP] - 1
if ref != ref_prev:
shape = self._getShape(sts[ref])
ref_prev = ref
sim_old = sts[i].property[SHAPE_SIM_PROP]
sim_new = shape.computeShapeSimPy(stB, first_conf, inplace)[0]
sts[i].property[SHAPE_SIM_PROP] = sim_new
mesg = "Structure %d: Old similarity = %.6f, new similarity = " + \
"%.6f"
self._logger.info(mesg % (i + 1, sim_old, sim_new))