"""
A collection of miscellaneous molecular-structure-related functions.
Copyright Schrodinger, LLC. All rights reserved.
"""
import collections
from collections import defaultdict
from itertools import chain
from itertools import zip_longest
from typing import TYPE_CHECKING
from typing import Dict
from typing import Iterable
from typing import Iterator
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from schrodinger import structure
from schrodinger.application.desmond import constants
from schrodinger.application.desmond import gcmc_utils
from schrodinger.application.desmond import util
from schrodinger.application.desmond.constants import CT_TYPE
from schrodinger.application.desmond.ffiostructure import FFIOStructure
from schrodinger.infra import mm
from schrodinger.protein import captermini
from schrodinger.structure import Structure
from schrodinger.structure import StructureReader
from schrodinger.structure import _Residue
from schrodinger.structure import _StructureAtom
from schrodinger.structure import create_new_structure
from schrodinger.structutils import analyze
from schrodinger.structutils import build
from schrodinger.thirdparty import rdkit_adapter
if TYPE_CHECKING:
from schrodinger.application.desmond.cms import Cms
# - These are the names normally used for protein backbone atoms in PDB files.
# - Note that the name of the hydrogen atom bonded to the backbone N atom has
# two options: 'H' or 'HN'.
BACKBONE_ATOM_NAME = [
"H",
"HN",
"HA",
"N",
"CA",
"C",
"O",
]
[docs]def get_sidechain_att_atom(residue: _Residue) \
-> Optional[Tuple[_StructureAtom, _StructureAtom]]:
"""
Given an amino acid residue `residue`, this function returns two atoms:
The first atom is the alpha carbon, and the 2nd is the beta atom belonging
to the sidechain and is bonded to the first atom.
If no attachment atoms cannot be identified, `None` is returned.
"""
# Finds out the attachment atoms: first the "CA" atom, then the "CB" atom
# (or the hydrogen atom for GLY).
for a in residue.atom:
if a.pdbname.strip().upper() == "CA":
c_atom = a
break
else:
return None
# DESMOND-2774 - 2HA is the name for Gly's beta hydrogen in PDB v2, and HA3
# in PDB v3.
beta_atom_names = ["2HA", "HA3"] \
if residue.pdbres.strip().upper() == "GLY" else ["CB"]
for a in residue.atom:
if a.pdbname.strip().upper() in beta_atom_names:
s_atom = a
break
else:
return None
return (c_atom, s_atom)
[docs]def get_residue_sitename(residue: _Residue):
"""
Given a `_Residue` object ('residue'), this function returns a string that
represents the name of the residue site.
The string is in the format C:RESXX[I] where C is the chain name, RES is
the residue type name, XX is the residue number in the chain, and I is
the insertion code. For example, a string could be "A:Tyr45[B]".
If there is no insertion code, the string will be reduced to C:RESXX,
e.g. "A:Tyr45".
"""
resname = residue.pdbres.strip().capitalize()
resnum = residue.resnum
chain = residue.chain
inscode = residue.inscode
if (chain != ' ' and inscode != ' '):
s = "%s:%s%d[%s]" % (
chain,
resname,
resnum,
inscode,
)
elif (chain != ' '):
s = "%s:%s%d" % (
chain,
resname,
resnum,
)
elif (inscode != ' '):
s = "%s%d[%s]" % (
resname,
resnum,
inscode,
)
else:
s = "%s%d" % (
resname,
resnum,
)
return s
[docs]def all_atoms(structures):
"""
Returns an iterator over all atoms in all given structures. The order of
iteration is the following: The atoms of the first structure, the atoms of
the second, and so on. If a `Cms' object is given, the order of iteration
is the atoms of the full-system CT, those of an internal copy of the
full-systemm CT within the `Cms` object [when writing the `Cms` object into
a file (or string), it's this copy that is outputted], the atoms of the
first component CT, of the 2nd component CT, and so on.
:type stuctures: Iterable[Structure] or Cms
"""
# Imports `Cms` here to avoid the circular dependency between the `struc`
# and `cms` modules.
from schrodinger.application.desmond.cms import Cms
if isinstance(structures, Cms):
c = structures
structures = [c, c._raw_fsys_ct] + c.comp_ct
return chain(*[st.atom for st in structures])
[docs]def selected_atoms(st: Structure,
atom_indices: Iterable[int] = None,
asl: str = None) -> Iterator[_StructureAtom]:
"""
Returns an iterator over the atoms selected either by `atom_indices` or
`asl`. The order of the iteration is the same as that of `atom_indices` or
the result of `analyze.evaluate_asl`.
It's an error to provide both `atom_indices` and `asl` arguments.
"""
assert bool(atom_indices) != bool(asl)
if asl:
atom_indices = analyze.evaluate_asl(st, asl)
for i in atom_indices:
yield st.atom[i]
[docs]def set_atom_properties(atoms, propnames, values=None):
"""
Set atom properties for the given atoms.
:type atoms: Iterable[_StructureAtom]
:param atoms: Atoms whose properties will be set.
:type propnames: List[str]
:param propnames: A list of atom property names
:param values: A list of atom property values. Each value is for the
corresponding property in `propnames`. If not given, the default value
will be `0`, `False`, and `""` for the numeric types, the boolean type,
and the string type, respectively.
"""
if values is None:
values = []
for name in propnames:
if name.startswith("i_") or name.startswith("r_"):
values.append(0)
elif name.startswith("b_"):
values.append(False)
elif name.startswith("s_"):
values.append("")
kvpairs = list(zip(propnames, values))
for a in atoms:
for k, v in kvpairs:
a.property[k] = v
[docs]def delete_atom_properties(st, propnames):
"""
Deletes atom properties for all atoms of the given structure(s).
:type st: Union[Structure, List[Structure]]
:type propnames: Union[str, Iterable[str]]
"""
if isinstance(st, Structure):
st = [st]
if isinstance(propnames, str):
propnames = [propnames]
for e in st:
for pn in propnames:
e.deletePropertyFromAllAtoms(pn)
[docs]def set_structure_properties(structures, keyvalues):
"""
Set the structure properties with the key-value pairs.
:param structures: a list of structures (or a single structure) for which to
set the specified properties
:type structures: Union[Structure, Iterable[Structure]]
:param keyvalues: The value must be either an iterable of `(key, value)`
tuples, or a `dict` object whose key values will be used to update the
structure properties. Note that the keys must be `str` and follow the
MAE property name format.
:type keyvalues: Optional[Union[dict, Iterable[Tuple[str, value]]]]
"""
if isinstance(structures, Structure):
structures = [structures]
if not isinstance(keyvalues, dict):
keyvalues = list(keyvalues)
for st in structures:
st.property.update(keyvalues)
[docs]def delete_ffio_ff(ffio_ct: FFIOStructure) -> Structure:
"""
Remove ffio_ff block by first converting FFIOStructure to Structure and then
deleting the block.
"""
ct = structure.Structure(ffio_ct.handle)
handle = mm.mmct_ct_m2io_get_unrequested_handle(ct.handle)
mm.m2io_delete_named_block(handle, constants.FFIO_FF)
return ct
[docs]def delete_structure_properties(structures, propnames):
"""
For a supplied list of structures and structure property keys, delete the
property values associated with those keys from each structure if possible.
:param structures: a list of structure (or a single structure) from which to
delete the specified properties
:type structures: Union[Structrue, Iterable[Structure]]
:param propnames: a list of structure property keys (or a single
structure property keys) that should be deleted from the supplied
structures
:type propnames: Union[str, Iterable[str]]
"""
if isinstance(structures, Structure):
structures = [structures]
if isinstance(propnames, str):
propnames = [propnames]
for st in structures:
for prop_key in propnames:
if prop_key in st.property:
del st.property[prop_key]
# DEPRECATED!!! Use `read_all_structures` below instead.
[docs]def read_all_ct(fname):
return read_all_structures(fname)
[docs]def read_all_structures(fname: str) -> List[Structure]:
"""
Read a `*.mae` file and return all CTs in a list.
"""
return list(StructureReader(fname))
[docs]def set_atom_reference_coordinates(
atom: _StructureAtom,
coords: List[float],
prop_names=constants.REFERENCE_COORD_PROPERTIES):
"""
:param atom: An atom whose properties will be updated to save the given
reference coordinates
:param coords: [x, y, z] coordinates for atom
:param prop_names: Property names for the reference coordinates.
Default REFERENCE_COORD_PROPERTIES.
"""
for i, p in enumerate(prop_names):
atom.property[p] = coords[i]
[docs]def get_atom_reference_coordinates(
atom: _StructureAtom,
prop_names=constants.REFERENCE_COORD_PROPERTIES) -> List[float]:
"""
Return the given atom's reference coordinates that have been previously
saved into the atom properties. If no reference coordinates was saved
before, return the atom's current coordinates.
:param prop_names: Property names for the reference coordinates.
Default REFERENCE_COORD_PROPERTIES.
"""
ret_val = [atom.x, atom.y, atom.z]
if set(prop_names).issubset(set(atom.property)):
for i, p in enumerate(prop_names):
ret_val[i] = atom.property[p]
return ret_val
[docs]def set_ct_reference_coordinates(ct: Structure,
prop_names=constants.REFERENCE_COORD_PROPERTIES
):
"""
Set reference coordinates for all atoms in `ct` using current coordinates.
:param prop_names: Property names for the reference coordinates.
Default REFERENCE_COORD_PROPERTIES.
"""
for atom in ct.atom:
set_atom_reference_coordinates(atom, atom.xyz, prop_names)
[docs]def get_reference_ct(
ct: Structure,
prop_names=constants.REFERENCE_COORD_PROPERTIES) -> Structure:
"""
Returns a copy of the input structure. If the input structure contains
previously saved reference coordinates, these coordinates are recovered in
the return structure.
:param prop_names: Property names for the reference coordinates.
Default REFERENCE_COORD_PROPERTIES.
"""
ref_ct = ct.copy()
for a in ref_ct.atom:
a.xyz = get_atom_reference_coordinates(a, prop_names)
return ref_ct
[docs]def fixup_duplicate_properties(cts: List[Structure]) -> List[Structure]:
"""
Return a copy of the original structures. In the returned structures,
properties that only differ by the type are deleted.
"""
cts = list(ct.copy() for ct in cts)
ct_properties = []
for ct in cts:
ct_properties.extend(ct.property.keys())
ct_props_to_remove = _get_duplicate_properties(ct_properties)
atom_properties = []
for ct in cts:
# Need to loop through all atoms since the property
# may be 'null' and not show up on the first atom
for a in ct.atom:
atom_properties.extend(a.property.keys())
atom_props_to_remove = _get_duplicate_properties(atom_properties)
delete_structure_properties(cts, ct_props_to_remove)
delete_atom_properties(cts, atom_props_to_remove)
return cts
def _get_duplicate_properties(typed_props: List[str]) -> List[str]:
"""
Given a list of typed properties, return a list of properties that only
differ by the type.
:param typed_props: List of typed properties, e.g.,
['i_sd_chiral_flag', 'b_sd_chiral_flag']
"""
duplicate_properties = []
# map from property name to typed properties
prop_name_to_typed_props = defaultdict(set)
for typed_prop in typed_props:
# typed_prop: t_prop_name
prop_name_to_typed_props[typed_prop[2:]].add(typed_prop)
for typed_props in prop_name_to_typed_props.values():
if len(typed_props) > 1:
typed_props.pop()
duplicate_properties.extend(typed_props)
return duplicate_properties
[docs]def get_res_ct_and_atom_idx(res: _Residue,
cap=False) -> Tuple[Structure, Dict[int, int]]:
"""
Given a `_Residue` object, extract the residue structure and
a map of from the original atom index to the residue fragment atom index.
:param res: Residue to extract
:param cap: Set to True to add capping groups, default is False.
These atoms are not included in the atom index.
"""
# Keep track of the original atom index
TMP_IDX = 'i_fep_tmp_idx'
for a in res.atom:
a.property[TMP_IDX] = a.index
# Extract the residue ct
res_ct = res.extractStructure(copy_props=True)
res_ct.title = res.pdbres
# Add cap if requested
if cap:
build.add_hydrogens(res_ct)
cap = captermini.CapTermini(res_ct, frag_min_atoms=0)
res_ct = cap.outputStructure()
# Map from original to residue atom index
atom_idx = {
a.property[TMP_IDX]: a.index
for a in res_ct.atom
if a.property.get(TMP_IDX) is not None
}
# Clean up the index
for a in res.atom:
del a.property[TMP_IDX]
return res_ct, atom_idx
[docs]def align_cap(ct0: Structure, ct1: Structure) -> None:
"""
Given two capped peptides, set the coordinates for the cap groups in `ct1`
to match those of `ct0`.
:param ct0: The reference structure.
:param ct1: The structure to modify in place.
"""
atom_to_coord = {}
for a in ct0.atom:
if a.pdbres.strip() in ['ACE', 'NMA']:
atom_to_coord[(a.pdbname, a.pdbres)] = a.xyz
for a in ct1.atom:
if a.pdbres.strip() in ['ACE', 'NMA']:
a.xyz = atom_to_coord[(a.pdbname, a.pdbres)]
# DEPRECATED!!! Use something like `util.str2hexid(ct.title)` instead.
[docs]def hash_title(ct: Structure) -> str:
"""
Hash a structure's title string and return the hash.
:param ct: Structure to get title from.
"""
return util.str2hexid(ct.title)
[docs]def struc_iter(*structures):
"""
Iterates over all structures passed in as the arguments. Containers of
`Structure` objects will be flatten out and iterated over. Non-`Structure`
objects will be skipped. For example:
struc_iter(st0, st1, None, [st2, st3, [st4, None, 1, "2", 3.0, st5]])
The iteration sequence will be `st0, st1, st2, st3, st4, st5`.
"""
for e in structures:
if isinstance(e, Structure):
yield e
elif isinstance(e, collections.abc.Iterable) and not isinstance(e, str):
yield from struc_iter(*e)
[docs]def struc_merge(*structures):
"""
Merges all given structures and returns a single `Structure` object.
For example:
struc_merge(st0, st1, None, [st2, st3, [st4, None, 1, "2", 3.0, st5]])
The returned object will be a merged structure containing `st0`, `st1`,
`st2`, `st3`, `st4`, and `st5`, in that order.
"""
merged = create_new_structure(0)
for e in structures:
if isinstance(e, Structure):
merged = structure.Structure(
mm.mmffld_extendCTwithCustomCharges(merged.handle, e))
elif isinstance(e, collections.abc.Iterable) and not isinstance(e, str):
merged = structure.Structure(
mm.mmffld_extendCTwithCustomCharges(merged.handle,
struc_merge(*e)))
return merged
[docs]class CompStruc:
__slots__ = ("solute", "membrane", "ion", "solvent", "cosolvent",
"crystal_water", "alchemical_water", "positive_salt",
"negative_salt", "receptor", "ligand", "fep_ref", "fep_mut",
"gcmc_solvent")
# NOTE: Order matters, additional properties should be appended to
# this __slots__.
# FIXME: Correctly assign ligand
# Joe's comment:
# It really depends on the fep type.
# For protein fep, the ligand would be a stand alone ligand structure
# and fep_ref, fep_mut contain the wildtype/mutant protein structures.
# For covalent fep, the asl "protein" will match, so ligand will be
# empty and the receptor will contain both structures.
# Whether this is ok depends on your use case.
# But you can't assume that ligands = fep_ref + fep_mut.
"""
Definitions of the components (tentative for now):
- solute CT_TYPE is "solute"
- membrane CT_TYPE is "membrane"
- ion CT_TYPE is "ion", if such structure is not defined, it
will be the sum of the "positive_salt" and the
"negative_salt" component structures.
- solvent CT_TYPE is "solvent" and any inactive solvent molecules have been removed
- gcmc_solvent CT_TYPE is "solvent" and all solvent molecules are included, even inactive ones
- cosolvent CT_TYPE is "cosolvent"
- crystal_water CT_TYPE is "crystal_water"
- alchemical_water The water structure in either the "fep_ref" or the
"fep_mut" component structure.
- positive_salt CT_TYPE is "positive_salt"
- negative_salt CT_TYPE is "negative_salt"
- receptor All "solute" structures that contain a "large" (more
than 5 residues) protein molecule.
- ligand All "solute" structures that are not "receptor". For
FEP systems, this component includes both the reference
and the mutant species.
- fep_ref One of the "ligand" structures whose structure-level
property `constants.FEP_FRAGNAME`'s value is `"none"`.
- fep_mut One of the "ligand" structures whose structure-level
property `constants.FEP_FRAGNAME`'s value is NOT
`"none"`.
Several things to note:
- The value of each component will be either `None`, or a single `Structure`
object, or a `tuple` of two (or more) `Structure` objects. If this causes
inconveniences of having to check the type, consider using the above
`struc_iter` and `struc_merge` functions, which will always return an
iterator of structures or a single merged structure.
- Some components may have overlaps. For example, the "receptor", "ref", and
"mut" component structures are also included in "solute".
- Some components may have more than one CTs, e.g., "solute", whereas some
components may originally be only part of one CT, e.g., "alchemical_water",
and now a new CT which is extracted from the original one.
- If there are two or more `Structure` objects in a component, the order of
these objects is guaranteed to be the same as the original.
- This class is similiar to namedtuple but its attributes can be set after
creating the object.
"""
[docs] def __init__(self, *args):
num_slots = len(self.__slots__)
for attr, arg in zip_longest(self.__slots__, args[:num_slots]):
setattr(self, attr, arg)
def __str__(self):
s = ""
for name, attr in zip(self.__slots__, self):
if attr:
s += f"{name}:\n"
for st in struc_iter(attr):
s += (
f" {st}, {st.title}: {st.mol_total} molecules, " +
f"{len(st.residue)} residues, {st.atom_total} atoms\n")
return s
def __repr__(self):
return str(self)
def __iter__(self):
for name in self.__slots__:
yield getattr(self, name)
[docs] def __len__(self):
return len(self.__slots__)
def __getitem__(self, index):
return getattr(self, self.__slots__[index])
def __setitem__(self, index, value):
setattr(self, self.__slots__[index], value)
[docs]def component_structures(model: "Cms") -> CompStruc:
def append(c: Optional[List], t: list):
return (c and (c + t)) or t
ret = CompStruc()
has_solvent = False
for st in model.comp_ct:
ct_type = st.property[CT_TYPE]
if ct_type == CT_TYPE.VAL.SOLUTE:
ret.solute = append(ret.solute, [st])
elif ct_type == CT_TYPE.VAL.SOLVENT:
has_solvent = True
ret.gcmc_solvent = append(ret.gcmc_solvent, [st])
elif ct_type == CT_TYPE.VAL.MEMBRANE:
ret.membrane = append(ret.membrane, [st])
elif ct_type == CT_TYPE.VAL.ION:
ret.ion = append(ret.ion, [st])
elif ct_type == CT_TYPE.VAL.COSOLVENT:
ret.cosolvent = append(ret.cosolvent, [st])
elif ct_type == CT_TYPE.VAL.POSITIVE_SALT:
ret.positive_salt = append(ret.positive_salt, [st])
elif ct_type == CT_TYPE.VAL.NEGATIVE_SALT:
ret.negative_salt = append(ret.negative_salt, [st])
elif ct_type == CT_TYPE.VAL.LIGAND:
ret.ligand = append(ret.ligand, [st])
elif ct_type == CT_TYPE.VAL.RECEPTOR:
ret.receptor = append(ret.receptor, [st])
if has_solvent:
active_model = gcmc_utils.remove_inactive_solvent(model,
copy_if_gcmc=True)
for st in active_model.comp_ct:
if st.property[CT_TYPE] == CT_TYPE.VAL.SOLVENT:
ret.solvent = append(ret.solvent, [st])
if not ret.ion:
if ret.positive_salt:
ret.ion = append(ret.ion, ret.positive_salt)
if ret.negative_salt:
ret.ion = append(ret.ion, ret.negative_salt)
if not ret.receptor and ret.solute:
ret.receptor = []
for st in ret.solute:
if analyze.evaluate_asl(st, "protein"):
# Excludes small peptide.
# 5 residue cutoff is chosen mainly because in protein FEPs
# the fragment cut out of the protein is practically 5
# residues maximum.
if len(st.residue) > 5:
ret.receptor.append(st)
if not ret.ligand and ret.solute:
ret.ligand = []
for st in ret.solute:
if st not in ret.receptor:
ret.ligand.append(st)
if not ret.fep_ref and ret.solute:
for st in ret.solute:
if st.property.get(constants.FEP_FRAGNAME) == "none":
ret.fep_ref = [st]
break
if not ret.fep_mut and ret.solute:
for st in ret.solute:
if st.property.get(constants.FEP_FRAGNAME) != "none":
ret.fep_mut = [st]
break
if not ret.alchemical_water:
# We do NOT assume which FEP structure the alchemical water is in,
# but it has to be in only one of the FEP structures.
for st in struc_iter(ret.fep_ref, ret.fep_mut):
indices = analyze.evaluate_asl(
st, f'a.{constants.ALCHEMICAL_WATER} > 0')
if indices:
ret.alchemical_water = [st.extract(indices)]
break
# Converts list into tuples, and reduces one-tuple to the element.
for i, val in enumerate(ret):
if val:
ret[i] = tuple(val) if len(val) > 1 else val[0]
return ret
[docs]def write_structures(strucs: Iterable[Optional[Structure]],
fname: Optional[str] = None,
*,
overwrite: bool = True) -> Optional[str]:
"""
Writes the given `Structure` objects into the same file (or string) in the
MAE format.
Similar functions:
- structure.write_ct_to_string
- structure.write_cts
:param structures: `Structure` objects to be written. `None`, are allowed
to be in the iterable and will be ignored. If the iterable is empty or
contains only `None`, for writing to a file this function will have no
side effects, for writing to a string an empty string will be returned.
:param fname: Name of the file to be written. If it's `None` or an empty
string, the structures will be written into a string.
:param overwrite: If true, this function will overwrite the file; otherwise,
it will append the structures into the existing file.
:return: Returns a string which the structures are written into, or `None`
if the structures are written into a file.
"""
strucs = filter(None, strucs)
if fname:
strucs = list(strucs)
if strucs: # Ensures empty `strucs` has no effects on files in disk.
with structure.MaestroWriter(fname, overwrite) as writer:
for st in strucs:
writer.append(st)
else:
return "\n".join(st.writeToString(structure.MAESTRO) for st in strucs)
[docs]def find_duplicate_titles(strucs: Iterable[Structure]) -> List[Tuple[int, str]]:
"""
Return a list of tuples, where each tuple gives the
duplicate index and title in `strucs`.
Return an empty list if no duplicate titles were found.
"""
seen_titles = set()
duplicated_titles = []
for index, st in enumerate(strucs, start=1):
if st.title not in seen_titles:
seen_titles.add(st.title)
else:
duplicated_titles.append((index, st.title))
return duplicated_titles
[docs]def get_fep_structures(strucs: Iterable[Structure],
struc_tags: List[str]) -> List[Structure]:
"""
Structures with a `FEP_STRUC_TAG` that match one of the `struc_tags` will be returned.
"""
return [
struc for struc in strucs
if struc.property.get(constants.FEP_STRUC_TAG) in struc_tags
]
[docs]def find_box(cts: Iterable[structure.Structure]) -> Optional[Tuple[float]]:
"""
Look for box dimension properties on any of the given structures.
:raise ValueError: if there are different box values on different
environment strucs
:return: a 9-tuple of box vector values or None if no box is found
"""
from schrodinger.application.desmond import cms
box = None
for ct in cts:
try:
cur_box = cms.get_box(ct)
except KeyError:
continue
if box is not None and not np.allclose(cur_box, box, atol=1E-3, rtol=0):
# check if cur_box differs from another box
raise ValueError("Conflicting boxes on input structures")
box = cur_box
return box
def _neutralize_atoms(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol:
"""
Neutralize the atoms in a molecule. Based on the RDKIT cookbook:
https://rdkit.org/docs/Cookbook.html
This function is under the
https://creativecommons.org/licenses/by-sa/4.0/legalcode.
This had been modified from the original form.
:param mol: A molecule that may or may not be charged.
:return: The neutralized molecule.
"""
# Get list of atoms with charge
pattern = Chem.MolFromSmarts(
"[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
at_matches = mol.GetSubstructMatches(pattern)
at_matches_list = [y[0] for y in at_matches]
for at_idx in at_matches_list:
# Get charged atom from the structure
atom = mol.GetAtomWithIdx(at_idx)
chg = atom.GetFormalCharge()
hcount = atom.GetTotalNumHs()
# Neutralize the atom
atom.SetFormalCharge(0)
# Update hydrogens based on the charge
atom.SetNumExplicitHs(hcount - chg)
atom.UpdatePropertyCache()
return mol
[docs]def get_reduced_smiles(st: structure.Structure) -> str:
"""
Read in a molecular structure and return a SMILES that has canonicalized the charge and tautomer state.
NOTE: Canonicalization is tied to a specific schrodinger release,
and can only be compared to other keys generated from this function.
Different versions may produce different results as bugs are fixed.
:param st: A structure of a molecule that could be in an arbitrary charge or tautomer state.
:return: The SMILES of the structure that has a canonical charge and tautomer.
"""
ORIG_IDX = 'ORIG_IDX'
enumerator = rdMolStandardize.TautomerEnumerator()
with rdkit_adapter.suppress_rdkit_log():
smile = analyze.generate_smiles(st, unique=True)
m = Chem.MolFromSmiles(smile)
# Save the chirality tag and formal charge of each atom
chirality_chrg = {
a.GetIdx(): (a.GetChiralTag(), a.GetFormalCharge())
for a in m.GetAtoms()
}
for a in m.GetAtoms():
a.SetIntProp(ORIG_IDX, a.GetIdx())
# Neutralize molecule atom by atom:
m = _neutralize_atoms(m)
# Get the atomic formal charges after neutralization
final_charge = {
int(a.GetProp(ORIG_IDX)): a.GetFormalCharge() for a in m.GetAtoms()
}
# Standardize the tautomer
m = enumerator.Canonicalize(m)
# Re-instate the atomic chirality before being neutralized, EXCEPT if the charge on the atom has changed.
for a in m.GetAtoms():
i = int(a.GetProp(ORIG_IDX))
chirality, orig_charge = chirality_chrg[i]
if (orig_charge - final_charge[i]) == 0:
a.SetChiralTag(chirality)
return Chem.MolToSmiles(m)