Source code for schrodinger.application.desmond.struc

"""
A collection of miscellaneous molecular-structure-related functions.

Copyright Schrodinger, LLC. All rights reserved.
"""

import collections
from collections import defaultdict
from itertools import chain
from itertools import zip_longest
from typing import TYPE_CHECKING
from typing import Dict
from typing import Iterable
from typing import Iterator
from typing import List
from typing import Optional
from typing import Tuple

import numpy as np
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize

from schrodinger import structure
from schrodinger.application.desmond import constants
from schrodinger.application.desmond import gcmc_utils
from schrodinger.application.desmond import util
from schrodinger.application.desmond.constants import CT_TYPE
from schrodinger.application.desmond.ffiostructure import FFIOStructure
from schrodinger.infra import mm
from schrodinger.infra import mmcheck
from schrodinger.protein import captermini
from schrodinger.structure import Structure
from schrodinger.structure import StructureReader
from schrodinger.structure import _Residue
from schrodinger.structure import _StructureAtom
from schrodinger.structure import create_new_structure
from schrodinger.structutils import analyze
from schrodinger.structutils import build
from schrodinger.thirdparty import rdkit_adapter

if TYPE_CHECKING:
    from schrodinger.application.desmond.cms import Cms

# - These are the names normally used for protein backbone atoms in PDB files.
# - Note that the name of the hydrogen atom bonded to the backbone N atom has
#   two options: 'H' or 'HN'.
BACKBONE_ATOM_NAME = [
    "H",
    "HN",
    "HA",
    "N",
    "CA",
    "C",
    "O",
]


[docs]def get_sidechain_att_atom(residue: _Residue) \
    -> Optional[Tuple[_StructureAtom, _StructureAtom]]:
    """
    Given an amino acid residue `residue`, this function returns two atoms:
    The first atom is the alpha carbon, and the 2nd is the beta atom belonging
    to the sidechain and is bonded to the first atom.

    If no attachment atoms cannot be identified, `None` is returned.
    """
    # Finds out the attachment atoms: first the "CA" atom, then the "CB" atom
    # (or the hydrogen atom for GLY).
    for a in residue.atom:
        if a.pdbname.strip().upper() == "CA":
            c_atom = a
            break
    else:
        return None

    # DESMOND-2774 - 2HA is the name for Gly's beta hydrogen in PDB v2, and HA3
    # in PDB v3.
    beta_atom_names = ["2HA", "HA3"] \
                      if residue.pdbres.strip().upper() == "GLY" else ["CB"]
    for a in residue.atom:
        if a.pdbname.strip().upper() in beta_atom_names:
            s_atom = a
            break
    else:
        return None

    return (c_atom, s_atom)


[docs]def get_residue_sitename(residue: _Residue):
    """
    Given a `_Residue` object ('residue'), this function returns a string that
    represents the name of the residue site.

    The string is in the format C:RESXX[I] where C is the chain name, RES is
    the residue type name, XX is the residue number in the chain, and I is
    the insertion code. For example, a string could be "A:Tyr45[B]".

    If there is no insertion code, the string will be reduced to C:RESXX,
    e.g. "A:Tyr45".
    """
    resname = residue.pdbres.strip().capitalize()
    resnum = residue.resnum
    chain = residue.chain
    inscode = residue.inscode
    if (chain != ' ' and inscode != ' '):
        s = "%s:%s%d[%s]" % (
            chain,
            resname,
            resnum,
            inscode,
        )
    elif (chain != ' '):
        s = "%s:%s%d" % (
            chain,
            resname,
            resnum,
        )
    elif (inscode != ' '):
        s = "%s%d[%s]" % (
            resname,
            resnum,
            inscode,
        )
    else:
        s = "%s%d" % (
            resname,
            resnum,
        )
    return s


[docs]def all_atoms(structures):
    """
    Returns an iterator over all atoms in all given structures. The order of
    iteration is the following: The atoms of the first structure, the atoms of
    the second, and so on. If a `Cms' object is given, the order of iteration
    is the atoms of the full-system CT, those of an internal copy of the
    full-systemm CT within the `Cms` object [when writing the `Cms` object into
    a file (or string), it's this copy that is outputted], the atoms of the
    first component CT, of the 2nd component CT, and so on.

    :type stuctures: Iterable[Structure] or Cms
    """
    # Imports `Cms` here to avoid the circular dependency between the `struc`
    # and `cms` modules.
    from schrodinger.application.desmond.cms import Cms

    if isinstance(structures, Cms):
        c = structures
        structures = [c, c._raw_fsys_ct] + c.comp_ct
    return chain(*[st.atom for st in structures])


[docs]def selected_atoms(st: Structure,
                   atom_indices: Iterable[int] = None,
                   asl: str = None) -> Iterator[_StructureAtom]:
    """
    Returns an iterator over the atoms selected either by `atom_indices` or
    `asl`. The order of the iteration is the same as that of `atom_indices` or
    the result of `analyze.evaluate_asl`.
    It's an error to provide both `atom_indices` and `asl` arguments.
    """
    assert bool(atom_indices) != bool(asl)
    if asl:
        atom_indices = analyze.evaluate_asl(st, asl)
    for i in atom_indices:
        yield st.atom[i]


[docs]def set_atom_properties(atoms, propnames, values=None):
    """
    Set atom properties for the given atoms.

    :type  atoms: Iterable[_StructureAtom]
    :param atoms: Atoms whose properties will be set.
    :type  propnames: List[str]
    :param propnames: A list of atom property names
    :param values: A list of atom property values. Each value is for the
        corresponding property in `propnames`. If not given, the default value
        will be `0`, `False`, and `""` for the numeric types, the boolean type,
        and the string type, respectively.
    """
    if values is None:
        values = []
        for name in propnames:
            if name.startswith("i_") or name.startswith("r_"):
                values.append(0)
            elif name.startswith("b_"):
                values.append(False)
            elif name.startswith("s_"):
                values.append("")
    kvpairs = list(zip(propnames, values))
    for a in atoms:
        for k, v in kvpairs:
            a.property[k] = v


[docs]def delete_atom_properties(st, propnames):
    """
    Deletes atom properties for all atoms of the given structure(s).

    :type st: Union[Structure, List[Structure]]
    :type propnames: Union[str, Iterable[str]]
    """
    if isinstance(st, Structure):
        st = [st]

    if isinstance(propnames, str):
        propnames = [propnames]

    for e in st:
        for pn in propnames:
            # Ignore missing properties.
            try:
                mm.mmct_atom_property_delete(e.handle, pn, mm.MMCT_ATOMS_ALL)
            except mmcheck.MmException:
                continue


[docs]def rename_atom_property(st, old_name, new_name):
    """
    Renames an atom property's name from `old_name` to `new_name` for the given
    structure(s) `st`

    :type st: `Structure` or `list[Structure]`
    :type old_name: str
    :type new_name: str
    """
    if (old_name == new_name):
        return

    if isinstance(st, Structure):
        st = [st]

    for e in st:
        mm.mmct_atom_property_set(st.handle, new_name, mm.MMCT_ATOMS_ALL)
        for atom in e.atom:
            atom.property[new_name] = atom.property[old_name]

    delete_atom_properties(st, [old_name])


[docs]def set_structure_properties(structures, keyvalues):
    """
    Set the structure properties with the key-value pairs.

    :param structures: a list of structures (or a single structure) for which to
        set the specified properties
    :type structures: Union[Structure, Iterable[Structure]]
    :param keyvalues: The value must be either an iterable of `(key, value)`
        tuples, or a `dict` object whose key values will be used to update the
        structure properties. Note that the keys must be `str` and follow the
        MAE property name format.
    :type keyvalues: Optional[Union[dict, Iterable[Tuple[str, value]]]]
    """
    if isinstance(structures, Structure):
        structures = [structures]

    if not isinstance(keyvalues, dict):
        keyvalues = list(keyvalues)
    for st in structures:
        st.property.update(keyvalues)


[docs]def delete_ffio_ff(ffio_ct: FFIOStructure) -> Structure:
    """
    Remove ffio_ff block by first converting FFIOStructure to Structure and then
    deleting the block.
    """
    ct = structure.Structure(ffio_ct.handle)
    handle = mm.mmct_ct_m2io_get_unrequested_handle(ct.handle)
    mm.m2io_delete_named_block(handle, constants.FFIO_FF)
    return ct


[docs]def delete_structure_properties(structures, propnames):
    """
    For a supplied list of structures and structure property keys, delete the
    property values associated with those keys from each structure if possible.

    :param structures: a list of structure (or a single structure) from which to
        delete the specified properties
    :type structures: Union[Structrue, Iterable[Structure]]
    :param propnames: a list of structure property keys (or a single
        structure property keys) that should be deleted from the supplied
        structures
    :type propnames: Union[str, Iterable[str]]
    """
    if isinstance(structures, Structure):
        structures = [structures]

    if isinstance(propnames, str):
        propnames = [propnames]

    for st in structures:
        for prop_key in propnames:
            if prop_key in st.property:
                del st.property[prop_key]


# DEPRECATED!!! Use `read_all_structures` below instead.
[docs]def read_all_ct(fname):
    return read_all_structures(fname)


[docs]def read_all_structures(fname: str) -> List[Structure]:
    """
    Read a `*.mae` file and return all CTs in a list.
    """
    return list(StructureReader(fname))


[docs]def set_atom_reference_coordinates(
        atom: _StructureAtom,
        coords: List[float],
        prop_names=constants.REFERENCE_COORD_PROPERTIES):
    """
    :param atom: An atom whose properties will be updated to save the given
        reference coordinates

    :param coords: [x, y, z] coordinates for atom

    :param prop_names: Property names for the reference coordinates.
        Default REFERENCE_COORD_PROPERTIES.
    """
    for i, p in enumerate(prop_names):
        atom.property[p] = coords[i]


[docs]def get_atom_reference_coordinates(
        atom: _StructureAtom,
        prop_names=constants.REFERENCE_COORD_PROPERTIES) -> List[float]:
    """
    Return the given atom's reference coordinates that have been previously
    saved into the atom properties. If no reference coordinates was saved
    before, return the atom's current coordinates.

    :param prop_names: Property names for the reference coordinates.
        Default REFERENCE_COORD_PROPERTIES.
    """
    ret_val = [atom.x, atom.y, atom.z]
    if set(prop_names).issubset(set(atom.property)):
        for i, p in enumerate(prop_names):
            ret_val[i] = atom.property[p]
    return ret_val


[docs]def set_ct_reference_coordinates(ct: Structure,
                                 prop_names=constants.REFERENCE_COORD_PROPERTIES
                                ):
    """
    Set reference coordinates for all atoms in `ct` using current coordinates.

    :param prop_names: Property names for the reference coordinates.
        Default REFERENCE_COORD_PROPERTIES.
    """
    for atom in ct.atom:
        set_atom_reference_coordinates(atom, atom.xyz, prop_names)


[docs]def get_reference_ct(
        ct: Structure,
        prop_names=constants.REFERENCE_COORD_PROPERTIES) -> Structure:
    """
    Returns a copy of the input structure. If the input structure contains
    previously saved reference coordinates, these coordinates are recovered in
    the return structure.

    :param prop_names: Property names for the reference coordinates.
        Default REFERENCE_COORD_PROPERTIES.
    """
    ref_ct = ct.copy()
    for a in ref_ct.atom:
        a.xyz = get_atom_reference_coordinates(a, prop_names)
    return ref_ct


[docs]def fixup_duplicate_properties(cts: List[Structure]) -> List[Structure]:
    """
    Return a copy of the original structures. In the returned structures,
    properties that only differ by the type are deleted.
    """
    cts = list(ct.copy() for ct in cts)

    ct_properties = []
    for ct in cts:
        ct_properties.extend(ct.property.keys())
    ct_props_to_remove = _get_duplicate_properties(ct_properties)

    atom_properties = []
    for ct in cts:
        # Need to loop through all atoms since the property
        # may be 'null' and not show up on the first atom
        for a in ct.atom:
            atom_properties.extend(a.property.keys())

    atom_props_to_remove = _get_duplicate_properties(atom_properties)

    delete_structure_properties(cts, ct_props_to_remove)
    delete_atom_properties(cts, atom_props_to_remove)
    return cts


def _get_duplicate_properties(typed_props: List[str]) -> List[str]:
    """
    Given a list of typed properties, return a list of properties that only
    differ by the type.

    :param typed_props: List of typed properties, e.g.,
        ['i_sd_chiral_flag', 'b_sd_chiral_flag']
    """
    duplicate_properties = []

    # map from property name to typed properties
    prop_name_to_typed_props = defaultdict(set)
    for typed_prop in typed_props:
        # typed_prop: t_prop_name
        prop_name_to_typed_props[typed_prop[2:]].add(typed_prop)

    for typed_props in prop_name_to_typed_props.values():
        if len(typed_props) > 1:
            typed_props.pop()
            duplicate_properties.extend(typed_props)

    return duplicate_properties


[docs]def get_res_ct_and_atom_idx(res: _Residue,
                            cap=False) -> Tuple[Structure, Dict[int, int]]:
    """
    Given a `_Residue` object, extract the residue structure and
    a map of from the original atom index to the residue fragment atom index.

    :param res: Residue to extract
    :param cap: Set to True to add capping groups, default is False.
                These atoms are not included in the atom index.
    """
    # Keep track of the original atom index
    TMP_IDX = 'i_fep_tmp_idx'
    for a in res.atom:
        a.property[TMP_IDX] = a.index

    # Extract the residue ct
    res_ct = res.extractStructure(copy_props=True)
    res_ct.title = res.pdbres

    # Add cap if requested
    if cap:
        build.add_hydrogens(res_ct)
        cap = captermini.CapTermini(res_ct, frag_min_atoms=0)
        res_ct = cap.outputStructure()

    # Map from original to residue atom index
    atom_idx = {
        a.property[TMP_IDX]: a.index
        for a in res_ct.atom
        if a.property.get(TMP_IDX) is not None
    }

    # Clean up the index
    for a in res.atom:
        del a.property[TMP_IDX]

    return res_ct, atom_idx


[docs]def align_cap(ct0: Structure, ct1: Structure) -> None:
    """
    Given two capped peptides, set the coordinates for the cap groups in `ct1`
    to match those of `ct0`.

    :param ct0: The reference structure.
    :param ct1: The structure to modify in place.
    """
    atom_to_coord = {}
    for a in ct0.atom:
        if a.pdbres.strip() in ['ACE', 'NMA']:
            atom_to_coord[(a.pdbname, a.pdbres)] = a.xyz

    for a in ct1.atom:
        if a.pdbres.strip() in ['ACE', 'NMA']:
            a.xyz = atom_to_coord[(a.pdbname, a.pdbres)]


# DEPRECATED!!! Use something like `util.str2hexid(ct.title)` instead.
[docs]def hash_title(ct: Structure) -> str:
    """
    Hash a structure's title string and return the hash.
    :param ct: Structure to get title from.
    """
    return util.str2hexid(ct.title)


[docs]def struc_iter(*structures):
    """
    Iterates over all structures passed in as the arguments. Containers of
    `Structure` objects will be flatten out and iterated over. Non-`Structure`
    objects will be skipped. For example:

      struc_iter(st0, st1, None, [st2, st3, [st4, None, 1, "2", 3.0, st5]])

    The iteration sequence will be `st0, st1, st2, st3, st4, st5`.
    """
    for e in structures:
        if isinstance(e, Structure):
            yield e
        elif isinstance(e, collections.Iterable) and not isinstance(e, str):
            yield from struc_iter(*e)


[docs]def struc_merge(*structures):
    """
    Merges all given structures and returns a single `Structure` object.
    For example:

      struc_merge(st0, st1, None, [st2, st3, [st4, None, 1, "2", 3.0, st5]])

    The returned object will be a merged structure containing `st0`, `st1`,
    `st2`, `st3`, `st4`, and `st5`, in that order.
    """
    merged = create_new_structure(0)
    for e in structures:
        if isinstance(e, Structure):
            merged = structure.Structure(
                mm.mmffld_extendCTwithCustomCharges(merged.handle, e))
        elif isinstance(e, collections.Iterable) and not isinstance(e, str):
            merged = structure.Structure(
                mm.mmffld_extendCTwithCustomCharges(merged.handle,
                                                    struc_merge(*e)))
    return merged


[docs]class CompStruc:
    __slots__ = ("solute", "membrane", "ion", "solvent", "cosolvent",
                 "crystal_water", "alchemical_water", "positive_salt",
                 "negative_salt", "receptor", "ligand", "fep_ref", "fep_mut",
                 "gcmc_solvent")
    # NOTE: Order matters, additional properties should be appended to
    # this __slots__.

    # FIXME: Correctly assign ligand
    # Joe's comment:
    #     It really depends on the fep type.
    #     For protein fep, the ligand would be a stand alone ligand structure
    #     and fep_ref, fep_mut contain the wildtype/mutant protein structures.
    #     For covalent fep, the asl "protein" will match, so ligand will be
    #     empty and the receptor will contain both structures.
    #     Whether this is ok depends on your use case.
    #     But you can't assume that ligands = fep_ref + fep_mut.
    """
    Definitions of the components (tentative for now):

    - solute            CT_TYPE is "solute"
    - membrane          CT_TYPE is "membrane"
    - ion               CT_TYPE is "ion", if such structure is not defined, it
                        will be the sum of the "positive_salt" and the
                        "negative_salt" component structures.
    - solvent           CT_TYPE is "solvent" and any inactive solvent molecules have been removed
    - gcmc_solvent      CT_TYPE is "solvent" and all solvent molecules are included, even inactive ones
    - cosolvent         CT_TYPE is "cosolvent"
    - crystal_water     CT_TYPE is "crystal_water"
    - alchemical_water  The water structure in either the "fep_ref" or the
                        "fep_mut" component structure.
    - positive_salt     CT_TYPE is "positive_salt"
    - negative_salt     CT_TYPE is "negative_salt"
    - receptor          All "solute" structures that contain a "large" (more
                        than 5 residues) protein molecule.
    - ligand            All "solute" structures that are not "receptor". For
                        FEP systems, this component includes both the reference
                        and the mutant species.
    - fep_ref           One of the "ligand" structures whose structure-level
                        property `constants.FEP_FRAGNAME`'s value is `"none"`.
    - fep_mut           One of the "ligand" structures whose structure-level
                        property `constants.FEP_FRAGNAME`'s value is NOT
                        `"none"`.

    Several things to note:
    - The value of each component will be either `None`, or a single `Structure`
      object, or a `tuple` of two (or more) `Structure` objects. If this causes
      inconveniences of having to check the type, consider using the above
      `struc_iter` and `struc_merge` functions, which will always return an
      iterator of structures or a single merged structure.
    - Some components may have overlaps. For example, the "receptor", "ref", and
      "mut" component structures are also included in "solute".
    - Some components may have more than one CTs, e.g., "solute", whereas some
      components may originally be only part of one CT, e.g., "alchemical_water",
      and now a new CT which is extracted from the original one.
    - If there are two or more `Structure` objects in a component, the order of
      these objects is guaranteed to be the same as the original.
    - This class is similiar to namedtuple but its attributes can be set after
      creating the object.
    """

[docs]    def __init__(self, *args):
        num_slots = len(self.__slots__)
        for attr, arg in zip_longest(self.__slots__, args[:num_slots]):
            setattr(self, attr, arg)

    def __str__(self):
        s = ""
        for name, attr in zip(self.__slots__, self):
            if attr:
                s += f"{name}:\n"
                for st in struc_iter(attr):
                    s += (
                        f"  {st}, {st.title}: {st.mol_total} molecules, " +
                        f"{len(st.residue)} residues, {st.atom_total} atoms\n")
        return s

    def __repr__(self):
        return str(self)

    def __iter__(self):
        for name in self.__slots__:
            yield getattr(self, name)

[docs]    def __len__(self):
        return len(self.__slots__)

    def __getitem__(self, index):
        return getattr(self, self.__slots__[index])

    def __setitem__(self, index, value):
        setattr(self, self.__slots__[index], value)


[docs]def component_structures(model: "Cms") -> CompStruc:

    def append(c: Optional[List], t: list):
        return (c and (c + t)) or t

    ret = CompStruc()

    has_solvent = False
    for st in model.comp_ct:
        ct_type = st.property[CT_TYPE]
        if ct_type == CT_TYPE.VAL.SOLUTE:
            ret.solute = append(ret.solute, [st])
        elif ct_type == CT_TYPE.VAL.SOLVENT:
            has_solvent = True
            ret.gcmc_solvent = append(ret.gcmc_solvent, [st])
        elif ct_type == CT_TYPE.VAL.MEMBRANE:
            ret.membrane = append(ret.membrane, [st])
        elif ct_type == CT_TYPE.VAL.ION:
            ret.ion = append(ret.ion, [st])
        elif ct_type == CT_TYPE.VAL.COSOLVENT:
            ret.cosolvent = append(ret.cosolvent, [st])
        elif ct_type == CT_TYPE.VAL.POSITIVE_SALT:
            ret.positive_salt = append(ret.positive_salt, [st])
        elif ct_type == CT_TYPE.VAL.NEGATIVE_SALT:
            ret.negative_salt = append(ret.negative_salt, [st])
        elif ct_type == CT_TYPE.VAL.LIGAND:
            ret.ligand = append(ret.ligand, [st])
        elif ct_type == CT_TYPE.VAL.RECEPTOR:
            ret.receptor = append(ret.receptor, [st])

    if has_solvent:
        active_model = gcmc_utils.remove_inactive_solvent(model,
                                                          copy_if_gcmc=True)
        for st in active_model.comp_ct:
            if st.property[CT_TYPE] == CT_TYPE.VAL.SOLVENT:
                ret.solvent = append(ret.solvent, [st])

    if not ret.ion:
        if ret.positive_salt:
            ret.ion = append(ret.ion, ret.positive_salt)
        if ret.negative_salt:
            ret.ion = append(ret.ion, ret.negative_salt)

    if not ret.receptor and ret.solute:
        ret.receptor = []
        for st in ret.solute:
            if analyze.evaluate_asl(st, "protein"):
                # Excludes small peptide.
                # 5 residue cutoff is chosen mainly because in protein FEPs
                # the fragment cut out of the protein is practically 5
                # residues maximum.
                if len(st.residue) > 5:
                    ret.receptor.append(st)

    if not ret.ligand and ret.solute:
        ret.ligand = []
        for st in ret.solute:
            if st not in ret.receptor:
                ret.ligand.append(st)

    if not ret.fep_ref and ret.solute:
        for st in ret.solute:
            if st.property.get(constants.FEP_FRAGNAME) == "none":
                ret.fep_ref = [st]
                break

    if not ret.fep_mut and ret.solute:
        for st in ret.solute:
            if st.property.get(constants.FEP_FRAGNAME) != "none":
                ret.fep_mut = [st]
                break

    if not ret.alchemical_water:
        # We do NOT assume which FEP structure the alchemical water is in,
        # but it has to be in only one of the FEP structures.
        for st in struc_iter(ret.fep_ref, ret.fep_mut):
            indices = analyze.evaluate_asl(
                st, f'a.{constants.ALCHEMICAL_WATER} > 0')
            if indices:
                ret.alchemical_water = [st.extract(indices)]
                break

    # Converts list into tuples, and reduces one-tuple to the element.
    for i, val in enumerate(ret):
        if val:
            ret[i] = tuple(val) if len(val) > 1 else val[0]
    return ret


[docs]def write_structures(strucs: Iterable[Optional[Structure]],
                     fname: Optional[str] = None,
                     *,
                     overwrite: bool = True) -> Optional[str]:
    """
    Writes the given `Structure` objects into the same file (or string) in the
    MAE format.

    Similar functions:

    - structure.write_ct_to_string
    - structure.write_cts

    :param structures: `Structure` objects to be written. `None`, are allowed
        to be in the iterable and will be ignored. If the iterable is empty or
        contains only `None`, for writing to a file this function will have no
        side effects, for writing to a string an empty string will be returned.
    :param fname: Name of the file to be written. If it's `None` or an empty
        string, the structures will be written into a string.
    :param overwrite: If true, this function will overwrite the file; otherwise,
        it will append the structures into the existing file.

    :return: Returns a string which the structures are written into, or `None`
        if the structures are written into a file.
    """
    strucs = filter(None, strucs)
    if fname:
        strucs = list(strucs)
        if strucs:  # Ensures empty `strucs` has no effects on files in disk.
            with structure.MaestroWriter(fname, overwrite) as writer:
                for st in strucs:
                    writer.append(st)
    else:
        return "\n".join(st.writeToString(structure.MAESTRO) for st in strucs)


[docs]def find_duplicate_titles(strucs: Iterable[Structure]) -> List[Tuple[int, str]]:
    """
    Return a list of tuples, where each tuple gives the
    duplicate index and title in `strucs`.
    Return an empty list if no duplicate titles were found.
    """
    seen_titles = set()
    duplicated_titles = []
    for index, st in enumerate(strucs, start=1):
        if st.title not in seen_titles:
            seen_titles.add(st.title)
        else:
            duplicated_titles.append((index, st.title))
    return duplicated_titles


[docs]def get_fep_structures(strucs: Iterable[Structure],
                       struc_tags: List[str]) -> List[Structure]:
    """
    Structures with a `FEP_STRUC_TAG` that match one of the `struc_tags` will be returned.
    """
    return [
        struc for struc in strucs
        if struc.property.get(constants.FEP_STRUC_TAG) in struc_tags
    ]


[docs]def is_charged_or_formal_charged(st: structure.Structure) -> bool:
    return st.formal_charge != 0 or any(a.formal_charge != 0 for a in st.atom)


[docs]def find_box(cts: Iterable[structure.Structure]) -> Optional[Tuple[float]]:
    """
    Look for box dimension properties on any of the given structures.

    :raise ValueError: if there are different box values on different
        environment strucs
    :return: a 9-tuple of box vector values or None if no box is found
    """
    from schrodinger.application.desmond import cms
    box = None
    for ct in cts:
        try:
            cur_box = cms.get_box(ct)
        except KeyError:
            continue
        if box is not None and not np.allclose(cur_box, box, atol=1E-3, rtol=0):
            # check if cur_box differs from another box
            raise ValueError("Conflicting boxes on input structures")
        box = cur_box
    return box


def _neutralize_atoms(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol:
    """
    Neutralize the atoms in a molecule. Based on the RDKIT cookbook:
    https://rdkit.org/docs/Cookbook.html

    This function is under the
    https://creativecommons.org/licenses/by-sa/4.0/legalcode.

    This had been modified from the original form.

    :param mol: A molecule that may or may not be charged.

    :return: The neutralized molecule.
    """
    # Get list of atoms with charge
    pattern = Chem.MolFromSmarts(
        "[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
    at_matches = mol.GetSubstructMatches(pattern)
    at_matches_list = [y[0] for y in at_matches]
    for at_idx in at_matches_list:
        # Get charged atom from the structure
        atom = mol.GetAtomWithIdx(at_idx)
        chg = atom.GetFormalCharge()
        hcount = atom.GetTotalNumHs()

        # Neutralize the atom
        atom.SetFormalCharge(0)

        # Update hydrogens based on the charge
        atom.SetNumExplicitHs(hcount - chg)
        atom.UpdatePropertyCache()
    return mol


[docs]def get_reduced_smiles(st: structure.Structure) -> str:
    """
    Read in a molecular structure and return a SMILES that has canonicalized the charge and tautomer state.

    NOTE: Canonicalization is tied to a specific schrodinger release,
    and can only be compared to other keys generated from this function.
    Different versions may produce different results as bugs are fixed.

    :param st: A structure of a molecule that could be in an arbitrary charge or tautomer state.

    :return: The SMILES of the structure that has a canonical charge and tautomer.
    """
    ORIG_IDX = 'ORIG_IDX'
    enumerator = rdMolStandardize.TautomerEnumerator()

    with rdkit_adapter.suppress_rdkit_log():
        smile = analyze.generate_smiles(st, unique=True)
        m = Chem.MolFromSmiles(smile)

        # Save the chirality tag and formal charge of each atom
        chirality_chrg = {
            a.GetIdx(): (a.GetChiralTag(), a.GetFormalCharge())
            for a in m.GetAtoms()
        }
        for a in m.GetAtoms():
            a.SetIntProp(ORIG_IDX, a.GetIdx())

        # Neutralize molecule atom by atom:
        m = _neutralize_atoms(m)

        # Get the atomic formal charges after neutralization
        final_charge = {
            int(a.GetProp(ORIG_IDX)): a.GetFormalCharge() for a in m.GetAtoms()
        }

        # Standardize the tautomer
        m = enumerator.Canonicalize(m)
        # Re-instate the atomic chirality before being neutralized, EXCEPT if the charge on the atom has changed.
        for a in m.GetAtoms():
            i = int(a.GetProp(ORIG_IDX))
            chirality, orig_charge = chirality_chrg[i]
            if (orig_charge - final_charge[i]) == 0:
                a.SetChiralTag(chirality)

        return Chem.MolToSmiles(m)