Source code for schrodinger.livedesign.convert

import enum
from functools import partial
from typing import Generator
from typing import Union

import numpy as np
from rdkit import Chem
from rdkit import Geometry
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdmolops

from schrodinger.livedesign import substructure
from schrodinger.thirdparty import rdkit_adapter

ATOM_PROP_ATOM_LABEL = "atomLabel"
BOND_PROP_BOND_STEREO = "_MolFileBondStereo"
BOND_PROP_BOND_CONFIG = "_MolFileBondCfg"
MOL_PROP_ATTACHPT = "molAttchpt"
MOL_PROP_R_LABEL = "_MolFileRLabel"


[docs]class Format(enum.Enum):
    SDF = enum.auto()
    SMILES = enum.auto()
    CXSMILES = enum.auto()
    SMARTS = enum.auto()
    INCHI = enum.auto()


[docs]def convert(data: str, input_format: Format, output_format: Format) -> str:
    """
    Main entrypoint for converting one serialization to another. A few
    convience functions are provided below for common LD conversions.

    :param data: input text string
    :param input_format: expected format for input string
    :param output_format: desired format for output string
    :return: converted text string
    """
    if _is_reaction(data, input_format):
        return _from_rdkit_reaction(_to_rdkit_reaction(data,
                                                       format=input_format),
                                    format=output_format)
    else:
        return _from_rdkit_mol(_to_rdkit_mol(data, format=input_format),
                               format=output_format)


[docs]def sdf_to_rdkit(
    molblock: str
) -> Union[Chem.rdchem.Mol, Chem.rdChemReactions.ChemicalReaction]:
    """
    :param molblock: given SDF or RXN molblock
    :return: corresponding RDKit mol or reaction
    """
    if _is_reaction(molblock, Format.SDF):
        return _to_rdkit_reaction(molblock, format=Format.SDF)
    else:
        return _to_rdkit_mol(molblock, format=Format.SDF)


[docs]def rdkit_to_sdf(mol: Chem.rdchem.Mol, kekulize: bool = True) -> str:
    """
    :param input: given RDKit mol
    :param kekulize: whether to kekulize mol
    :return: corresponding SDF molblock
    """
    return _from_rdkit_mol(mol, format=Format.SDF, kekulize_sdf=kekulize)


[docs]def rdkit_to_cxsmiles(mol: Chem.rdchem.Mol) -> str:
    """
    :param mol: given RDKit mol
    :return: corresponding CXSMILES stripped of coordinates
    """
    return _from_rdkit_mol(mol, format=Format.CXSMILES)


[docs]def get_sd_reader(molblock: str) -> Generator[Chem.rdchem.Mol, None, None]:
    """
    :param molblock: string of SDF molblocks
    :return: generator that provides mols from SDF molblcoks
    """
    reader = Chem.SDMolSupplier()
    reader.SetData(molblock,
                   sanitize=False,
                   removeHs=False,
                   strictParsing=False)
    for mol in reader:
        yield _get_processed_mol(mol)


[docs]def atom_has_attachment_point(atom):
    return atom.HasProp(ATOM_PROP_ATOM_LABEL) and atom.GetProp(
        ATOM_PROP_ATOM_LABEL).startswith("_AP")


def _is_reaction(data: str, format: Format) -> bool:
    if format == Format.SDF:
        return data.startswith("$RXN")
    elif format in (Format.SMILES, Format.CXSMILES, Format.SMARTS):
        return ">>" in data
    else:
        return False  # Unsupported reaction format


def _to_rdkit_mol(data: str, format: Format) -> Chem.rdchem.Mol:

    def _from_sdf(molblock: str) -> Chem.rdchem.Mol:
        reader = get_sd_reader(molblock)
        mol = next(reader)
        if mol is None:
            raise RuntimeError("Single molblock required; 0 present")
        for m in reader:
            # there is another molblock if this can loop
            raise RuntimeError("Single molblock required; multiple present")
        return mol

    _from_smiles = partial(Chem.MolFromSmiles, sanitize=False)

    str_to_mol = {
        Format.SDF: _from_sdf,
        Format.SMILES: _from_smiles,
        Format.CXSMILES: _from_smiles,
        Format.SMARTS: Chem.MolFromSmarts,
        Format.INCHI: partial(Chem.MolFromInchi, sanitize=False),
    }

    with rdkit_adapter.convert_log_to_exception():
        mol = str_to_mol[format](data)
        _fix_r0_rgroup(mol)
        # partial sanitization to preserve input molecule
        ops = (Chem.SANITIZE_ALL ^ Chem.SANITIZE_CLEANUP ^
               Chem.SANITIZE_PROPERTIES ^ Chem.SANITIZE_KEKULIZE ^
               Chem.SANITIZE_FINDRADICALS ^ Chem.SANITIZE_CLEANUPCHIRALITY)
        Chem.SanitizeMol(mol, sanitizeOps=ops)
        mol.UpdatePropertyCache(strict=False)
        return mol


def _from_rdkit_mol(mol: Chem.rdchem.Mol,
                    format: Format,
                    kekulize_sdf: bool = True) -> str:

    def _to_cxsmiles(mol: Chem.rdchem.Mol) -> str:
        mol.RemoveAllConformers()  # remove coordinates from CXSMILES
        return Chem.MolToCXSmiles(mol)

    def _to_sdf(mol: Chem.rdchem.Mol, kekulize: bool) -> str:
        mol = _replace_dummy_atoms_with_attchpts(mol)
        # SHARED-8282, SHARED-8498: there's cases that can't be kekulized,
        # so just kekulize if possible
        if kekulize:
            Chem.rdmolops.KekulizeIfPossible(mol, clearAromaticFlags=True)
        return Chem.SDWriter.GetText(mol, kekulize=False, force_v3000=True)

    mol_to_str = {
        Format.SDF: partial(_to_sdf, kekulize=kekulize_sdf),
        Format.SMILES: Chem.MolToSmiles,
        Format.SMARTS: Chem.MolToSmarts,
        Format.CXSMILES: _to_cxsmiles,
        Format.INCHI: Chem.MolToInchi,
    }
    return mol_to_str[format](Chem.Mol(mol))


def _to_rdkit_reaction(data: str,
                       format: Format) -> Chem.rdChemReactions.ChemicalReaction:
    str_to_rxn = {
        Format.SDF: rdChemReactions.ReactionFromRxnBlock,
        Format.SMILES: partial(rdChemReactions.ReactionFromSmarts,
                               useSmiles=True),
        Format.SMARTS: rdChemReactions.ReactionFromSmarts,
        Format.CXSMILES: partial(rdChemReactions.ReactionFromSmarts,
                                 useSmiles=True),
    }
    # RDKit doesn't support SMARTS extensions in reactions
    if format in (Format.SMARTS, Format.SMILES, Format.CXSMILES):
        data = data.split(" ")[0]
    if format == Format.INCHI:
        raise RuntimeError("Cannot convert INCHI format to reaction")

    rxn = str_to_rxn[format](data)

    for reactant in rxn.GetReactants():
        _fix_r0_rgroup(reactant)

    for product in rxn.GetProducts():
        _fix_r0_rgroup(product)

    return rxn


def _from_rdkit_reaction(rxn: Chem.rdChemReactions.ChemicalReaction,
                         format: Format) -> str:
    rxn_to_str = {
        Format.SDF: rdChemReactions.ReactionToRxnBlock,
        Format.SMILES: rdChemReactions.ReactionToSmiles,
        Format.SMARTS: rdChemReactions.ReactionToSmarts,
        Format.CXSMILES: rdChemReactions.ReactionToSmiles,
    }
    if format == Format.INCHI:
        raise RuntimeError("Cannot convert reaction to INCHI format")
    return rxn_to_str[format](rxn)


def _set_attachment_point_coordinates(mol, edit_mol, attch_pt_idxs):
    """
    Calculates coordinate of recently added attachment points by aligning to
    the original molecule and assigns chiral tags using bond directions and
    new coordinates
    """
    mol.UpdatePropertyCache(strict=False)
    edit_mol_copy = Chem.Mol(edit_mol)
    edit_mol_copy.UpdatePropertyCache(strict=False)
    substructure.apply_substructure_coordinates(edit_mol_copy, mol)
    positions = edit_mol_copy.GetConformer().GetPositions()

    # Since applying substructure coordinates can change non-attachment point
    # coordinates, add the new attachment point coordinates to the original mol
    for at_idx in attch_pt_idxs:
        xyz = positions[at_idx]
        edit_mol.GetConformer().SetAtomPosition(at_idx, xyz)

    mol = reapply_molblock_wedging(edit_mol)
    Chem.AssignChiralTypesFromBondDirs(mol)
    return mol


def _replace_attachment_points(mol, attch_pt_parents):
    """
    Replace attachment points with dummy atoms to preserve chirality
    """
    # Replace attachment points with dummy atoms
    edit_mol = Chem.RWMol(mol)
    attch_pt_idxs = []
    attch_pt_num = 1
    for atom_idx in attch_pt_parents:
        num_attch_pts = 1
        parent_atom = edit_mol.GetAtomWithIdx(atom_idx)
        attch_num = parent_atom.GetIntProp(MOL_PROP_ATTACHPT)
        if attch_num == -1:
            # this atom has two attachment points
            num_attch_pts = 2
            attch_num = 1
        exp_hs = parent_atom.GetNumExplicitHs()
        for i in range(num_attch_pts):
            new_at = edit_mol.AddAtom(Chem.Atom(0))
            attch_pt_idxs.append(new_at)
            edit_mol.AddBond(atom_idx, new_at, Chem.BondType.SINGLE)
            edit_mol.GetAtomWithIdx(new_at).SetProp(ATOM_PROP_ATOM_LABEL,
                                                    f'_AP{attch_num}')
            attch_num += 1
        parent_atom.ClearProp(MOL_PROP_ATTACHPT)
        if exp_hs:
            parent_atom.SetNumExplicitHs(exp_hs - num_attch_pts)

    # Dummy atoms should be query atoms. This ensures the dummy atom is written
    # as '*' instead of 'R'
    params = rdmolops.AdjustQueryParameters.NoAdjustments()
    params.makeDummiesQueries = True
    edit_mol = rdmolops.AdjustQueryProperties(edit_mol, params)

    return _set_attachment_point_coordinates(mol, edit_mol, attch_pt_idxs)


def _replace_dummy_atoms_with_attchpts(mol):
    # According to the MDL specification, ATTCHPT=1 is the first attachment point,
    # ATTCHPT=2 is the second attachment point, and ATTCHPT=-1 means that this
    # atom has the first and second attachment points.
    remove_attchpts = []
    for at in mol.GetAtoms():
        if atom_has_attachment_point(at):
            remove_attchpts.append(at.GetIdx())
            parent = at.GetNeighbors()[0]
            exp_hs = parent.GetNumExplicitHs()
            if parent.HasProp(MOL_PROP_ATTACHPT):
                parent.SetIntProp(MOL_PROP_ATTACHPT, -1)
                parent.SetNumExplicitHs(exp_hs + 2)
            else:
                parent.SetIntProp(MOL_PROP_ATTACHPT,
                                  int(at.GetProp(ATOM_PROP_ATOM_LABEL)[3]))
                parent.SetNumExplicitHs(exp_hs + 1)

    if not remove_attchpts:
        return mol

    # remove attachment points
    edit_mol = Chem.RWMol(mol)
    edit_mol.BeginBatchEdit()
    for at_idx in remove_attchpts:
        edit_mol.RemoveAtom(at_idx)
    edit_mol.CommitBatchEdit()

    return edit_mol.GetMol()


def _get_processed_mol(mol):
    if mol is None:
        return None

    # Replace attachment points with dummy atoms to preserve chirality
    attch_pt_parents = [
        a.GetIdx() for a in mol.GetAtoms() if a.HasProp(MOL_PROP_ATTACHPT)
    ]
    if attch_pt_parents:
        mol = _replace_attachment_points(mol, attch_pt_parents)
    else:
        # SHARED-8537: Preserve wiggly bond directions for molecules without
        # attachment points
        for bnd in mol.GetBonds():
            wiggly_bond_v3000 = bnd.HasProp(
                BOND_PROP_BOND_CONFIG) and bnd.GetIntProp(
                    BOND_PROP_BOND_CONFIG) == 2
            wiggly_bond_v2000 = bnd.HasProp(
                BOND_PROP_BOND_STEREO) and bnd.GetIntProp(
                    BOND_PROP_BOND_STEREO) == 4
            if wiggly_bond_v3000 or wiggly_bond_v2000:
                bnd.SetBondDir(Chem.BondDir.UNKNOWN)

    # SHARED-8629: Add polymer brackets in convert if they are not present
    return add_polymer_brackets(mol, keep_existing_brackets=True)


def _detect_unkekulized_atoms(m):
    opts = Chem.SANITIZE_ALL ^ Chem.SANITIZE_PROPERTIES ^ Chem.SANITIZE_CLEANUP ^ Chem.SANITIZE_CLEANUPCHIRALITY ^ Chem.SANITIZE_FINDRADICALS
    errs = Chem.DetectChemistryProblems(m, opts)
    for err in errs:
        if err.GetType() == 'KekulizeException':
            return sorted(err.GetAtomIndices())
    return None


[docs]def reapply_molblock_wedging(mol):
    for bnd in mol.GetBonds():
        # only change the wedging if the bond was wedged in the input data (we
        # recognize that by looking for the properties the mol file parser
        # sets):
        if bnd.HasProp(BOND_PROP_BOND_STEREO):
            # V2000
            val = bnd.GetProp(BOND_PROP_BOND_STEREO)
            if val == "1":
                bnd.SetBondDir(Chem.BondDir.BEGINWEDGE)
            elif val == "6":
                bnd.SetBondDir(Chem.BondDir.BEGINDASH)
            elif val == "4":
                bnd.SetBondDir(Chem.BondDir.UNKNOWN)
        elif bnd.HasProp(BOND_PROP_BOND_CONFIG):
            # V3000
            val = bnd.GetProp(BOND_PROP_BOND_CONFIG)
            if val == "1":
                bnd.SetBondDir(Chem.BondDir.BEGINWEDGE)
            elif val == "3":
                bnd.SetBondDir(Chem.BondDir.BEGINDASH)
            elif val == "2":
                bnd.SetBondDir(Chem.BondDir.UNKNOWN)
        elif bnd.GetBondDir() in (Chem.BondDir.BEGINDASH,
                                  Chem.BondDir.BEGINWEDGE,
                                  Chem.BondDir.UNKNOWN):
            bnd.SetBondDir(Chem.BondDir.NONE)
    return mol


[docs]def add_hs_to_aromatic_nitrogen(mol):
    """
    Intended to be used with molecules which have kekulization failures due to
    aromatic system(s) containing Ns where the user hasn't provided the location of
    the implicit Hs in the system.

    This picks an arbitrary (but canonical) aromatic N to add an H to in each
    aromatic system.

    Returns the original mol if it can't fix it.
    """
    Chem.GetSymmSSSR(mol)
    tm = Chem.Mol(mol)
    unkekulized_atoms = _detect_unkekulized_atoms(tm)
    while unkekulized_atoms:
        tm, new_unkekulized_atoms = _fix_ring_system(tm, unkekulized_atoms)
        if new_unkekulized_atoms == unkekulized_atoms:
            # got stuck, return the original
            return mol
        unkekulized_atoms = new_unkekulized_atoms

    return tm


def _fix_ring_system(m, unkekulized_atoms):
    m.UpdatePropertyCache(False)

    # we want to always generate the same result for the same molecule
    # Which result to return will cause arguments, yay, but we'll
    # prefer smaller rings, then use the canonical atom ranking to break ties
    ranks = Chem.CanonicalRankAtoms(m)
    ri = m.GetRingInfo()
    atoms_to_check = unkekulized_atoms[:]
    atoms_to_check.sort(key=lambda x: (ri.MinAtomRingSize(x), ranks[x]))
    while atoms_to_check:
        ai = atoms_to_check.pop(0)
        atom = m.GetAtomWithIdx(ai)
        if atom.GetAtomicNum() != 7:
            continue
        if atom.GetDegree() != 2 or atom.GetNumExplicitHs() != 0:
            continue

        tm = Chem.Mol(m)
        atom = tm.GetAtomWithIdx(ai)
        atom.SetNumExplicitHs(1)
        atom.SetNoImplicit(True)
        # did we fix something?
        new_unkekulized_atoms = _detect_unkekulized_atoms(tm)
        if new_unkekulized_atoms != unkekulized_atoms:
            return tm, new_unkekulized_atoms
    return m, unkekulized_atoms


def _fix_r0_rgroup(mol):
    highest_rlabel = 0
    atoms_with_r0 = []
    for atom in mol.GetAtoms():
        if atom.HasProp(MOL_PROP_R_LABEL):
            label = atom.GetIntProp(MOL_PROP_R_LABEL)
            highest_rlabel = max(highest_rlabel, label)
            if label == 0:
                atoms_with_r0.append(atom)

    if atoms_with_r0:
        highest_rlabel += 1
        for atom in atoms_with_r0:
            atom.SetIntProp(MOL_PROP_R_LABEL, highest_rlabel)


def _get_perpendicular_bracket(start_xyz, end_xyz):
    """
    Get coordinates for a bracket bisecting bond X. The bracket length will be the same
    length as the bond that it bisects.
    """
    xy_change = start_xyz - end_xyz
    opposite_reciprocal = np.array([xy_change[1] / 2, (xy_change[0] * -1) / 2])
    midpoint = np.array([(start_xyz[0] + end_xyz[0]) / 2,
                         (start_xyz[1] + end_xyz[1]) / 2])
    bracket = [
        Geometry.Point3D(*(midpoint + opposite_reciprocal), 0),
        Geometry.Point3D(*(midpoint - opposite_reciprocal), 0),
        Geometry.Point3D(0, 0, 0)
    ]
    return bracket


[docs]def is_polymer(s_group):
    if not s_group.HasProp("TYPE"):
        raise ValueError(
            f"Substance group {s_group.GetIndexInMol()} does not have the TYPE property"
        )
    return s_group.GetProp("TYPE") in [
        "SRU", "MON", "COP", "CRO", "GRA", "MOD", "MER", "ANY"
    ]


[docs]def add_polymer_brackets(mol, revert_to_mol=None, keep_existing_brackets=False):
    """
    Add polymer brackets back to mol.

    :param mol: RDKit mol to add polymer brackets to
    :param revert_to_mol: RDKit mol to revert to if polymer brackets cannot be
        added correctly to provided mol. This will occur when brackets cross more
        than one bond.
    :param keep_existing_brackets: whether to recalculate the positions of brackets
        that are already present
    :return: RDKit mol with polymer brackets
    """
    if not revert_to_mol:
        revert_to_mol = Chem.Mol(mol)
    substance_groups = Chem.GetMolSubstanceGroups(mol)
    Chem.ClearMolSubstanceGroups(mol)
    pos = mol.GetConformer().GetPositions()
    revert = False
    for s_group in substance_groups:
        # do not add brackets to non-polymer sgroups or sgroups without atoms
        if len(s_group.GetAtoms()) == 0 or (not is_polymer(s_group) and
                                            len(s_group.GetBrackets()) == 0):
            Chem.AddMolSubstanceGroup(mol, s_group)
            continue

        if keep_existing_brackets and len(s_group.GetBrackets()) != 0:
            # do not recalculate bracket coordinates
            Chem.AddMolSubstanceGroup(mol, s_group)
            continue

        bond_positions = [(pos[mol.GetBondWithIdx(bnd).GetBeginAtomIdx()],
                           pos[mol.GetBondWithIdx(bnd).GetEndAtomIdx()])
                          for bnd in s_group.GetBonds()]
        s_group.ClearBrackets()
        if (len(bond_positions) == 0):
            # put brackets around all substance group atoms
            atom_pos = [pos[a] for a in s_group.GetAtoms()]
            offset_from_atom = .5
            min_x = min([xyz[0] for xyz in atom_pos]) - offset_from_atom
            min_y = min([xyz[1] for xyz in atom_pos]) - offset_from_atom
            max_x = max([xyz[0] for xyz in atom_pos]) + offset_from_atom
            max_y = max([xyz[1] for xyz in atom_pos]) + offset_from_atom
            left_bracket = [
                Geometry.Point3D(min_x, min_y, 0),
                Geometry.Point3D(min_x, max_y, 0),
                Geometry.Point3D(0, 0, 0),
            ]
            right_bracket = [
                Geometry.Point3D(max_x, min_y, 0),
                Geometry.Point3D(max_x, max_y, 0),
                Geometry.Point3D(0, 0, 0),
            ]
            s_group.AddBracket(left_bracket)
            s_group.AddBracket(right_bracket)
        elif (len(bond_positions) == 2):
            s_group.AddBracket(
                _get_perpendicular_bracket(bond_positions[0][0],
                                           bond_positions[0][1]))
            s_group.AddBracket(
                _get_perpendicular_bracket(bond_positions[1][0],
                                           bond_positions[1][1]))
        else:
            revert = True
            break
        Chem.AddMolSubstanceGroup(mol, s_group)
    if revert:
        # when brackets have to go through 2 or more bonds, it is unlikely that
        # the new coordinates will allow a good placement of the polymer
        # brackets
        mol = Chem.Mol(revert_to_mol)
    return mol