Source code for schrodinger.structutils.rgroup_enumerate

"""
Module for R-group enumeration.
"""

import collections
import csv
import io
import itertools

from schrodinger.infra import fast3d
from schrodinger.infra import mm
from schrodinger.infra.mmbitset import Bitset
from schrodinger.structutils import analyze
from schrodinger.structutils import rmsd
from schrodinger.ui.qt.filter_dialog_dir import filter_core
from schrodinger.utils import ligfilter
from schrodinger.utils import log
from schrodinger.utils.cgxutils import _get_stereo
from schrodinger.utils.cgxutils import _order
from schrodinger.utils.cgxutils import recompute_stereo

logger = log.get_output_logger('rgroup_enumerate')
"""
RGroup properties:

- atom_index: index of the atom bound to the core (aka the "leaving atom")
- source_index: index of the R-group source that will replace this group
- leaving_atoms: list of all the atoms in the leaving group (by index)
- staying_atom: index of the core atom bound to the leaving group
"""
RGroup = collections.namedtuple('RGroup', [
    'atom_index', 'source_index', 'leaving_atoms', 'staying_atom', 'bond_order'
])

# TODO: the RGroup namedtuple should probably be renamed LeavingGroup
# to reflect its current usage better.


[docs]class Concentration:
    """
    Concentration classes manage "concentration groups", which define the exact
    number of possible combinations to be explored. They construct new
    instances from commandline strings. For each concentration group, they also
    track the associated R group sources, which are written as properties to
    enumerated products.
    """

[docs]    def __init__(self, atoms, concentration):
        """
        :param atoms: the atom indices of the input structure corresponding to
            specified attachment points.
        :type atoms: list(int)

        :param concentration: the concentration, a float that defines the
            maximum number of simultaneous substitutions across the provided
            atoms
        :type concentration: int
        """
        self.atoms = atoms
        atom_set = set(atoms)
        if len(atom_set) != len(atoms):
            raise ValueError(f"Duplicate atoms specified in {atoms}")
        if concentration < 1:
            raise ValueError(
                f"Concentration {concentration} is too small. Concentrations "
                "must be at least 1.")
        # length is the number of attachment points to change at once
        self.length = int(concentration)
        # "concentration" is normalized to preserve old behavior
        self.concentration = self.length / len(atoms)
        self.rgroup_sources = []

    def __str__(self):
        return f"Concentration({self.atoms}:{self.concentration})"

[docs]    def appendSource(self, rgroup_source):
        """
        :param rgroup_source: a pair of attachment points and R group input
            structures, used to enumerate new R groups at the specified
            attachment points
        :type rgroup_source: rgroup_enumerate.RGgroupSource
        """
        self.rgroup_sources.append(rgroup_source)

[docs]    def addConcentrationProperty(self, product):
        """
        Add the concentration property (if applicable) to a product.
        :param product: target product
        :type product: structure.Structure
        """
        # current behavior (preserve legacy concentration specs only):
        # write concentrations only when all attachment points in the
        # concentration group have the same R group attached.
        if len(self.rgroup_sources) == 1:
            # source index properties are 0-indexed in the module but 1-indexed
            # in the product
            source_index = self.rgroup_sources[0].source_index + 1
            product.property[
                f'r_rge_occupation_R{source_index}'] = self.concentration


# Descriptors that this module cares to compute. Used to implement the -filter
# option of r_group_enumerate.py.
DESCRIPTOR_DICT = {
    # name: (func, limiter))
    'r_rge_Molecular_weight': (ligfilter.Molecular_weight, (0.0, 1000.0)),
    'i_rge_Num_rings': (ligfilter.Num_rings, (0, 10)),
    'i_rge_Num_aromatic_rings': (ligfilter.Num_aromatic_rings, (0, 10)),
    'i_rge_Num_aliphatic_rings': (ligfilter.Num_aliphatic_rings, (0, 10)),
    'i_rge_Num_heteroaromatic_rings':
        (ligfilter.Num_heteroaromatic_rings, (0, 10)),
    'i_rge_Num_rotatable_bonds': (ligfilter.Num_rotatable_bonds, (0, 50)),
    'i_rge_Num_atoms': (ligfilter.Num_atoms, (0, 500)),
    'i_rge_Num_chiral_centers': (ligfilter.Num_chiral_centers, (0, 10)),
    'i_rge_Total_charge': (ligfilter.Total_charge, (-5, 5)),
    'i_rge_Num_positive_atoms': (ligfilter.Num_positive_atoms, (0, 10)),
    'i_rge_Num_negative_atoms': (ligfilter.Num_negative_atoms, (0, 10)),
    'i_rge_Num_heavy_atoms': (ligfilter.Num_heavy_atoms, (0, 300)),
}

DUMMY_ATOMIC_NUMBER = -2


[docs]class RGroupSource:
    """
    Class to generate rgroup structure iterators that are associated with
    specific attachment points.
    """

[docs]    def __init__(self, source_index, structures, atom_indices,
                 attachment_indices):
        """
        :param structure: structure iterator
        :type structure: iter(structure.Structure)

        :param attachment_indices: affected attachment points
        :type attachment_indices: list(int)
        """
        self.source_index = source_index
        self.structures = structures
        self.atom_indices = set(atom_indices)
        self.attachment_indices = set(attachment_indices)
        # consume the structure iterator so we can iterate multiple times
        self.structure_list = list(structures)
        self.length = len(self.structure_list)

    def __str__(self):
        return f"RGroupSource({self.structures}:{self.attachment_indices})"

[docs]    def makeIter(self, target_attachments):
        """
        Create a new iterator that stores and returns the associated target
        atoms.

        :param target_attachments: list of attachment point indices specifying
            affected attachment points.
        :type target_attachments: list(int)

        :return: iterator that knows which attachment points to modify
        :rtype: RGroupIter
        """
        if not set(target_attachments).issubset(set(self.attachment_indices)):
            raise ValueError(
                "Target attachments include atoms not associated with this rgroup."
            )
        for st in self.structure_list:
            yield (st, target_attachments)


[docs]class RGroupError(Exception):
    """
    Exception class for errors specific to this module, which the caller
    may want to present to the user as a simple error message, as opposed
    to a traceback. This is meant for "user errors", as opposed to bugs;
    for example, when an input structure doesn't fulfill the requirements.
    """


[docs]class BondOrderMismatch(RGroupError):
    """
    A specific kind of error that we'll ignore to allow libraries that
    include R-groups with different bond orders.
    """


[docs]class RGroupEnumerator(object):
    """
    Enumerate a structure using R-group sources.

    A source is a sequence with an iterable of Structure as its first element,
    followed by one or more core atom atom indices where the side chains from
    the source should be inserted.

    RGroupEnumerator objects are iterable. Example::

        sources = [
            (StructureReader('r1.maegz'), 4, 12),
            (StructureReader('r2.maegz'), 8),
        ]
        for prod_st in RGroupEnumerator(core_st, sources):
            ...

    will use the first reader to replace atoms 4 and 12 in an homo fashion
    (meaning that for a given product, the groups attached to atoms 4 and 12 are
    always the same), in combination with the structures for the second reader
    for atom 8.

    The generated structures have the title of the core structure and the title
    of each of the R-groups, encoded in CSV format. For ease of parsing, this
    information is also stored as separate properties: i_rge_num_r_groups has
    the number of R groups, and the title of each is goes in properties
    r_rge_R1, r_rge_R2, etc.

    As an option, all CT properties from the R groups can be copied to each
    product molecule. These properties have the original name prefixed with
    <type-char>_rge_R<index>_; for example, r_i_glide_gscore for the first
    R group becomes r_rge_R1_r_i_glide_gscore.

    The structures in each R-group source should each have one dummy atom
    (symbol '', atomic number zero).

    The user of the class can request only a slice of the full set of
    combinations to be yielded, by providing the optional 'start' and 'stop'
    constructor arguments. These follow the standard Python slicing convention.

    If the core structure came from an SDF file with R-group labels
    ("M  RGP" lines), the attachment atoms don't need to be specified; the
    labels from the file can be used implicitly.
    """

[docs]    def __init__(self,
                 core_st,
                 sources,
                 optimize_sidechains=True,
                 deduplicate=True,
                 start=0,
                 stop=None,
                 copy_properties=False,
                 enumerate_cistrans=True,
                 yield_renum_maps=False,
                 concentrations=()):
        """
        Initialize an enumerator for a given core structure and specification of
        rgroup sources.

        :param core_st: core structure
        :type core_st: `schrodinger.structure.Structure`

        :param sources: side chain sources. See class description for details.
        :type sources: list of list

        :param optimize_sidechains: if true, generate 3D coordinates for the
                side chain atoms using Fast3D. The input coordinates will only be
                used for determining stereochemistry. If false, position the side
                chains using rigid rotation and translation (and an arbitrary
                torsional angle around the new bond).
        :type optimize_sidechains: bool

        :param deduplicate: use unique SMILES to identify
            and reject duplicate products
        :type deduplicate: bool

        :param start: beginning of results slice (used by subjobs)
        :type start: int

        :param stop: end of results slice (used by subjobs)
        :type stop: int or None

        :param copy_properties: if true, copy all CT properties from each
            R-group to the constructed molecule.
        :type copy_properties: bool

        :param enumerate_cistrans: if True (default), emit both cis and
            trans isomers for double-bonded R-groups.
        :type enumerate_cistrans: bool

        :param yield_renum_maps: if True then on each iteration yield not only the
            product structure but also the relevant old-to-new atom index map
        :type yield_renum_maps: bool

        :param concentrations: List of concentrations, which define the number
            of simultaneous R group substitutions are made across all
            attachment point atoms in the concentration group.
        :type concentrations: iterable(Concentration) or None
        """
        self.core_st = core_st
        self.optimize_sidechains = optimize_sidechains
        self.deduplicate = deduplicate
        self.start = start
        self.stop = stop
        self.iters = [source[0] for source in sources]
        self.atom_map = {}
        self.r_iters = []
        if all(len(source) == 1 for source in sources):
            # Attachment atoms were not specified for any of the R-group
            # sources, so we'll try to figure them out from SDF atom properties.
            try:
                sources = get_sources_from_r_labels(core_st, self.iters)
            except ValueError as e:
                raise RGroupError(str(e))

        # ensure all R group and concentration group definitions are valid
        # and consistent, and throw an error if not possible
        self.concentrations = self._makeConcentrations(sources, concentrations)

        self.rgroups = []
        self.copy_properties = copy_properties
        self.enumerate_cistrans = enumerate_cistrans
        self.yield_renum_maps = yield_renum_maps
        self.rgroup_sources = []
        seen_atoms = set()
        natoms = core_st.atom_total
        for source_index, source in enumerate(sources):
            rgroups_indices = []
            structure = source[0]
            atoms = source[1:]
            for atom_index in atoms:
                # if any of the rgroup sets overlap, raise an error
                if atom_index in seen_atoms:
                    raise RGroupError(
                        'Core attachment atom %d specified more than once.' %
                        (atom_index))
                if not 1 <= atom_index <= natoms:
                    raise RGroupError('Invalid core attachment atom index %d. '
                                      '(Core has %d atoms.)' %
                                      (atom_index, natoms))
                leaving_atom = core_st.atom[atom_index]
                try:
                    staying_atom = find_staying_atom(core_st, leaving_atom)
                except ValueError as e:
                    raise RGroupError(str(e))
                leaving_atoms = list(
                    core_st.getMovingAtoms(staying_atom, leaving_atom))
                bond_order = core_st.getBond(leaving_atom, staying_atom).order
                r = RGroup(atom_index, source_index, leaving_atoms,
                           staying_atom.index, bond_order)
                self.rgroups.append(r)
                seen_atoms.add(atom_index)
                rgroups_indices.append(len(self.rgroups) - 1)
                self.atom_map[atom_index] = len(self.rgroups) - 1

            # create copyable rgroup iterators for combinations
            mapped_atoms = [self.atom_map[atom] for atom in atoms]
            rgroup_source = RGroupSource(source_index, structure, atoms,
                                         mapped_atoms)
            self.rgroup_sources.append(rgroup_source)
            concentration_group = self._findConcentrationGroupByAtoms(atoms)
            concentration_group.appendSource(rgroup_source)
            self.r_iters.append(rgroup_source)

        for conc in self.concentrations:
            conc.mapped_atoms = [self.atom_map[atom] for atom in conc.atoms]

        if self.optimize_sidechains:
            self.f3d_engine = fast3d.SingleConformerEngine()

    def _makeConcentrations(self, rgroups, concentrations):
        """
        Validate and create concentration groups for all provided
        concentrations, as well as any rgroups which do not have any
        concentration specified (default concentration, all attachment points
        have the same R group).
        :param rgroups: list of R groups, specified by R group string, e.g.
        amine.mae,1,3,12 for the R group structures from amine.mae, applied
        to attachment points at atoms 1, 3, and 12.
        :type rgroups: list(str, int, int, ...)

        :param concentrations: list of concentrations, specified as a list of
            Concentrations, or a list of numbers (legacy behavior)
        :type concentrations: list(str) or list(float) or list(Concentration)
        """
        all_concentrations = []
        # add user-specified concentrations, if any
        if concentrations:
            # legacy concentration spec: list of numbers
            if all(
                    isinstance(concentration, float) or
                    isinstance(concentration, int)
                    for concentration in concentrations):

                # legacy concentrations must all be between 0 and 1, and not 0
                assert all(
                    0 < concentration <= 1 for concentration in concentrations)

                all_concentrations = [
                    Concentration(
                        rgroup[1:],
                        _concentrationFromFraction(len(rgroup[1:]), conc))
                    for rgroup, conc in zip(rgroups, concentrations)
                ]
            # list of strings, which define the concentration and
            # the affected atoms
            elif all(
                    isinstance(concentration, Concentration)
                    for concentration in concentrations):
                all_concentrations = [
                    Concentration(conc.atoms, conc.length)
                    for conc in concentrations
                ]
            else:
                raise RGroupError("Invalid concentrations.")
        # ensure that every R group attachment point has a concentration, and
        # create a concentration-1 group for any ungrouped attachment points
        rgroup_sets = [set(rgroup[1:]) for rgroup in rgroups]
        conc_sets = [set(conc.atoms) for conc in all_concentrations]
        all_conc_atoms = set().union(*conc_sets)
        all_rgroup_atoms = set().union(*rgroup_sets)
        # if there are concentration atoms without rgroups, raise an error
        unassigned_concs = all_conc_atoms.difference(all_rgroup_atoms)
        if unassigned_concs:
            raise RGroupError(
                f"Atom(s) {unassigned_concs} do not correspond to any R groups")

        for rgroup_atoms in rgroup_sets:
            if all(not rgroup_atoms.issubset(conc_set)
                   for conc_set in conc_sets):
                # if any rgroup set is not a subset of a concentration group
                # raise an error (we don't currently support homo constraints
                # across concentration groups
                if all_conc_atoms.intersection(rgroup_atoms):
                    raise RGroupError(
                        f"R group atoms {rgroup_atoms} need to be wholly "
                        "contained by exactly one concentration group")
                # otherwise, create a concentration group for the R group
                else:
                    all_concentrations.append(
                        Concentration(rgroup_atoms, len(rgroup_atoms)))

        # if any concentration atoms are reused, raise an error
        atoms = [set(conc.atoms) for conc in all_concentrations]
        atom_counts = collections.Counter(itertools.chain.from_iterable(atoms))
        reused_atoms = [
            atom for atom, count, in atom_counts.items() if count > 1
        ]
        if reused_atoms:
            raise RGroupError(
                'Atom(s) specified in more than one concentration '
                f'group: {reused_atoms}')

        return all_concentrations

    def _findConcentrationGroupByAtoms(self, rgroup_atoms):
        matches = [
            conc for conc in self.concentrations
            if set(rgroup_atoms).issubset(conc.atoms)
        ]
        if len(matches) != 1:
            raise RGroupError(f"Rgroup atoms {rgroup_atoms} not wholly "
                              "contained by a concentration group.")
        return matches[0]

    def __iter__(self):
        yield from self._enumerate_and_duduplicate() \
            if self.deduplicate else self._enumerate()

    def _enumerate_and_duduplicate(self):
        seen = set()
        for prod in self._enumerate():
            code = analyze.generate_smiles(
                prod[0] if self.yield_renum_maps else prod)
            if code in seen:
                continue
            else:
                seen.add(code)
                yield prod

    def _enumerate(self):
        conc_combinations = [
            itertools.combinations(conc.mapped_atoms, conc.length)
            for conc in self.concentrations
        ]

        product_of_combinations = filtered_combinations(self.r_iters,
                                                        conc_combinations)
        combinations = itertools.islice(product_of_combinations, self.start,
                                        self.stop)

        for assignments in combinations:
            structs = [assignment[0][0] for assignment in assignments]
            sidechains = [None] * len(self.rgroups)
            for assignment in assignments:
                for struct, r_indices in assignment:
                    for assigned_index in r_indices:
                        sidechains[assigned_index] = struct

            try:
                for prod, renum_map in self.attachSidechains(sidechains):
                    self._addMetadata(prod, tuple(structs))
                    if self.yield_renum_maps:
                        yield prod, renum_map
                    else:
                        yield prod
            except BondOrderMismatch as e:
                logger.warning(str(e))

[docs]    def attachSidechains(self, sidechains):
        """
        Attach the sidechains to the core structure and return the resulting
        structure and index map.

        :param sidechains: list of sidechains. Should have the same length as
                           the number of attachment atoms in the core.
        :type sidechains: list of `schrodinger.structure.Structure`

        :yield: product structures and index maps
        :ytype: `schrodinger.structure.Structure`, dict
        """
        prod = self.core_st.copy()
        atoms_to_delete = []
        r_atoms = []
        ezs = []
        for r, sidechain in zip(self.rgroups, sidechains):
            if sidechain is None:
                continue

            try:
                to_del, r_atom, s_atom = self._attachSidechain(
                    prod, sidechain, r)
                atoms_to_delete += to_del
                r_atoms.append(r_atom)
                if r.bond_order == 2 and self.enumerate_cistrans:
                    ezs.append((r_atom, s_atom))
            except (ValueError, RGroupError) as e:
                cls = type(e) if isinstance(e, RGroupError) else RGroupError
                raise cls('Error attaching sidechain "%s" from source %d '
                          'to core atom %d: %s' %
                          (sidechain.title, r.source_index + 1, r.atom_index,
                           e.args[0]))

        renum_map = prod.deleteAtoms(atoms_to_delete, renumber_map=True)
        product_core_atoms = [
            renum_map[i]
            for i in self.core_st.getAtomIndices()
            if renum_map[i] is not None
        ]
        input_core_atoms = [
            i for i in self.core_st.getAtomIndices() if renum_map[i] is not None
        ]
        product_ezs = set()
        for (i, j) in ezs:
            _i, _j = renum_map[i], renum_map[j]
            if _i is not None and _j is not None:
                product_ezs.add(_order(_i, _j))

        if len(input_core_atoms) < 3:
            # Not enough core atoms are left for perfect alignment, so we'll
            # also try to align the first atom from each R group on top of the
            # core attachment atom it replaced.
            input_core_atoms += [r.atom_index for r in self.rgroups]
            product_core_atoms += [renum_map[i] for i in r_atoms]

        for prod_iso in _enumerate_ezs(prod, product_core_atoms, product_ezs):
            if self.optimize_sidechains:
                self._generateCoordinates(prod_iso, product_core_atoms)

            self._alignCore(prod_iso, input_core_atoms, product_core_atoms)
            yield prod_iso, renum_map

    def _attachSidechain(self, prod, sidechain, r):
        """
        Attach a sidechain to the core at the specified atom index. Returns the
        indices of the two atoms that should be deleted, as well as the index of
        the sidechain atom that is now bonded to the core. (This low-level
        method doesn't delete them because the caller may be attaching multiple
        sidechains, and deleting each time could trigger confusing renumbering
        and be less efficient.)

        :param prod: core/product, to be modified in place
        :type prod: `schrodinger.structure.Structure`

        :param sidechain: sidechain structure (not modified)
        :type sidechain: `schrodinger.structure.Structure`

        :param r: RGroup object, which specifies the atom to bind to and the
                atoms to delete.
        :type r: RGroup

        :return: list of atoms to delete, R-group atom bound to core,
            staying core atom
        :rtype: 3-tuple (list of int, int, int)
        """

        sidechain = sidechain.copy()

        dummy_atom = _find_dummy(sidechain)
        r_atom, r_order = _find_neighbor(sidechain.atom[dummy_atom])

        try:
            core_atom, core_order = _find_neighbor(prod.atom[r.atom_index])
        except ValueError:
            # The reason we don't always use the staying atom is that it might
            # have been deleted in the edge case of a 2-atom core.
            core_atom = r.staying_atom
            core_order = r.bond_order

        if core_order != r_order:
            raise BondOrderMismatch('Attachment bond order mismatch.')

        rmsd.superimpose_bond(prod, (core_atom, r.atom_index), sidechain,
                              (dummy_atom, r_atom))

        new_dummy_atom = prod.atom_total + dummy_atom
        old_atom_total = prod.atom_total
        new_r_atom = prod.atom_total + r_atom
        prod.extend(sidechain)
        # Change sidechain atoms residue name and number to match the ones
        # for the 'staying' atom
        res_num = prod.atom[r.staying_atom].resnum
        res_name = prod.atom[r.staying_atom].pdbres
        ins_code = prod.atom[r.staying_atom].inscode
        chain = prod.atom[r.staying_atom].chain
        for atom_index in range(old_atom_total + 1, prod.atom_total + 1):
            prod.atom[atom_index].resnum = res_num
            prod.atom[atom_index].pdbres = res_name
            prod.atom[atom_index].inscode = ins_code
            prod.atom[atom_index].chain = chain

        prod.addBond(core_atom, new_r_atom, r_order)
        prod.deleteBond(core_atom, r.atom_index)
        atoms_to_delete = r.leaving_atoms + [new_dummy_atom]
        return atoms_to_delete, new_r_atom, core_atom

    def _generateCoordinates(self, st, core_atoms):
        """
        Generate 3D coordinates for the non-core atoms in the structure using
        fast3d.

        :param st: structure, to be modified in place
        :type st: `schrodinger.structure.Structure`

        :param core_atoms: list of core atoms which will be kept frozen (may be
                           empty)
        :type core_atoms: list of int
        """

        core_atom_indices = set(core_atoms)

        amide_ez_props = add_amide_constraints(st, core_atom_indices)

        core_atom_indices |= get_metals_and_neighbors(st)

        try:
            self.f3d_engine.run(st, list(core_atom_indices))
        except RuntimeError:
            pass

        for prop in amide_ez_props:
            del st.property[prop]

    def _alignCore(self, st, input_core_atoms, product_core_atoms):
        """
        Superimpose the product structure on the input core.

        :param st: structure to rotate in place
        :type st: `schrodinger.structure.Structure`

        :param input_core_atoms: list of core atoms in self.core_st
        :type input_core_atoms: list of int

        :param product_core_atoms: list of core atoms in 'st
        :type product_core_atoms: list of int

        """
        if len(input_core_atoms) > 2:
            rmsd.superimpose(self.core_st,
                             input_core_atoms,
                             st,
                             product_core_atoms,
                             use_symmetry=False,
                             move_which=rmsd.CT)
        elif len(input_core_atoms) == 2:
            rmsd.superimpose_bond(self.core_st, input_core_atoms, st,
                                  product_core_atoms)
        else:
            # This should never happen; cores with < 2 atoms would have
            # failed before this point is reached, but leaving the else
            # for completeness.
            raise RGroupError("Core must have at least two atoms.")

    def _addMetadata(self, prod, sidechains):
        prod.property['i_rge_num_r_groups'] = len(sidechains)

        # Set concentration
        for conc in self.concentrations:
            conc.addConcentrationProperty(prod)

        for i, st in enumerate(sidechains, 1):
            rlabel = 's_rge_R%d' % i
            prod.property[rlabel] = st.title
            if self.copy_properties:
                for prop in st.property:
                    new_prop = '%s_rge_R%d_%s' % (prop[0], i, prop)
                    prod.property[new_prop] = st.property[prop]
        prod.title = list_to_csv([st.title for st in (prod,) + sidechains])

[docs]    def combinations(self):
        """
        Return the number of combinations that will be generated for each
        tuple of R-groups. That is, combinations due to occupancy of the various
        attachment points when all the concentrations are not 1.0.
        """
        total = 1

        # since concentration groups can span multiple R groups now, we need to
        # unroll the calculation a bit more to avoid overcounting.
        for concentration in self.concentrations:
            combinations = itertools.combinations(concentration.atoms,
                                                  concentration.length)
            subtotal = 0
            for combination in combinations:
                subsubtotal = 1
                for rgroup_source in concentration.rgroup_sources:
                    if any(x in combination
                           for x in rgroup_source.atom_indices):
                        subsubtotal *= rgroup_source.length
                subtotal += subsubtotal
            total *= subtotal

        return total


def _flip_bond(st, core, i, j, k, l):  # noqa: E741, bond atom
    '''
    Flips i-j-k-l dihedral 180 degrees. Assumes that i-j, j-k,
    and k-l are bonded, and j-k bond is exocyclic.

    :param st: Structure
    :type st: `schrodinger.structure.Structure`

    :param core: Indices of the "core" atoms (that not to be moved).
    :type core: set(int)

    :param i: Atom index.
    :type i: int

    :param j: Atom index.
    :type j: int

    :param k: Atom index.
    :type k: int

    :param l: Atom index.
    :type l: int

    :raises: ValueError, mm.MmException
    '''

    bs = Bitset(size=st.atom_total)

    if k not in core:
        fixed, moving = j, k
    elif j not in core:
        fixed, moving = k, j
    else:
        raise ValueError('no movable atoms')

    mm.mmct_atom_get_moving(st, fixed, st, moving, bs)

    angle = st.measure(i, j, k, l)

    if angle > 0.0:
        angle -= 180.0
    else:
        angle += 180.0

    mm.mmct_atom_set_dihedral_angle(
        angle, mm.MM_ANGLE_DEG,
        st, i, st, j, st, k, st, l, bs) # yapf: disable


[docs]def filtered_combinations(rgroup_iters, selection_iters):
    """
    Given a set of rgroup structures (provided as a list of RGroupIterFactories)
    and attachment point combination iterators, return an iterator over
    all distinct combinations of structures and attachment point combinations.
    (Note that while the combinations are unique, the corresponding products
    may still have duplicates.)

    :param rgroup_iters: iterators over R groups, one for every source.
    :type rgoup_iters: list(RGroupSource)

    :param selection_iters: iterators over combinations, one for every set of
        attachment points from which combinations are chosen
    :type selection_iters: list(iter(tuple(int)))

    :return: iterator over all distinct product combinations
    :rtype: itertools.product(tuple(structure.Structure, list(int)))
    """
    combinations = [
        filtered_combinations_for_selection(rgroup_iters, selection_iter)
        for selection_iter in selection_iters
    ]
    chains = [itertools.chain(*combination) for combination in combinations]
    return itertools.product(*chains)


[docs]def filtered_combinations_for_selection(rgroup_iters, selection_iter):
    """
    Helper function to reduce the loop nesting in filtered_combinations.
    Handles the filtering of rgroup structure iterators per attachment point
    set (associating the rgroup iterator to its respective attachment points).
    (Note that while the combinations are unique, the corresponding products
    may still have duplicates.)

    :param rgroup_iters: iterators over R groups, one for every source.
    :type rgoup_iters: list(RGroupSource)

    :param selection_iter: iterator over combinations for a set of attachment
        points
    :type selection_iters: iter(tuple(int))

    :yield: iterator over all distinct product (R group + attachment point)
        combinations
    :ytype: itertools.product(tuple(structure.Structure, list(int)))
    """
    filtered_combinations = []
    for selection in selection_iter:
        atom_set = set(selection)
        selection_rgroup_iters = []
        for rgroup_iter in rgroup_iters:
            rgroup_atoms = rgroup_iter.attachment_indices
            intersection = atom_set.intersection(rgroup_atoms)
            if len(intersection) > 0:
                selection_rgroup_iters.append(
                    rgroup_iter.makeIter(intersection))
        # don't call itertools.product on single list to avoid reading it in
        # memory and producing products of products downstream, instead return
        # a generator that simulates itertools.product output by wrapping each
        # return in an extra tuple
        if len(selection_rgroup_iters) == 1:
            yield ((x,) for x in selection_rgroup_iters[0])
        else:
            yield itertools.product(*selection_rgroup_iters)


def _enumerate_ezs(st, core, ezs):
    '''
    Generator that enumerates E/Z isomers for the given bonds.

    :param st: Original structure.
    :type st: `schrodinger.structure.Structure`

    :param core: Indices of the "core" atoms (that are not to be moved).
    :type core: set(int)

    :param ezs: Set of ordered pairs of atom indices for the
        E/Z bonds to enumerate.
    :type ezs: set(tuple(int, int))

    :yield: Structures.
    :ytype: `schrodinger.structure.Structure`
    '''

    recompute_stereo(st)
    _, cistrans_bonds = _get_stereo(st, logger)

    todo = [ez for ez in ezs if ez in cistrans_bonds]

    for comb in itertools.product((False, True), repeat=len(todo)):
        st_iso = st.copy()
        for (ez, flip) in zip(todo, comb):
            if flip:
                atoms = cistrans_bonds[ez][1:]
                _flip_bond(st_iso, core, *atoms)
        recompute_stereo(st_iso)
        yield st_iso


def _find_neighbor(atom):
    """
    Return the index of the atom bonded to a given atom
    and corresponding bond order.

    :param atom: atom
    :type atom: `schrodinger.structure._StructureAtom`

    :return: atom index and bond order
    :rtype: (int, int)

    :raises: ValueError if the atom does not have exactly one bond.
    """
    if atom.bond_total != 1:
        raise ValueError("R-group attachment atom must have exactly one bond.")

    bond = atom.bond[1]  # 1-based indices
    return (bond.atom2.index, bond.order)


def _find_dummy(st):
    """
    Return the index of the dummy atom in a structure. Atoms with atomic number
    -2 (mmat convention) or 0 (RDKit convention) are considered dummies.

    :param st: structure to search
    :type st: `schrodinger.structure.Structure`

    :return: dummy atom index
    :rtype: int

    :raises: ValueError if the structure does not contain exactly one dummy atom.
    """
    dummies = [at.index for at in st.atom if at.atomic_number in (0, -2)]
    if len(dummies) != 1:
        raise ValueError(
            "R-group structure must have exactly one dummy atom.\n"
            "R-group libraries should be prepared using the R-Group Creator "
            "panel or the r_group_creator.py command-line script.")
    return dummies[0]


def _get_attachment_bond_order(st):
    '''
    Returns order of the bond that joins "dummy" atom to to the rest.

    :param st: R-group structure.
    :type st: `schrodinger.structure.Structure`

    :return: Attachment bond order.
    :rtype: int

    :raises: ValueError if the structure does not contain exactly one
        "dummy" atom or if the "dummy" atom does not have exactly one bond.
    '''

    _, order = _find_neighbor(st.atom[_find_dummy(st)])

    return order


[docs]def find_rgroup_from_smarts(st,
                            smarts,
                            leaving_atom_pos,
                            staying_atom_pos,
                            bond_order=None):
    """
    Find the various ways in which a structure can be split into "R-group" and
    "functional group" using a SMARTS pattern.

    The SMARTS pattern must consist of at least two atoms. Two of the atoms,
    identified by their position in the SMARTS string, are used to define
    the bond to be broken between the R group and the "leaving group".
    If the two atoms are not directly connected, the bond leading from
    the leaving atom to the staying atom is broken.

    For example, consider the structure c1ccccc1cC(=O)O and the SMARTS pattern
    *C(=O)O. With leaving_atom_pos=2, staying_atom_pos=1, the entire carboxylate
    is removed, producing the R-group c1ccccc1*. With leaving_atom_pos=4,
    staying_atom_pos=2, only the terminal O is removed, leading to the R-group
    c1ccccc1C(=O)*. (The asterisks are shown here only to highlight the bond
    that was broken.)

    The return value is a list of tuples, where the first element is the
    attachment atom index and the second is a list of the indexes of the atoms
    comprising the R-group. In the first example above, if we pretend there are
    no hydrogens, the return value might be [(7, [1,2,3,4,5,6])].

    Notes: 1) ring bonds can't be broken because they don't split the structure
    in two; 2) if `bond_order` is not `None`, skip matches having the attachment
    bond of different order.

    :param st: structure to analyze
    :type st: `schrodinger.structure.Structure`

    :param smarts: SMARTS pattern describing the functional group
    :type smarts: str

    :param leaving_atom_pos: position of the leaving atom in the
            SMARTS pattern (1-based)
    :type leaving_atom_pos: index

    :param staying_atom_pos: position of the attachment atom in the
            SMARTS pattern (1-based)
    :type staying_atom_pos: index

    :param bond_order: If `None` (default), has no effect. Otherwise
        skip matches having R-group attachment bond of different order.
    :type bond_order: int or NoneType

    :return: list of tuples (attachment atom, list of R-group atom indexes). If
            no matches satisfied all the requirements, the list may be empty.
            May include duplicate R-groups (R-group in this context is the
            substructure made of the newly found R-group atoms).
    :rtype: list
    """
    seen = set()
    matches = []
    for match in analyze.evaluate_smarts_canvas(st, smarts, uniqueFilter=False):
        leaving_atom_idx = match[leaving_atom_pos - 1]
        staying_atom_idx = match[staying_atom_pos - 1]
        if (leaving_atom_idx, staying_atom_idx) in seen:
            continue
        else:
            seen.add((leaving_atom_idx, staying_atom_idx))
        try:
            leaving_atoms_idx = st.getMovingAtoms(staying_atom_idx,
                                                  leaving_atom_idx)
            rgroup_atoms_idx = set(st.getAtomIndices()) - leaving_atoms_idx
        except ValueError:
            continue  # bond is in a ring, so skip
        leaving_atom = st.atom[leaving_atom_idx]
        bond = next(
            b for b in leaving_atom.bond if b.atom2.index in rgroup_atoms_idx)
        if bond_order is not None and bond.order != bond_order:
            continue  # undesired bond order
        matches.append((leaving_atom_idx, sorted(rgroup_atoms_idx)))
    return matches


[docs]def find_staying_atom(st, leaving_atom):
    """
    Given a picked "leaving" atom, determine which of the atoms it is bonded
    to is part of the larger molecule - the "staying" atom. All other atoms
    bound to the leaving atom are considered to be part of the leaving group.

    :param leaving_atom: atom which defines the start of the leaving group
    :type leaving_atom: `schrodinger.structure._StructureAtom`

    :return: "staying atom": the core atom bound to the leaving atom
    :rtype leaving_atom: `schrodinger.structure._StructureAtom`
    """
    num_bonds = len(leaving_atom.bond)
    if num_bonds == 0:
        raise ValueError("Selected atom %i is not bound to any other atom" %
                         leaving_atom.index)

    # For all other errors, there is a standard error text:
    err_msg = "Not possible to detect the leaving group from selected atom %i." % leaving_atom.index

    moving_atoms_dict = {}
    for bond in leaving_atom.bond:
        at2 = bond.atom2
        try:
            moving_atoms_dict[at2] = len(st.getMovingAtoms(at2, leaving_atom))
        except ValueError:
            # these atoms are in a ring; so whole molecule would move
            pass

    if len(moving_atoms_dict) == 0:
        # All atoms are in rings; freezing either will cause the
        # whole molecule to move.
        raise ValueError(err_msg)

    # Size of the smallest group of bonded atoms:
    smallest_moving = min(moving_atoms_dict.values())

    if smallest_moving > len(leaving_atom.getMolecule().atom) / 2:
        # The wrong "side" of the only non-ring bond was selected. The
        # leaving group should never be bigger than 1/2 of the molecule.
        raise ValueError(err_msg)

    # Ideal atom(s) to freeze:
    best_bonded_atoms = [
        b for b in moving_atoms_dict if moving_atoms_dict[b] == smallest_moving
    ]

    if len(best_bonded_atoms) > 1:
        # 2 (or more) neighbors, if fixed, would yield the same sized
        # leaving group.
        # e.g. carbon in a methane, or middle carbon in a propane.
        raise ValueError(err_msg)

    return best_bonded_atoms[0]


[docs]def get_dummy_filter():
    """
    Return a filter which has as criteria all the descriptors that can be
    computed by this module, along with their suggested default limiters (ranges).

    :rtype: `schrodinger.ui.qt.filter_dialog_dir.filter_core.Filter`
    """
    criteria = []
    for name in sorted(DESCRIPTOR_DICT):
        _, limiter = DESCRIPTOR_DICT[name]
        criteria.append(filter_core.Criterion(name, limiter))
    return filter_core.Filter('dummy filters', criteria)


[docs]def add_descriptors(st, filter_obj):
    """
    Add the descriptors required by a filter to a given Structure.

    :type st: `schrodinger.structure.Structure`
    :type filter_obj: `schrodinger.ui.qt.filter_dialog_dir.filter_core.Filter`
    """
    for c in filter_obj.criteria:
        try:
            func = DESCRIPTOR_DICT[c.prop][0]
        except KeyError:
            # We'll ignore properties that we don't know how to compute,
            # which might be the case for properties that are supposed to be
            # already present in the structure.
            continue
        st.property[c.prop] = func(st)


[docs]def list_to_csv(fields):
    """
    Convert a list into a CSV string representation.

    :param fields: list to convert
    :type fields: list

    :rtype: str
    """
    dialect = csv.excel()
    dialect.lineterminator = ''
    buf = io.StringIO()
    writer = csv.writer(buf, dialect=dialect)
    writer.writerow(fields)
    return buf.getvalue()


[docs]def add_amide_constraints(st, frozen_set):
    """
    For amide bonds which have one atom frozen and the other not, add the
    necessary ct properties to tell fast3d to constrain the amides to the trans
    conformation.

    :param st: structure, to be modified in place
    :type st: `schrodinger.structure.Structure`

    :param frozen_set: set of frozen atoms, by atom index
    :type frozen_set: set of int

    :return: names of properties that were added
    :rtype: list of str
    """
    amide_bonds = analyze.evaluate_smarts_canvas(st,
                                                 r'O=C[NH1X3][#6]',
                                                 uniqueFilter=True)
    index = get_last_EZ_property_index(st)
    props = []
    for i, j, k, l in amide_bonds:
        if len({j, k} & frozen_set) == 1:
            index += 1
            prop = 's_st_EZ_%d' % index
            props.append(prop)
            # Making the O=CNC dihedral Z makes it a "trans amide".
            st.property[prop] = '%d_%d_%d_%d_Z' % (i, j, k, l)
    return props


[docs]def get_metals_and_neighbors(st):
    '''
    Returns indices of metal atoms, and atoms bonded to them.

    :param st: Structure.
    :type st:

    :return: Set of atom indices in `st`.
    :rtype: set(int)
    '''

    indices = set()

    for i in analyze.evaluate_asl(st, 'metals'):
        indices.add(i)
        for neighbor in st.atom[i].bonded_atoms:
            indices.add(int(neighbor))

    return indices


[docs]def get_last_EZ_property_index(st):
    """
    Return the maximum index of the s_st_EZ_<index> properties of st.
    If there are no such properties, return 0.

    :type st: `schrodinger.structure.Structure`

    :rtype: int
    """
    prefix = 's_st_EZ_'
    indexes = [
        int(p.split(prefix)[1]) for p in st.property if p.startswith(prefix)
    ]
    return max(indexes) if indexes else 0


[docs]def get_sources_from_r_labels(st, iters, prop='i_rdkit__MolFileRLabel'):
    """
    Given a Structure and a list of iterables, return the "sources" data
    structure needed by RGroupEnumerator. The structure must have (some) atoms
    with the specified property; the values of this property must be in the
    range [1, len(iters)] and all the values in that range must be represented
    at least once. If this condition is not met, raise a ValueError.

    :param st: core Structure
    :type st: schrodinger.structure.Structure

    :param iters: list of iterables of structures
    :type iters: list

    :param prop: name of the atom property holding the R-group labels. The
        default is what comes from reading an SD file with "M  RGP" fields using
        RDKit.
    :type prop: str

    :return: sources data structure. See RGroupEnumerator for details.
    :rtype: list of list
    """
    n = len(iters)
    source_dict = collections.defaultdict(list)
    for atom in st.atom:
        r_index = atom.property.get(prop)
        if r_index is not None:
            source_dict[r_index].append(atom.index)
    expected_keys = set(range(1, n + 1))
    got_keys = set(source_dict.keys())
    if got_keys != expected_keys:
        raise ValueError('Structure does not have the necessary R-group '
                         'atom properties; expected {}, got {}'.format(
                             expected_keys, got_keys))
    return [[iters[i]] + source_dict[i + 1] for i in range(n)]


[docs]def convert_attachment_point(struct, bond_order=None):
    """
    Converts attachment point from methyl to dummy atom. If r-group fragment
    is 'Null' returns False, otherwise returns True. Null r-group has atom
    with atomic number -2 and growname 'rpc1'.

    :param struct: structure object
    :type struct: `structure.Structure`

    :param bond_order: If `None`, has no effect. Otherwise
        return False if attachment bond is of different order.
    :type bond_order: int or NoneType

    :return: True if conversion succeeded and False otherwise.
    :rtype: bool
    """

    # Check whether r-group fragment is a 'Null' group or has more than
    # one attachment point. The later case is identified by the
    # presence of s_cg_attachment property.
    for atom in struct.atom:
        if atom.growname == 'rpc1' and \
           atom.atomic_number == DUMMY_ATOMIC_NUMBER:
            return False
        if 's_cg_attachment' in atom.property:
            return False

    for atom in struct.atom:
        if atom.growname == "rpc2":
            delete_atoms = [
                a for a in atom.bonded_atoms if not a.growname == "rpc1"
            ]
            struct.deleteAtoms(delete_atoms)
            atom.atomic_number = DUMMY_ATOMIC_NUMBER
            try:
                _, order = _find_neighbor(atom)
            except ValueError:
                return False
            if bond_order is not None and order != bond_order:
                return False
            return True
    # No conversion was needed. Here we need to check that fragment contains
    # attachment point (a dummy atom).
    return check_attachment_point(struct, bond_order)


[docs]def get_attachment_point(st):
    """
    Identifies attachment point (dummy atom with a single neighbor)
    in the provided structure.

    :param st: Structure
    :type st: `schrodinger.structure.Structure`

    :return: Index of the dummy atom and order of the bond that
        joins the dummy to the rest.
    :rtype: (int, int)
    """

    try:
        dummy = _find_dummy(st)
        neighbor, order = _find_neighbor(st.atom[dummy])
        return dummy, order
    except ValueError:
        return (None, None)


[docs]def check_attachment_point(struct, bond_order=None):
    """
    Checks that provided structure contains attachment point.

    :param bond_order: If `None`, has no effect. Otherwise
        return False if attachment bond is of different order.
    :type bond_order: int or NoneType

    :return: True if attachment point was found and False otherwise.
    :rtype: bool
    """

    dummy, order = get_attachment_point(struct)
    if dummy is not None:
        if bond_order is not None and order != bond_order:
            return False
        else:
            return True
    return False


[docs]def create_fragment_structure(st, rgroup_data):
    """
    Creates r-group fragment structures from a given structure and a list
    of atoms that should be included in the r-group.

    :param st: structure object
    :type st: `structure.Structure`

    :param rgroups: r-group data that contains index of attachment atom and
        indices of R-group fragment atoms.
    :type rgroups: tuple

    :return: fragment structure
    :rtype: `structure.Structure`
    """
    attach_atom, rgroup_indices = rgroup_data
    frag_st = st.copy()
    frag_st.atom[attach_atom].atomic_number = DUMMY_ATOMIC_NUMBER
    # r-group structure should include an attachment point, so we
    # are adding it here.
    rgroup_indices.append(attach_atom)
    frag_st = frag_st.extract(rgroup_indices, copy_props=True)

    return frag_st


def _concentrationFromFraction(num_attachment_points, conc):
    """
    Round user-specified concentration to the nearest integer. This function
    exists to convert legacy fractional concentrations to the much less
    ambiguous integer concentrations (e.g., 0.66 vs. 2/3).

    Calling this function will always generate a warning, which will hopefully
    protect users from using this function unintentionally.

    :param num_atoms: number of attachment points in group
    :type num_atoms: int

    :param conc: fractional concentration between 0 and 1
    :type: conc: str

    :return: the number of attachment points that change at once
    :rtype: int
    """
    integer_conc = int(round(num_attachment_points * conc))
    logger.warning(f"WARNING: Rounding {conc} to {integer_conc}/"
                   f"{num_attachment_points}. This functionality can be "
                   "imprecise and is not recommended.")
    return integer_conc


# Preserve old namespaces for backwards compatibility
RgroupEnumerator = RGroupEnumerator
RgroupError = RGroupError