Source code for schrodinger.protein.biounit

"""
Expand structure into biological units

A structure generated from crystal data (typically .pdb or .cif) may
represent/include multiple "Biological Units" (or biomolecules).

A biological unit is a (putatively) biologically active structure that is not
explicitely encoded in the file. It may include fewer chains than the file,
or it may include symmetry adjusted copies of one or more chains. Copies are
given a new chain name.

Example usage::

    st = structure.Structure.read('1wpl.pdb')
    biomolecules = biounits_from_structure(st)
    for biomolecule_transformation in biomolecules:
        biomolecule_st, _ = apply_biounit(st, biomolecule_transformation)

"""
import collections
import re
from itertools import chain

import numpy as np

from schrodinger import structure
from schrodinger.protein import seqres
from schrodinger.structutils.analyze import evaluate_asl
from schrodinger.structutils.measure import get_close_atoms

CHAIN_NAMES = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
SYM_TOLERANCE = 0.3
BB_ORIG_CHAIN_PROP = 's_bb_asu_chains'


class _Transformation:

    def __init__(self, transform_text):
        transform = [v.split() for v in transform_text.split(';')]
        transform = np.array(transform)
        transform = transform.astype(float)

        self.rot = transform[:, :3]
        self.trans = transform[:, 3]


[docs]class BiounitOverrunError(Exception):
    pass


[docs]def biounits_from_structure(st):
    """
    Find the biomolecules (aka biounits) stored in the properties of a Structure

    Each biomolecule is represented as a dictionary. The keys are groups of
    chains, and the values are the transformations to be applied to those
    chains.

    :param st: Structure from which to glean biounits
    :type st: schrodinger.structure.Structure

    :rtype: list[dict(frozenset, list[_Transformation])]
    :return: Mapping between chain ID sets and a list of _Transformation
        containing the rotation and translation matrices for those chains

    """

    # Each biomolecule has chains and transformations that need to be applied
    # to those chains.

    # Property name format:
    #                             Biomolecule #
    #                                  |     chain group # within biomol
    #                                  |           |  transformation # for these chains
    #                                  |           |        |
    # s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_1_Chains |
    # s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_1_Matrix_1
    # s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_2_Chains
    # s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_2_Matrix_1
    # s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_2_Matrix_2

    _chain_re = re.compile(
        r's_pdb_PDB_REMARK_350_Biomolecule_(\d+)_Transform_(\d+)_Chains')
    _matrix_re = re.compile(
        r's_pdb_PDB_REMARK_350_Biomolecule_(\d+)_Transform_(\d+)_Matrix_(\d+)')

    transforms_per_biomolecule = collections.defaultdict(set)
    id_to_chains = {}

    transformations = collections.defaultdict(list)

    for p in st.property:
        if not p.startswith('s_pdb_PDB_REMARK_350_Biomolecule_'):
            continue
        chain_match = _chain_re.search(p)
        if chain_match:
            biomolecule_id = int(chain_match.group(1))
            transform_id = int(chain_match.group(2))

            chain_text = st.property[p]
            chains = frozenset(chain_text.split(', '))
            id_to_chains[(biomolecule_id, transform_id)] = chains
            transforms_per_biomolecule[biomolecule_id].add(transform_id)

        else:
            matrix_match = _matrix_re.search(p)
            if matrix_match is None:
                continue
            biomolecule_id = int(matrix_match.group(1))
            transform_id = int(matrix_match.group(2))
            matrix_id = int(matrix_match.group(3))

            transform = _Transformation(st.property[p])

            transformations[(biomolecule_id, transform_id)].append(
                (matrix_id, transform))

    biomolecules = []
    for biomolecule_id in sorted(transforms_per_biomolecule):
        biomolecule = collections.defaultdict(list)
        for transform_id in sorted(transforms_per_biomolecule[biomolecule_id]):
            k = (biomolecule_id, transform_id)
            chains = id_to_chains[k]
            for _, transform in sorted(transformations[k]):
                biomolecule[chains].append(transform)
        biomolecules.append(dict(biomolecule))

    # a list of biomolecules, sorted by ID.
    return biomolecules


def _remove_symmetry_atoms(ct):
    sym_atoms = []
    for at1, at2 in get_close_atoms(ct, SYM_TOLERANCE):
        atom1 = ct.atom[at1]
        atom2 = ct.atom[at2]
        if atom1.pdbname == atom2.pdbname and atom1.resnum == atom2.resnum and atom1.inscode == atom2.inscode:
            sym_atoms.append(atom2)
    if sym_atoms:
        ct.deleteAtoms(
            evaluate_asl(
                ct, 'fillres atom.entrynum {}'.format(','.join(
                    str(int(at)) for at in sym_atoms))))


_DONT_MOVE = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]], dtype=float)


def _transform(st, transf):
    """
    If the supplied transform is a change, apply it to the structure.
    """
    rotation = transf.rot
    translation = transf.trans
    needs_rotation = not np.allclose(rotation, _DONT_MOVE[:, :3])
    needs_translation = not np.allclose(translation, _DONT_MOVE[:, 3])

    if not needs_rotation and not needs_translation:
        return

    xyz = st.getXYZ()
    if needs_rotation:
        xyz = np.inner(xyz, rotation)
    if needs_translation:
        xyz += translation
    # Need to save alt_xyz before setting xyz to avoid overwrite.
    alt_xyz_map = {}
    for atom in st.atom:
        alt_xyz_map[atom.index] = atom.alt_xyz
    st.setXYZ(xyz)
    # Make sure alternate positions are also transformed.
    for atom_idx in alt_xyz_map:
        atom = st.atom[atom_idx]
        alt_xyz = alt_xyz_map[atom_idx]
        if alt_xyz is not None:
            new_alt_xyz = np.inner(np.array(alt_xyz), transf.rot) + transf.trans
            atom.alt_xyz = new_alt_xyz


[docs]def apply_biounit(original_st, biounit):
    """
    Apply biounit transformations to the given structure.

    If a chain is duplicated, the duplicate is given a new chain name in
    alphabetical order.

    :param original_st: Structure to transform
    :type original_st: schrodinger.structure.Structure

    :param biounit: Biounit data (produced by `biounits_from_structure`) mapping
        between chain ID sets and a list of _Transformation containing the
        rotation and translation matrices
    :type biounit: dict(frozenset, list[_Transformation])

    :return: A new Structure for the biounit applied to the original_st
    :rtype: schrodinger.structure.Structure
    """

    all_chains = {c.name for c in original_st.chain}
    used_chains = set(chain.from_iterable(biounit))

    # Could probably improve this by starting from an extract instead of an
    # empty structure. Most transformations include only one set of chains.
    st = structure.create_new_structure(0)
    st.property = original_st.property
    # Keep track of SEQRES block for each chain, so missing loops can be build
    # and detected in the prepwizard.
    original_seqres = seqres.get_seqres(original_st)
    if original_seqres is None:
        original_seqres = {}
    new_seqres = {}

    for chains, transformations in biounit.items():
        chains &= all_chains
        if not chains:
            continue
        chain_atoms = chain.from_iterable(
            (original_st.chain[c].getAtomIndices() for c in chains))
        chain_atoms = sorted(chain_atoms)

        chain_st = original_st.extract(chain_atoms, copy_props=True)
        if len(transformations) == 1:
            _transform(chain_st, transformations[0])
            st.extend(chain_st)
            for c in (chains & set(original_seqres)):
                new_seqres[c] = original_seqres[c]
        elif len(transformations) > 1:
            # If there are multiple transformations, we should cache the
            # starting coordinates. We'll also need to update the chain names.
            chains = list(chain_st.chain)

            original_xyz = chain_st.getXYZ()
            transformations = iter(transformations)

            t = next(transformations)
            _transform(chain_st, t)
            st.extend(chain_st)

            for t in transformations:
                chain_st.setXYZ(original_xyz)
                _transform(chain_st, t)
                for c in chains:
                    try:
                        new_name = next(
                            (c for c in CHAIN_NAMES if c not in used_chains))
                    except StopIteration:
                        raise BiounitOverrunError(
                            'Unable to assign unique chain name')
                    if c.name in original_seqres:
                        new_seqres[c.name] = original_seqres[c.name]
                        new_seqres[new_name] = original_seqres[c.name]
                    c.name = new_name
                    used_chains.add(new_name)
                st.extend(chain_st)

    _remove_symmetry_atoms(st)
    seqres.set_seqres(st, new_seqres)
    return st


if __name__ == '__main__':
    # $SCHRODINGER/run python3 -m schrodinger.protein.biounit 2HHB.pdb biounits.mae
    import sys

    # yapf: disable
    with structure.StructureReader(sys.argv[1]) as reader, structure.StructureWriter(sys.argv[2]) as writer:
        for st in reader:
            for biounit in biounits_from_structure(st):
                bio_st = apply_biounit(st, biounit)
                writer.append(bio_st)