"""
Expand structure into biological units
A structure generated from crystal data (typically .pdb or .cif) may
represent/include multiple "Biological Units" (or biomolecules).
A biological unit is a (putatively) biologically active structure that is not
explicitely encoded in the file. It may include fewer chains than the file,
or it may include symmetry adjusted copies of one or more chains. Copies are
given a new chain name.
Example usage::
st = structure.Structure.read('1wpl.pdb')
biomolecules = biounits_from_structure(st)
for biomolecule_transformation in biomolecules:
biomolecule_st, _ = apply_biounit(st, biomolecule_transformation)
"""
import collections
import re
from itertools import chain
import numpy as np
from schrodinger import structure
from schrodinger.protein import seqres
from schrodinger.structutils.analyze import evaluate_asl
from schrodinger.structutils.measure import get_close_atoms
CHAIN_NAMES = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
SYM_TOLERANCE = 0.3
BB_ORIG_CHAIN_PROP = 's_bb_asu_chains'
class _Transformation:
def __init__(self, transform_text):
transform = [v.split() for v in transform_text.split(';')]
transform = np.array(transform)
transform = transform.astype(float)
self.rot = transform[:, :3]
self.trans = transform[:, 3]
[docs]class BiounitOverrunError(Exception):
pass
[docs]def biounits_from_structure(st):
"""
Find the biomolecules (aka biounits) stored in the properties of a Structure
Each biomolecule is represented as a dictionary. The keys are groups of
chains, and the values are the transformations to be applied to those
chains.
:param st: Structure from which to glean biounits
:type st: schrodinger.structure.Structure
:rtype: list[dict(frozenset, list[_Transformation])]
:return: Mapping between chain ID sets and a list of _Transformation
containing the rotation and translation matrices for those chains
"""
# Each biomolecule has chains and transformations that need to be applied
# to those chains.
# Property name format:
# Biomolecule #
# | chain group # within biomol
# | | transformation # for these chains
# | | |
# s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_1_Chains |
# s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_1_Matrix_1
# s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_2_Chains
# s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_2_Matrix_1
# s_pdb_PDB_REMARK_350_Biomolecule_1_Transform_2_Matrix_2
_chain_re = re.compile(
r's_pdb_PDB_REMARK_350_Biomolecule_(\d+)_Transform_(\d+)_Chains')
_matrix_re = re.compile(
r's_pdb_PDB_REMARK_350_Biomolecule_(\d+)_Transform_(\d+)_Matrix_(\d+)')
transforms_per_biomolecule = collections.defaultdict(set)
id_to_chains = {}
transformations = collections.defaultdict(list)
for p in st.property:
if not p.startswith('s_pdb_PDB_REMARK_350_Biomolecule_'):
continue
chain_match = _chain_re.search(p)
if chain_match:
biomolecule_id = int(chain_match.group(1))
transform_id = int(chain_match.group(2))
chain_text = st.property[p]
chains = frozenset(chain_text.split(', '))
id_to_chains[(biomolecule_id, transform_id)] = chains
transforms_per_biomolecule[biomolecule_id].add(transform_id)
else:
matrix_match = _matrix_re.search(p)
if matrix_match is None:
continue
biomolecule_id = int(matrix_match.group(1))
transform_id = int(matrix_match.group(2))
matrix_id = int(matrix_match.group(3))
transform = _Transformation(st.property[p])
transformations[(biomolecule_id, transform_id)].append(
(matrix_id, transform))
biomolecules = []
for biomolecule_id in sorted(transforms_per_biomolecule):
biomolecule = collections.defaultdict(list)
for transform_id in sorted(transforms_per_biomolecule[biomolecule_id]):
k = (biomolecule_id, transform_id)
chains = id_to_chains[k]
for _, transform in sorted(transformations[k]):
biomolecule[chains].append(transform)
biomolecules.append(dict(biomolecule))
# a list of biomolecules, sorted by ID.
return biomolecules
def _remove_symmetry_atoms(ct):
sym_atoms = []
for at1, at2 in get_close_atoms(ct, SYM_TOLERANCE):
atom1 = ct.atom[at1]
atom2 = ct.atom[at2]
if atom1.pdbname == atom2.pdbname and atom1.resnum == atom2.resnum and atom1.inscode == atom2.inscode:
sym_atoms.append(atom2)
if sym_atoms:
ct.deleteAtoms(
evaluate_asl(
ct, 'fillres atom.entrynum {}'.format(','.join(
str(int(at)) for at in sym_atoms))))
_DONT_MOVE = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]], dtype=float)
def _transform(st, transf):
"""
If the supplied transform is a change, apply it to the structure.
"""
rotation = transf.rot
translation = transf.trans
needs_rotation = not np.allclose(rotation, _DONT_MOVE[:, :3])
needs_translation = not np.allclose(translation, _DONT_MOVE[:, 3])
if not needs_rotation and not needs_translation:
return
xyz = st.getXYZ()
if needs_rotation:
xyz = np.inner(xyz, rotation)
if needs_translation:
xyz += translation
# Need to save alt_xyz before setting xyz to avoid overwrite.
alt_xyz_map = {}
for atom in st.atom:
alt_xyz_map[atom.index] = atom.alt_xyz
st.setXYZ(xyz)
# Make sure alternate positions are also transformed.
for atom_idx in alt_xyz_map:
atom = st.atom[atom_idx]
alt_xyz = alt_xyz_map[atom_idx]
if alt_xyz is not None:
new_alt_xyz = np.inner(np.array(alt_xyz), transf.rot) + transf.trans
atom.alt_xyz = new_alt_xyz
[docs]def apply_biounit(original_st, biounit):
"""
Apply biounit transformations to the given structure.
If a chain is duplicated, the duplicate is given a new chain name in
alphabetical order.
:param original_st: Structure to transform
:type original_st: schrodinger.structure.Structure
:param biounit: Biounit data (produced by `biounits_from_structure`) mapping
between chain ID sets and a list of _Transformation containing the
rotation and translation matrices
:type biounit: dict(frozenset, list[_Transformation])
:return: A new Structure for the biounit applied to the original_st
:rtype: schrodinger.structure.Structure
"""
all_chains = {c.name for c in original_st.chain}
used_chains = set(chain.from_iterable(biounit))
# Could probably improve this by starting from an extract instead of an
# empty structure. Most transformations include only one set of chains.
st = structure.create_new_structure(0)
st.property = original_st.property
# Keep track of SEQRES block for each chain, so missing loops can be build
# and detected in the prepwizard.
original_seqres = seqres.get_seqres(original_st)
if original_seqres is None:
original_seqres = {}
new_seqres = {}
for chains, transformations in biounit.items():
chains &= all_chains
if not chains:
continue
chain_atoms = chain.from_iterable(
(original_st.chain[c].getAtomIndices() for c in chains))
chain_atoms = sorted(chain_atoms)
chain_st = original_st.extract(chain_atoms, copy_props=True)
if len(transformations) == 1:
_transform(chain_st, transformations[0])
st.extend(chain_st)
for c in (chains & set(original_seqres)):
new_seqres[c] = original_seqres[c]
elif len(transformations) > 1:
# If there are multiple transformations, we should cache the
# starting coordinates. We'll also need to update the chain names.
chains = list(chain_st.chain)
original_xyz = chain_st.getXYZ()
transformations = iter(transformations)
t = next(transformations)
_transform(chain_st, t)
st.extend(chain_st)
for t in transformations:
chain_st.setXYZ(original_xyz)
_transform(chain_st, t)
for c in chains:
try:
new_name = next(
(c for c in CHAIN_NAMES if c not in used_chains))
except StopIteration:
raise BiounitOverrunError(
'Unable to assign unique chain name')
if c.name in original_seqres:
new_seqres[c.name] = original_seqres[c.name]
new_seqres[new_name] = original_seqres[c.name]
c.name = new_name
used_chains.add(new_name)
st.extend(chain_st)
_remove_symmetry_atoms(st)
seqres.set_seqres(st, new_seqres)
return st
if __name__ == '__main__':
# $SCHRODINGER/run python3 -m schrodinger.protein.biounit 2HHB.pdb biounits.mae
import sys
# yapf: disable
with structure.StructureReader(sys.argv[1]) as reader, structure.StructureWriter(sys.argv[2]) as writer:
for st in reader:
for biounit in biounits_from_structure(st):
bio_st = apply_biounit(st, biounit)
writer.append(bio_st)