import enum
from dataclasses import dataclass
from dataclasses import field
from functools import lru_cache
from functools import partial
from typing import Dict
from typing import Optional
from typing import Tuple
from schrodinger import structure
from schrodinger.protein import annotation
from schrodinger.structure import RESIDUE_MAP_3_TO_1_LETTER
try:
from schrodinger.application.prime.packages import antibody
except ImportError:
antibody = None
# Setup antibody hierarchy
"""
Hierarchy:
- Fab
- VL
- L1
- L2
- L3
- LFR1
- LFR2
- LFR3
- LFR4
- CL
- VH
- H1
- H2
- H3
- HFR1
- HFR2
- HFR3
- HFR4
- CH1
- Hinge
- Fc
- CH2
- CH3
- CH4
special cases denote DVD heavy and DVD light chains that have 2 VL / VH
regions - these are distinguished by suffix '_1' e.g.
VL_1
- LFR1_1...LFR4_1
- L1_1...L3_1
(there should be no CL_1)
VH_1
- HFR1...HFR4_1
- H1_1...H3_1
(there should be no CH1_1)
"""
REGION_CHILDREN = {}
## Set up REGION_CHILDREN
REGION_CHILDREN['Fab'] = ['CL', 'VL', 'VH', 'CH1']
REGION_CHILDREN['Fc'] = ['CH2', 'CH3']
REGION_CHILDREN['VL'] = ['L1', 'L2', 'L3', 'LFR1', 'LFR2', 'LFR3', 'LFR4']
REGION_CHILDREN['VH'] = ['H1', 'H2', 'H3', 'HFR1', 'HFR2', 'HFR3', 'HFR4']
### Set up indexed units (e.g. L1_0, L1_1, up through _2
for grandparent in ['VL', 'VH']:
for parent in REGION_CHILDREN[grandparent]:
REGION_CHILDREN[parent] = [
'{}_{}'.format(parent, index) for index in range(3)
]
## Populate REGION_DESCENDENTS for convenience
def _get_region_descendents(parent):
if parent in REGION_CHILDREN:
REGION_DESCENDENTS = REGION_CHILDREN[parent]
for child in REGION_CHILDREN[parent]:
REGION_DESCENDENTS += _get_region_descendents(child)
return REGION_DESCENDENTS
return []
REGION_DESCENDENTS = {}
for parent in list(REGION_CHILDREN):
REGION_DESCENDENTS[parent] = _get_region_descendents(parent)
# End hierarchy data
# the lists here are based on Kabat numbering, the lists in other numbering schemes
# can be converted by calling "reslist_between_schemes"
# TODO: CSR may vary depending on L1/L3 loop length/type, right now just a union set
VERNIER_RESIDUES = [
'H:2', 'H:27', 'H:28', 'H:29', 'H:30', 'H:47', 'H:48', 'H:49', 'H:67',
'H:69', 'H:71', 'H:73', 'H:78', 'H:93', 'H:94', 'H:103', 'L:2', 'L:4',
'L:35', 'L:36', 'L:46', 'L:47', 'L:48', 'L:49', 'L:64', 'L:66', 'L:68',
'L:69', 'L:71', 'L:98'
]
CANONICAL_STRUCTURAL_RESIDUES = [
'H:24', 'H:26', 'H:29', 'H:32', 'H:34', 'H:71', 'L:2', 'L:25', 'L:27B',
'L:28', 'L:29', 'L:30', 'H:33', 'L:71', 'L:48', 'L:64', 'L:90', 'L:94',
'L:95'
]
INTERFACE_RESIDUES = [
'H:35', 'H:37', 'H:39', 'H:45', 'H:91', 'L:34', 'L:38', 'L:44', 'L:87'
]
# Chothia: H1/26-32,52-56,95-102,L/24-34,50-56,89-97
# ignore terminal residues 1-4 on H/L
# include residues at VH/VL interface and canonical conformation determining residues
# VERNIER_RESIDUES = ['H:24','H:25','H:33','H:34','H:50','H:51','H:57','H:58',
# 'H:69','H:71','H:73','H:74','H:75','H:76','H:93','H:94','H:103',
# 'L:35','L:36','L:46','L:47','L:48','L:49','L:58',
# 'L:64','L:66','L:68','L:69','L:70','L:71','L:72','L:98','L:99',
# 'H:37','H:48','H:49']
#
# the list in Kabat numbering from JBC,2008,283,2,1156-1166
# Kabat: H1/30-35B,H2/50-65,H3/95-102,L/24-34,50-56,89-97
# VERNIER_RESIDUES = ['H:2','H:27','H:28','H:29','H:30','H:47','H:48','H:49',
# 'H:67','H:69','H:71','H:73','H:78','H:93','H:94','H:103',
# 'L:2','L:4','L:35','L:36','L:46','L:47','L:48','L:49',
# 'L:64','L:66','L:68','L:69','L:71','L:99']
# my change:
# VERNIER_RESIDUES = ['H:2','H:27','H:28','H:29','H:30','H:47','H:48','H:49',
# 'H:67','H:69','H:71','H:73','H:78','H:93','H:94','H:103',
# 'L:1','L:2','L:3','L:4','L:35','L:36','L:46','L:47','L:48','L:49','L:58',
# 'L:64','L:66','L:68','L:69','L:70','L:71','L:98','L:99']
# Reference sequences with maximum long CDR loops
_REF_SEQH = (
"QVQLVQSGAEVKKPGSSVKVSCKASGGTFGGSNYAINWVRQAPGQGLEWMGNIEPGGGGGGGGGGG"
"GGGGGGGGGGGGGGYFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCARYFMSGGG"
"GGGGGGGGGGGGGGGGGGGGYKHLSDYWGQGTLVTVSSASTK")
_REF_SEQL = (
"DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTGGGGGGGGGGGGGGGGGGGGRKNYLAWYQQKP"
"GQSPKLLIYWASTRESGVPDRFTGSGSGTDFTLTITSVQAEDLAVYYCKQSYNGGGGGGGGGGGGG"
"GGGGGGGGGGGGGGLRTFGGGTKLEIK")
[docs]class Properties:
"""
Class to store custom property names associated with antibody annotation.
"""
PROTEIN_CATEGORY = 'i_bioluminate_protein_category'
VERNIER_RES = 'b_bioluminate_vernier_residue'
CANONICAL_STRUCTURAL_RES = 'b_bioluminate_canonical_structural_residue'
INTERFACE_RES = 'b_bioluminate_interface_residue'
[docs]class SpecialResidueTypes(enum.Enum):
VERNIER = enum.auto()
CANONICAL_STRUCTURAL = enum.auto()
INTERFACE = enum.auto()
[docs]class ChainType(enum.Enum):
HEAVY = 'heavy'
LIGHT = 'light'
NONE = 'none'
[docs]@dataclass(frozen=True)
class SpecialResidueZone:
"""
Class to store mappings of universal (scheme-independent) alignment residue
indices to their corresponding Kabat-numbered residue IDs for both heavy and
light chains.
"""
res_type: SpecialResidueTypes
res_ids_by_idx_l: Dict[int, str] = field(default_factory=dict, init=False)
res_ids_by_idx_h: Dict[int, str] = field(default_factory=dict, init=False)
def __post_init__(self):
"""
Populate the attributes containing maps of universal indices to
Kabat-numbered residue IDs.
"""
res_ids = _KABAT_RES_IDS_BY_RES_TYPE[self.res_type]
seq_type_h, seq_type_l = _get_ref_seq_types()
for res_id_raw_str in res_ids:
if res_id_raw_str.startswith(_SEQ_TYPE_CHAIN_NAME_H):
seq_type = seq_type_h
univ_indices = seq_type.refidx_vh
res_ids_by_idx = self.res_ids_by_idx_h
elif res_id_raw_str.startswith(_SEQ_TYPE_CHAIN_NAME_L):
seq_type = seq_type_l
univ_indices = seq_type.refidx_vl
res_ids_by_idx = self.res_ids_by_idx_l
else:
msg = f'Residue ID {res_id_raw_str} has an unsupported chain name'
raise ValueError(msg)
# Adhere to SeqType formatting
res_id_str = res_id_raw_str.replace(':', '')
res_id_idx = seq_type.resid_list.index(res_id_str)
univ_idx = univ_indices[res_id_idx]
res_ids_by_idx[univ_idx] = res_id_raw_str
[docs] def getResidueIdsByIndex(self, chain_type: ChainType) -> Dict[int, str]:
if chain_type is ChainType.HEAVY:
return self.res_ids_by_idx_h
elif chain_type is ChainType.LIGHT:
return self.res_ids_by_idx_l
raise ValueError(f'Unsupported chain type {chain_type}')
[docs] def residueIsInZone(self, univ_idx: int, chain_type: ChainType) -> bool:
res_ids_by_idx = self.getResidueIdsByIndex(chain_type)
return res_ids_by_idx.get(univ_idx) is not None
_INTERNAL_NUMBERING_SCHEME = annotation.AntibodyCDRScheme.Kabat
_SEQ_TYPE_CHAIN_NAME_H = 'H'
_SEQ_TYPE_CHAIN_NAME_L = 'L'
# Special residue numbers + INS codes for heavy and light chains in the kabat
# numbering scheme
_KABAT_RES_IDS_BY_RES_TYPE = {
SpecialResidueTypes.VERNIER: VERNIER_RESIDUES,
SpecialResidueTypes.CANONICAL_STRUCTURAL: CANONICAL_STRUCTURAL_RESIDUES,
SpecialResidueTypes.INTERFACE: INTERFACE_RESIDUES
}
[docs]@lru_cache
def get_res_zones_by_res_type(
) -> Dict[SpecialResidueTypes, SpecialResidueZone]:
"""
Return a mapping of special residue type to its corresponding
SpecialResidueZone instance.
"""
return {
res_type: SpecialResidueZone(res_type)
for res_type, res_ids in _KABAT_RES_IDS_BY_RES_TYPE.items()
}
@lru_cache
def _get_ref_seq_types() -> Tuple['antibody.SeqType']:
"""
Return the heavy and light reference seq types, respectively.
"""
seq_type_h = antibody.SeqType(_REF_SEQH,
scheme=_INTERNAL_NUMBERING_SCHEME.name)
seq_type_l = antibody.SeqType(_REF_SEQL,
scheme=_INTERNAL_NUMBERING_SCHEME.name)
return seq_type_h, seq_type_l
[docs]def is_special_residue(res: structure._Residue, chain: structure._Chain,
res_type: SpecialResidueTypes, scheme: str) -> bool:
"""
Return whether the supplied residue is a "special" residue of the specified
type and numbering scheme.
:param res: A residue
:param chain: The chain that the residue is inside
:param res_type: The special residue type
:param scheme: The antibody numbering scheme
"""
seq_type = seq_type_from_chain(chain, scheme)
chain_type = chain_type_from_seq_type(seq_type)
query_idx_local = _get_local_res_index(res, chain)
query_idx_univ = _univ_res_idx_from_local(query_idx_local, seq_type,
chain_type)
res_zone = get_res_zones_by_res_type()[res_type]
return res_zone.residueIsInZone(query_idx_univ, chain_type)
def _univ_res_idx_from_local(local_res_idx: int, seq_type: 'antibody.SeqType',
chain_type: ChainType) -> Optional[int]:
"""
Return the internal universal residue index determined from the local index
relative to the supplied `seq_type` or None if the chain type is invalid or
if the local residue index is outside of the range supported by our
reference sequences.
:param local_res_idx: The index of a given residue inside of its
corresponding chain.
:param seq_type: the SeqType instance for the sequence from which the local
residue index is derived.
"""
if chain_type is ChainType.HEAVY:
univ_indices = seq_type.refidx_vh
elif chain_type is ChainType.LIGHT:
univ_indices = seq_type.refidx_vl
else:
return None
try:
return univ_indices[local_res_idx]
except IndexError:
# Query seq contains more residues than the reference sequence supports
return None
def _get_local_res_index(res: structure._Residue,
chain: structure._Chain) -> int:
"""
Return the "local" residue index i.e. the index of the given residue inside
of the supplied chain's residues.
:param res: A residue
:param chain: A chain that should contain the residue.
"""
local_res_idx = None
# Start at 1 because residues are 1-indexed
for idx, chain_res in enumerate(
structure.get_residues_by_connectivity(chain), start=1):
if res == chain_res:
local_res_idx = idx
break
if local_res_idx is None:
raise ValueError('Expected the supplied residue to be a member of the '
'supplied chain, but it is not.')
return local_res_idx
[docs]@lru_cache
def seq_type_from_chain(chain: structure._Chain,
scheme: str) -> 'antibody.SeqType':
"""
Return a `SeqType` instance derived from `chain`. Cached due to `SeqType`
instantiation.
"""
seq = seq_from_chain(chain)
return antibody.SeqType(seq, scheme=scheme)
[docs]@lru_cache
def chain_type_from_seq_type(seq_type: 'antibody.SeqType') -> ChainType:
"""
Return the chain type corresponding to `seq_type`. Cached because this is
expected to be called many times with identical inputs e.g. when checking
if all residues of a given chain are in any of the special residue zones.
"""
if seq_type.type.startswith(ChainType.HEAVY.value):
return ChainType.HEAVY
elif seq_type.type.startswith(ChainType.LIGHT.value):
return ChainType.LIGHT
return ChainType.NONE
[docs]def seq_from_chain(chain: structure._Chain) -> str:
res_codes_3l = [
res_code for res_code in structure.get_residues_by_connectivity(chain)
]
res_codes_1l = [
RESIDUE_MAP_3_TO_1_LETTER.get(res_code.pdbres.strip(), 'X')
for res_code in res_codes_3l
]
return "".join(res_codes_1l)
[docs]def reslist_between_schemes(reslist1, scheme1, scheme2):
"""
Given a list of residue IDs in scheme1, return the corresponding residue ID list in scheme2
:type reslist1: list
:param: a list of residue IDs, must have H or L as chain name, e.g. [L:21, H:35B]
:type scheme1: string
:param: one of the supported numbering schemes
:type scheme2: string
:param: one of the supported numbering schemes
:rtype: reslist2
:return: a list of new residue IDs
"""
# two reference sequences with maximum long CDR loops
seqh = "QVQLVQSGAEVKKPGSSVKVSCKASGGTFGGSNYAINWVRQAPGQGLEWMGNIEPGGGGGGGGGGGGGGGGGGGGGGGGGYFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCARYFMSGGGGGGGGGGGGGGGGGGGGGGGYKHLSDYWGQGTLVTVSSASTK"
seql = "DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTGGGGGGGGGGGGGGGGGGGGRKNYLAWYQQKPGQSPKLLIYWASTRESGVPDRFTGSGSGTDFTLTITSVQAEDLAVYYCKQSYNGGGGGGGGGGGGGGGGGGGGGGGGGGGLRTFGGGTKLEIK"
hseqtype1 = antibody.SeqType(seqh, scheme1)
hseqtype2 = antibody.SeqType(seqh, scheme2)
lseqtype1 = antibody.SeqType(seql, scheme1)
lseqtype2 = antibody.SeqType(seql, scheme2)
reslist2 = []
for res in reslist1:
resid = res.replace(':', '') # note that residue IDs have ":" here
if resid[0] == 'H':
seqtype1 = hseqtype1
seqtype2 = hseqtype2
elif resid[0] == 'L':
seqtype1 = lseqtype1
seqtype2 = lseqtype2
else:
continue
try:
idx = seqtype1.resid_list.index(resid) # resid has no ":" here
resid2 = seqtype2.resid_list[idx]
except IndexError:
continue
res2 = resid2[0] + ':' + resid2[1:]
reslist2.append(res2)
return reslist2
def _reg_annotate_atoms(res_list, chainregions, abregion, propname):
"""
helper function to annotate the actual atoms within a designated antibody
region. For VL and VH regions, we ensure proper order of property input
(smallest region is considered last).
:param res_list: list of residue objects
:type res_list: List[structure.Structure]
:param chainregions: range index of the designated antibody region
:type chainregions: dictionary with (from /to) elements
:param abregion: antibody region
:type abregion: string
:param propname: atom property name
:type propname: string
"""
WATERS = ('HOH', 'SPC', 'WAT', 'DOD', 'H2O')
subregions = []
if abregion.startswith('VL') or abregion.startswith('VH'):
subregions = REGION_CHILDREN[abregion[:2]]
s = abregion[-1]
if s in "012":
s = '_' + s
subregions = [a + s for a in subregions]
idx = chainregions.get(abregion, [])
if not idx or len(idx) != 2 or (idx[1] - idx[0]) < 3:
return
for i in range(idx[0], idx[1] + 1):
res = res_list[i]
rnm = res.pdbres.strip()
if rnm in WATERS or RESIDUE_MAP_3_TO_1_LETTER.get(rnm, 'X') == 'X':
continue
for atm in res.atom:
atm.property[propname] = abregion
chainregions[abregion] = () # done - remove from dict
for subreg in subregions:
subidx = chainregions.get(subreg, [])
if not subidx or len(subidx) != 2:
continue
for i in range(subidx[0], subidx[1] + 1):
res = res_list[i]
rnm = res.pdbres.strip()
if rnm in WATERS or RESIDUE_MAP_3_TO_1_LETTER.get(rnm, 'X') == 'X':
continue
for atm in res.atom:
atm.property[propname] = subreg
chainregions[subreg] = ()
def _annotate_residue_atoms(res: structure._Residue, chain: structure._Chain,
scheme: str):
"""
Annotate all atoms in the supplied residue with various properties.
:param res: The residue which will have its atoms annotated.
:param chain: The chain in which the residue can be found.
:param scheme: The numbering scheme to use
"""
antibody_category = 1 # TODO MAE-45744 use shared enum value
is_special_res_partial = partial(is_special_residue,
res=res,
chain=chain,
scheme=scheme)
annotations_by_prop = {
Properties.PROTEIN_CATEGORY: antibody_category,
Properties.VERNIER_RES:
is_special_res_partial(res_type=SpecialResidueTypes.VERNIER),
Properties.CANONICAL_STRUCTURAL_RES: is_special_res_partial(
res_type=SpecialResidueTypes.CANONICAL_STRUCTURAL),
Properties.INTERFACE_RES:
is_special_res_partial(res_type=SpecialResidueTypes.INTERFACE)
}
for atom in res.atom:
atom.property.update(annotations_by_prop)
[docs]def annotate_ab_regions(st, scheme='Chothia'):
"""
Annotate the given structure using atom-level properties to mark antibody
regions.
This is done by marking atoms by their minimum indivisible region
classification. Lineage is standard and as described here and by the
'REGION_CHILDREN' and 'REGION_DESCENDENTS' module level variables.
Children of VL and VH regions are further subdivided to account for the
cases of multispecifics such as L1_0, L1_1.
:param st: antibody st (this is modified in place)
:type st: structure.Structure
"""
# Identify regions and annotate
region_propname = 's_bioluminate_antibody_region_id_{}'.format(scheme)
for chain in st.chain:
residues = [
residue for residue in structure.get_residues_by_connectivity(chain)
]
seqobj = seq_type_from_chain(chain, scheme=scheme)
if not seqobj or seqobj.type == 'none':
continue
chainregions = seqobj.getRegions()
# crawl up the hierarchies for VL subregions
for reg in ('VL', 'VL_0', 'VL_1', 'VL_2'):
_reg_annotate_atoms(residues, chainregions, reg, region_propname)
# crawl up the hierarchies for VH subregions
for reg in ('VH', 'VH_0', 'VH_1', 'VH_2'):
_reg_annotate_atoms(residues, chainregions, reg, region_propname)
# and now deal with the rest outside the Fv region
for reg in chainregions.keys():
_reg_annotate_atoms(residues, chainregions, reg, region_propname)
for residue in residues:
_annotate_residue_atoms(residue, chain, scheme)