Source code for schrodinger.application.bioluminate.antibody.annotate

import enum
from dataclasses import dataclass
from dataclasses import field
from functools import lru_cache
from functools import partial
from typing import Dict
from typing import Optional
from typing import Tuple

from schrodinger import structure
from schrodinger.protein import annotation
from schrodinger.structure import RESIDUE_MAP_3_TO_1_LETTER

try:
    from schrodinger.application.prime.packages import antibody
except ImportError:
    antibody = None

# Setup antibody hierarchy
"""
    Hierarchy:
        - Fab
            - VL
                - L1
                - L2
                - L3
                - LFR1
                - LFR2
                - LFR3
                - LFR4
            - CL

            - VH
                - H1
                - H2
                - H3
                - HFR1
                - HFR2
                - HFR3
                - HFR4
            - CH1

        - Hinge

        - Fc
            - CH2
            - CH3
            - CH4

    special cases denote DVD heavy and DVD light chains that have 2 VL / VH
    regions - these are distinguished by suffix '_1' e.g.

        VL_1
            - LFR1_1...LFR4_1
            - L1_1...L3_1
        (there should be no CL_1)

        VH_1
            - HFR1...HFR4_1
            - H1_1...H3_1
        (there should be no CH1_1)

    """
REGION_CHILDREN = {}
## Set up REGION_CHILDREN
REGION_CHILDREN['Fab'] = ['CL', 'VL', 'VH', 'CH1']
REGION_CHILDREN['Fc'] = ['CH2', 'CH3']
REGION_CHILDREN['VL'] = ['L1', 'L2', 'L3', 'LFR1', 'LFR2', 'LFR3', 'LFR4']
REGION_CHILDREN['VH'] = ['H1', 'H2', 'H3', 'HFR1', 'HFR2', 'HFR3', 'HFR4']
### Set up indexed units (e.g. L1_0, L1_1, up through _2
for grandparent in ['VL', 'VH']:
    for parent in REGION_CHILDREN[grandparent]:
        REGION_CHILDREN[parent] = [
            '{}_{}'.format(parent, index) for index in range(3)
        ]


## Populate REGION_DESCENDENTS for convenience
def _get_region_descendents(parent):
    if parent in REGION_CHILDREN:
        REGION_DESCENDENTS = REGION_CHILDREN[parent]
        for child in REGION_CHILDREN[parent]:
            REGION_DESCENDENTS += _get_region_descendents(child)
        return REGION_DESCENDENTS
    return []


REGION_DESCENDENTS = {}
for parent in list(REGION_CHILDREN):
    REGION_DESCENDENTS[parent] = _get_region_descendents(parent)
# End hierarchy data

# the lists here are based on Kabat numbering, the lists in other numbering schemes
# can be converted by calling "reslist_between_schemes"
# TODO: CSR may vary depending on L1/L3 loop length/type, right now just a union set
VERNIER_RESIDUES = [
    'H:2', 'H:27', 'H:28', 'H:29', 'H:30', 'H:47', 'H:48', 'H:49', 'H:67',
    'H:69', 'H:71', 'H:73', 'H:78', 'H:93', 'H:94', 'H:103', 'L:2', 'L:4',
    'L:35', 'L:36', 'L:46', 'L:47', 'L:48', 'L:49', 'L:64', 'L:66', 'L:68',
    'L:69', 'L:71', 'L:98'
]
CANONICAL_STRUCTURAL_RESIDUES = [
    'H:24', 'H:26', 'H:29', 'H:32', 'H:34', 'H:71', 'L:2', 'L:25', 'L:27B',
    'L:28', 'L:29', 'L:30', 'H:33', 'L:71', 'L:48', 'L:64', 'L:90', 'L:94',
    'L:95'
]
INTERFACE_RESIDUES = [
    'H:35', 'H:37', 'H:39', 'H:45', 'H:91', 'L:34', 'L:38', 'L:44', 'L:87'
]

# Chothia: H1/26-32,52-56,95-102,L/24-34,50-56,89-97
# ignore terminal residues 1-4 on H/L
# include residues at VH/VL interface and canonical conformation determining residues
# VERNIER_RESIDUES = ['H:24','H:25','H:33','H:34','H:50','H:51','H:57','H:58',
#                    'H:69','H:71','H:73','H:74','H:75','H:76','H:93','H:94','H:103',
#                    'L:35','L:36','L:46','L:47','L:48','L:49','L:58',
#                    'L:64','L:66','L:68','L:69','L:70','L:71','L:72','L:98','L:99',
#                    'H:37','H:48','H:49']
#
# the list in Kabat numbering from JBC,2008,283,2,1156-1166
# Kabat: H1/30-35B,H2/50-65,H3/95-102,L/24-34,50-56,89-97
# VERNIER_RESIDUES = ['H:2','H:27','H:28','H:29','H:30','H:47','H:48','H:49',
#                    'H:67','H:69','H:71','H:73','H:78','H:93','H:94','H:103',
#                    'L:2','L:4','L:35','L:36','L:46','L:47','L:48','L:49',
#                    'L:64','L:66','L:68','L:69','L:71','L:99']
# my change:
# VERNIER_RESIDUES = ['H:2','H:27','H:28','H:29','H:30','H:47','H:48','H:49',
#                     'H:67','H:69','H:71','H:73','H:78','H:93','H:94','H:103',
#                     'L:1','L:2','L:3','L:4','L:35','L:36','L:46','L:47','L:48','L:49','L:58',
#                     'L:64','L:66','L:68','L:69','L:70','L:71','L:98','L:99']

# Reference sequences with maximum long CDR loops
_REF_SEQH = (
    "QVQLVQSGAEVKKPGSSVKVSCKASGGTFGGSNYAINWVRQAPGQGLEWMGNIEPGGGGGGGGGGG"
    "GGGGGGGGGGGGGGYFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCARYFMSGGG"
    "GGGGGGGGGGGGGGGGGGGGYKHLSDYWGQGTLVTVSSASTK")
_REF_SEQL = (
    "DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTGGGGGGGGGGGGGGGGGGGGRKNYLAWYQQKP"
    "GQSPKLLIYWASTRESGVPDRFTGSGSGTDFTLTITSVQAEDLAVYYCKQSYNGGGGGGGGGGGGG"
    "GGGGGGGGGGGGGGLRTFGGGTKLEIK")


[docs]class Properties: """ Class to store custom property names associated with antibody annotation. """ PROTEIN_CATEGORY = 'i_bioluminate_protein_category' VERNIER_RES = 'b_bioluminate_vernier_residue' CANONICAL_STRUCTURAL_RES = 'b_bioluminate_canonical_structural_residue' INTERFACE_RES = 'b_bioluminate_interface_residue'
[docs]class SpecialResidueTypes(enum.Enum): VERNIER = enum.auto() CANONICAL_STRUCTURAL = enum.auto() INTERFACE = enum.auto()
[docs]class ChainType(enum.Enum): HEAVY = 'heavy' LIGHT = 'light' NONE = 'none'
[docs]@dataclass(frozen=True) class SpecialResidueZone: """ Class to store mappings of universal (scheme-independent) alignment residue indices to their corresponding Kabat-numbered residue IDs for both heavy and light chains. """ res_type: SpecialResidueTypes res_ids_by_idx_l: Dict[int, str] = field(default_factory=dict, init=False) res_ids_by_idx_h: Dict[int, str] = field(default_factory=dict, init=False) def __post_init__(self): """ Populate the attributes containing maps of universal indices to Kabat-numbered residue IDs. """ res_ids = _KABAT_RES_IDS_BY_RES_TYPE[self.res_type] seq_type_h, seq_type_l = _get_ref_seq_types() for res_id_raw_str in res_ids: if res_id_raw_str.startswith(_SEQ_TYPE_CHAIN_NAME_H): seq_type = seq_type_h univ_indices = seq_type.refidx_vh res_ids_by_idx = self.res_ids_by_idx_h elif res_id_raw_str.startswith(_SEQ_TYPE_CHAIN_NAME_L): seq_type = seq_type_l univ_indices = seq_type.refidx_vl res_ids_by_idx = self.res_ids_by_idx_l else: msg = f'Residue ID {res_id_raw_str} has an unsupported chain name' raise ValueError(msg) # Adhere to SeqType formatting res_id_str = res_id_raw_str.replace(':', '') res_id_idx = seq_type.resid_list.index(res_id_str) univ_idx = univ_indices[res_id_idx] res_ids_by_idx[univ_idx] = res_id_raw_str
[docs] def getResidueIdsByIndex(self, chain_type: ChainType) -> Dict[int, str]: if chain_type is ChainType.HEAVY: return self.res_ids_by_idx_h elif chain_type is ChainType.LIGHT: return self.res_ids_by_idx_l raise ValueError(f'Unsupported chain type {chain_type}')
[docs] def residueIsInZone(self, univ_idx: int, chain_type: ChainType) -> bool: res_ids_by_idx = self.getResidueIdsByIndex(chain_type) return res_ids_by_idx.get(univ_idx) is not None
_INTERNAL_NUMBERING_SCHEME = annotation.AntibodyCDRScheme.Kabat _SEQ_TYPE_CHAIN_NAME_H = 'H' _SEQ_TYPE_CHAIN_NAME_L = 'L' # Special residue numbers + INS codes for heavy and light chains in the kabat # numbering scheme _KABAT_RES_IDS_BY_RES_TYPE = { SpecialResidueTypes.VERNIER: VERNIER_RESIDUES, SpecialResidueTypes.CANONICAL_STRUCTURAL: CANONICAL_STRUCTURAL_RESIDUES, SpecialResidueTypes.INTERFACE: INTERFACE_RESIDUES }
[docs]@lru_cache def get_res_zones_by_res_type( ) -> Dict[SpecialResidueTypes, SpecialResidueZone]: """ Return a mapping of special residue type to its corresponding SpecialResidueZone instance. """ return { res_type: SpecialResidueZone(res_type) for res_type, res_ids in _KABAT_RES_IDS_BY_RES_TYPE.items() }
@lru_cache def _get_ref_seq_types() -> Tuple['antibody.SeqType']: """ Return the heavy and light reference seq types, respectively. """ seq_type_h = antibody.SeqType(_REF_SEQH, scheme=_INTERNAL_NUMBERING_SCHEME.name) seq_type_l = antibody.SeqType(_REF_SEQL, scheme=_INTERNAL_NUMBERING_SCHEME.name) return seq_type_h, seq_type_l
[docs]def is_special_residue(res: structure._Residue, chain: structure._Chain, res_type: SpecialResidueTypes, scheme: str) -> bool: """ Return whether the supplied residue is a "special" residue of the specified type and numbering scheme. :param res: A residue :param chain: The chain that the residue is inside :param res_type: The special residue type :param scheme: The antibody numbering scheme """ seq_type = seq_type_from_chain(chain, scheme) chain_type = chain_type_from_seq_type(seq_type) query_idx_local = _get_local_res_index(res, chain) query_idx_univ = _univ_res_idx_from_local(query_idx_local, seq_type, chain_type) res_zone = get_res_zones_by_res_type()[res_type] return res_zone.residueIsInZone(query_idx_univ, chain_type)
def _univ_res_idx_from_local(local_res_idx: int, seq_type: 'antibody.SeqType', chain_type: ChainType) -> Optional[int]: """ Return the internal universal residue index determined from the local index relative to the supplied `seq_type` or None if the chain type is invalid or if the local residue index is outside of the range supported by our reference sequences. :param local_res_idx: The index of a given residue inside of its corresponding chain. :param seq_type: the SeqType instance for the sequence from which the local residue index is derived. """ if chain_type is ChainType.HEAVY: univ_indices = seq_type.refidx_vh elif chain_type is ChainType.LIGHT: univ_indices = seq_type.refidx_vl else: return None try: return univ_indices[local_res_idx] except IndexError: # Query seq contains more residues than the reference sequence supports return None def _get_local_res_index(res: structure._Residue, chain: structure._Chain) -> int: """ Return the "local" residue index i.e. the index of the given residue inside of the supplied chain's residues. :param res: A residue :param chain: A chain that should contain the residue. """ local_res_idx = None # Start at 1 because residues are 1-indexed for idx, chain_res in enumerate( structure.get_residues_by_connectivity(chain), start=1): if res == chain_res: local_res_idx = idx break if local_res_idx is None: raise ValueError('Expected the supplied residue to be a member of the ' 'supplied chain, but it is not.') return local_res_idx
[docs]@lru_cache def seq_type_from_chain(chain: structure._Chain, scheme: str) -> 'antibody.SeqType': """ Return a `SeqType` instance derived from `chain`. Cached due to `SeqType` instantiation. """ seq = seq_from_chain(chain) return antibody.SeqType(seq, scheme=scheme)
[docs]@lru_cache def chain_type_from_seq_type(seq_type: 'antibody.SeqType') -> ChainType: """ Return the chain type corresponding to `seq_type`. Cached because this is expected to be called many times with identical inputs e.g. when checking if all residues of a given chain are in any of the special residue zones. """ if seq_type.type.startswith(ChainType.HEAVY.value): return ChainType.HEAVY elif seq_type.type.startswith(ChainType.LIGHT.value): return ChainType.LIGHT return ChainType.NONE
[docs]def seq_from_chain(chain: structure._Chain) -> str: res_codes_3l = [ res_code for res_code in structure.get_residues_by_connectivity(chain) ] res_codes_1l = [ RESIDUE_MAP_3_TO_1_LETTER.get(res_code.pdbres.strip(), 'X') for res_code in res_codes_3l ] return "".join(res_codes_1l)
[docs]def reslist_between_schemes(reslist1, scheme1, scheme2): """ Given a list of residue IDs in scheme1, return the corresponding residue ID list in scheme2 :type reslist1: list :param: a list of residue IDs, must have H or L as chain name, e.g. [L:21, H:35B] :type scheme1: string :param: one of the supported numbering schemes :type scheme2: string :param: one of the supported numbering schemes :rtype: reslist2 :return: a list of new residue IDs """ # two reference sequences with maximum long CDR loops seqh = "QVQLVQSGAEVKKPGSSVKVSCKASGGTFGGSNYAINWVRQAPGQGLEWMGNIEPGGGGGGGGGGGGGGGGGGGGGGGGGYFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCARYFMSGGGGGGGGGGGGGGGGGGGGGGGYKHLSDYWGQGTLVTVSSASTK" seql = "DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTGGGGGGGGGGGGGGGGGGGGRKNYLAWYQQKPGQSPKLLIYWASTRESGVPDRFTGSGSGTDFTLTITSVQAEDLAVYYCKQSYNGGGGGGGGGGGGGGGGGGGGGGGGGGGLRTFGGGTKLEIK" hseqtype1 = antibody.SeqType(seqh, scheme1) hseqtype2 = antibody.SeqType(seqh, scheme2) lseqtype1 = antibody.SeqType(seql, scheme1) lseqtype2 = antibody.SeqType(seql, scheme2) reslist2 = [] for res in reslist1: resid = res.replace(':', '') # note that residue IDs have ":" here if resid[0] == 'H': seqtype1 = hseqtype1 seqtype2 = hseqtype2 elif resid[0] == 'L': seqtype1 = lseqtype1 seqtype2 = lseqtype2 else: continue try: idx = seqtype1.resid_list.index(resid) # resid has no ":" here resid2 = seqtype2.resid_list[idx] except IndexError: continue res2 = resid2[0] + ':' + resid2[1:] reslist2.append(res2) return reslist2
def _reg_annotate_atoms(res_list, chainregions, abregion, propname): """ helper function to annotate the actual atoms within a designated antibody region. For VL and VH regions, we ensure proper order of property input (smallest region is considered last). :param res_list: list of residue objects :type res_list: List[structure.Structure] :param chainregions: range index of the designated antibody region :type chainregions: dictionary with (from /to) elements :param abregion: antibody region :type abregion: string :param propname: atom property name :type propname: string """ WATERS = ('HOH', 'SPC', 'WAT', 'DOD', 'H2O') subregions = [] if abregion.startswith('VL') or abregion.startswith('VH'): subregions = REGION_CHILDREN[abregion[:2]] s = abregion[-1] if s in "012": s = '_' + s subregions = [a + s for a in subregions] idx = chainregions.get(abregion, []) if not idx or len(idx) != 2 or (idx[1] - idx[0]) < 3: return for i in range(idx[0], idx[1] + 1): res = res_list[i] rnm = res.pdbres.strip() if rnm in WATERS or RESIDUE_MAP_3_TO_1_LETTER.get(rnm, 'X') == 'X': continue for atm in res.atom: atm.property[propname] = abregion chainregions[abregion] = () # done - remove from dict for subreg in subregions: subidx = chainregions.get(subreg, []) if not subidx or len(subidx) != 2: continue for i in range(subidx[0], subidx[1] + 1): res = res_list[i] rnm = res.pdbres.strip() if rnm in WATERS or RESIDUE_MAP_3_TO_1_LETTER.get(rnm, 'X') == 'X': continue for atm in res.atom: atm.property[propname] = subreg chainregions[subreg] = () def _annotate_residue_atoms(res: structure._Residue, chain: structure._Chain, scheme: str): """ Annotate all atoms in the supplied residue with various properties. :param res: The residue which will have its atoms annotated. :param chain: The chain in which the residue can be found. :param scheme: The numbering scheme to use """ antibody_category = 1 # TODO MAE-45744 use shared enum value is_special_res_partial = partial(is_special_residue, res=res, chain=chain, scheme=scheme) annotations_by_prop = { Properties.PROTEIN_CATEGORY: antibody_category, Properties.VERNIER_RES: is_special_res_partial(res_type=SpecialResidueTypes.VERNIER), Properties.CANONICAL_STRUCTURAL_RES: is_special_res_partial( res_type=SpecialResidueTypes.CANONICAL_STRUCTURAL), Properties.INTERFACE_RES: is_special_res_partial(res_type=SpecialResidueTypes.INTERFACE) } for atom in res.atom: atom.property.update(annotations_by_prop)
[docs]def annotate_ab_regions(st, scheme='Chothia'): """ Annotate the given structure using atom-level properties to mark antibody regions. This is done by marking atoms by their minimum indivisible region classification. Lineage is standard and as described here and by the 'REGION_CHILDREN' and 'REGION_DESCENDENTS' module level variables. Children of VL and VH regions are further subdivided to account for the cases of multispecifics such as L1_0, L1_1. :param st: antibody st (this is modified in place) :type st: structure.Structure """ # Identify regions and annotate region_propname = 's_bioluminate_antibody_region_id_{}'.format(scheme) for chain in st.chain: residues = [ residue for residue in structure.get_residues_by_connectivity(chain) ] seqobj = seq_type_from_chain(chain, scheme=scheme) if not seqobj or seqobj.type == 'none': continue chainregions = seqobj.getRegions() # crawl up the hierarchies for VL subregions for reg in ('VL', 'VL_0', 'VL_1', 'VL_2'): _reg_annotate_atoms(residues, chainregions, reg, region_propname) # crawl up the hierarchies for VH subregions for reg in ('VH', 'VH_0', 'VH_1', 'VH_2'): _reg_annotate_atoms(residues, chainregions, reg, region_propname) # and now deal with the rest outside the Fv region for reg in chainregions.keys(): _reg_annotate_atoms(residues, chainregions, reg, region_propname) for residue in residues: _annotate_residue_atoms(residue, chain, scheme)