Source code for schrodinger.application.bioluminate.antibody.annotate

import enum
from dataclasses import dataclass
from dataclasses import field
from functools import lru_cache
from functools import partial
from typing import Dict
from typing import Optional
from typing import Tuple

from schrodinger import structure
from schrodinger.protein import annotation
from schrodinger.structure import RESIDUE_MAP_3_TO_1_LETTER

try:
    from schrodinger.application.prime.packages import antibody
except ImportError:
    antibody = None

# Setup antibody hierarchy
"""
    Hierarchy:
        - Fab
            - VL
                - L1
                - L2
                - L3
                - LFR1
                - LFR2
                - LFR3
                - LFR4
            - CL

            - VH
                - H1
                - H2
                - H3
                - HFR1
                - HFR2
                - HFR3
                - HFR4
            - CH1

        - Hinge

        - Fc
            - CH2
            - CH3
            - CH4

    special cases denote DVD heavy and DVD light chains that have 2 VL / VH
    regions - these are distinguished by suffix '_1' e.g.

        VL_1
            - LFR1_1...LFR4_1
            - L1_1...L3_1
        (there should be no CL_1)

        VH_1
            - HFR1...HFR4_1
            - H1_1...H3_1
        (there should be no CH1_1)

    """
REGION_CHILDREN = {}
## Set up REGION_CHILDREN
REGION_CHILDREN['Fab'] = ['CL', 'VL', 'VH', 'CH1']
REGION_CHILDREN['Fc'] = ['CH2', 'CH3']
REGION_CHILDREN['VL'] = ['L1', 'L2', 'L3', 'LFR1', 'LFR2', 'LFR3', 'LFR4']
REGION_CHILDREN['VH'] = ['H1', 'H2', 'H3', 'HFR1', 'HFR2', 'HFR3', 'HFR4']
### Set up indexed units (e.g. L1_0, L1_1, up through _2
for grandparent in ['VL', 'VH']:
    for parent in REGION_CHILDREN[grandparent]:
        REGION_CHILDREN[parent] = [
            '{}_{}'.format(parent, index) for index in range(3)
        ]


## Populate REGION_DESCENDENTS for convenience
def _get_region_descendents(parent):
    if parent in REGION_CHILDREN:
        REGION_DESCENDENTS = REGION_CHILDREN[parent]
        for child in REGION_CHILDREN[parent]:
            REGION_DESCENDENTS += _get_region_descendents(child)
        return REGION_DESCENDENTS
    return []


REGION_DESCENDENTS = {}
for parent in list(REGION_CHILDREN):
    REGION_DESCENDENTS[parent] = _get_region_descendents(parent)
# End hierarchy data

# the lists here are based on Kabat numbering, the lists in other numbering schemes
# can be converted by calling "reslist_between_schemes"
# TODO: CSR may vary depending on L1/L3 loop length/type, right now just a union set
VERNIER_RESIDUES = [
    'H:2', 'H:27', 'H:28', 'H:29', 'H:30', 'H:47', 'H:48', 'H:49', 'H:67',
    'H:69', 'H:71', 'H:73', 'H:78', 'H:93', 'H:94', 'H:103', 'L:2', 'L:4',
    'L:35', 'L:36', 'L:46', 'L:47', 'L:48', 'L:49', 'L:64', 'L:66', 'L:68',
    'L:69', 'L:71', 'L:98'
]
CANONICAL_STRUCTURAL_RESIDUES = [
    'H:24', 'H:26', 'H:29', 'H:32', 'H:34', 'H:71', 'L:2', 'L:25', 'L:27B',
    'L:28', 'L:29', 'L:30', 'H:33', 'L:71', 'L:48', 'L:64', 'L:90', 'L:94',
    'L:95'
]
INTERFACE_RESIDUES = [
    'H:35', 'H:37', 'H:39', 'H:45', 'H:91', 'L:34', 'L:38', 'L:44', 'L:87'
]

# Chothia: H1/26-32,52-56,95-102,L/24-34,50-56,89-97
# ignore terminal residues 1-4 on H/L
# include residues at VH/VL interface and canonical conformation determining residues
# VERNIER_RESIDUES = ['H:24','H:25','H:33','H:34','H:50','H:51','H:57','H:58',
#                    'H:69','H:71','H:73','H:74','H:75','H:76','H:93','H:94','H:103',
#                    'L:35','L:36','L:46','L:47','L:48','L:49','L:58',
#                    'L:64','L:66','L:68','L:69','L:70','L:71','L:72','L:98','L:99',
#                    'H:37','H:48','H:49']
#
# the list in Kabat numbering from JBC,2008,283,2,1156-1166
# Kabat: H1/30-35B,H2/50-65,H3/95-102,L/24-34,50-56,89-97
# VERNIER_RESIDUES = ['H:2','H:27','H:28','H:29','H:30','H:47','H:48','H:49',
#                    'H:67','H:69','H:71','H:73','H:78','H:93','H:94','H:103',
#                    'L:2','L:4','L:35','L:36','L:46','L:47','L:48','L:49',
#                    'L:64','L:66','L:68','L:69','L:71','L:99']
# my change:
# VERNIER_RESIDUES = ['H:2','H:27','H:28','H:29','H:30','H:47','H:48','H:49',
#                     'H:67','H:69','H:71','H:73','H:78','H:93','H:94','H:103',
#                     'L:1','L:2','L:3','L:4','L:35','L:36','L:46','L:47','L:48','L:49','L:58',
#                     'L:64','L:66','L:68','L:69','L:70','L:71','L:98','L:99']

# Reference sequences with maximum long CDR loops
_REF_SEQH = (
    "QVQLVQSGAEVKKPGSSVKVSCKASGGTFGGSNYAINWVRQAPGQGLEWMGNIEPGGGGGGGGGGG"
    "GGGGGGGGGGGGGGYFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCARYFMSGGG"
    "GGGGGGGGGGGGGGGGGGGGYKHLSDYWGQGTLVTVSSASTK")
_REF_SEQL = (
    "DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTGGGGGGGGGGGGGGGGGGGGRKNYLAWYQQKP"
    "GQSPKLLIYWASTRESGVPDRFTGSGSGTDFTLTITSVQAEDLAVYYCKQSYNGGGGGGGGGGGGG"
    "GGGGGGGGGGGGGGLRTFGGGTKLEIK")


[docs]class Properties:
    """
    Class to store custom property names associated with antibody annotation.
    """
    PROTEIN_CATEGORY = 'i_bioluminate_protein_category'
    VERNIER_RES = 'b_bioluminate_vernier_residue'
    CANONICAL_STRUCTURAL_RES = 'b_bioluminate_canonical_structural_residue'
    INTERFACE_RES = 'b_bioluminate_interface_residue'


[docs]class SpecialResidueTypes(enum.Enum):
    VERNIER = enum.auto()
    CANONICAL_STRUCTURAL = enum.auto()
    INTERFACE = enum.auto()


[docs]class ChainType(enum.Enum):
    HEAVY = 'heavy'
    LIGHT = 'light'
    NONE = 'none'


[docs]@dataclass(frozen=True)
class SpecialResidueZone:
    """
    Class to store mappings of universal (scheme-independent) alignment residue
    indices to their corresponding Kabat-numbered residue IDs for both heavy and
    light chains.
    """
    res_type: SpecialResidueTypes
    res_ids_by_idx_l: Dict[int, str] = field(default_factory=dict, init=False)
    res_ids_by_idx_h: Dict[int, str] = field(default_factory=dict, init=False)

    def __post_init__(self):
        """
        Populate the attributes containing maps of universal indices to
        Kabat-numbered residue IDs.
        """
        res_ids = _KABAT_RES_IDS_BY_RES_TYPE[self.res_type]
        seq_type_h, seq_type_l = _get_ref_seq_types()
        for res_id_raw_str in res_ids:
            if res_id_raw_str.startswith(_SEQ_TYPE_CHAIN_NAME_H):
                seq_type = seq_type_h
                univ_indices = seq_type.refidx_vh
                res_ids_by_idx = self.res_ids_by_idx_h
            elif res_id_raw_str.startswith(_SEQ_TYPE_CHAIN_NAME_L):
                seq_type = seq_type_l
                univ_indices = seq_type.refidx_vl
                res_ids_by_idx = self.res_ids_by_idx_l
            else:
                msg = f'Residue ID {res_id_raw_str} has an unsupported chain name'
                raise ValueError(msg)
            # Adhere to SeqType formatting
            res_id_str = res_id_raw_str.replace(':', '')
            res_id_idx = seq_type.resid_list.index(res_id_str)
            univ_idx = univ_indices[res_id_idx]
            res_ids_by_idx[univ_idx] = res_id_raw_str

[docs]    def getResidueIdsByIndex(self, chain_type: ChainType) -> Dict[int, str]:
        if chain_type is ChainType.HEAVY:
            return self.res_ids_by_idx_h
        elif chain_type is ChainType.LIGHT:
            return self.res_ids_by_idx_l
        raise ValueError(f'Unsupported chain type {chain_type}')

[docs]    def residueIsInZone(self, univ_idx: int, chain_type: ChainType) -> bool:
        res_ids_by_idx = self.getResidueIdsByIndex(chain_type)
        return res_ids_by_idx.get(univ_idx) is not None


_INTERNAL_NUMBERING_SCHEME = annotation.AntibodyCDRScheme.Kabat
_SEQ_TYPE_CHAIN_NAME_H = 'H'
_SEQ_TYPE_CHAIN_NAME_L = 'L'

# Special residue numbers + INS codes for heavy and light chains in the kabat
# numbering scheme
_KABAT_RES_IDS_BY_RES_TYPE = {
    SpecialResidueTypes.VERNIER: VERNIER_RESIDUES,
    SpecialResidueTypes.CANONICAL_STRUCTURAL: CANONICAL_STRUCTURAL_RESIDUES,
    SpecialResidueTypes.INTERFACE: INTERFACE_RESIDUES
}


[docs]@lru_cache
def get_res_zones_by_res_type(
) -> Dict[SpecialResidueTypes, SpecialResidueZone]:
    """
    Return a mapping of special residue type to its corresponding
    SpecialResidueZone instance.
    """
    return {
        res_type: SpecialResidueZone(res_type)
        for res_type, res_ids in _KABAT_RES_IDS_BY_RES_TYPE.items()
    }


@lru_cache
def _get_ref_seq_types() -> Tuple['antibody.SeqType']:
    """
    Return the heavy and light reference seq types, respectively.
    """
    seq_type_h = antibody.SeqType(_REF_SEQH,
                                  scheme=_INTERNAL_NUMBERING_SCHEME.name)
    seq_type_l = antibody.SeqType(_REF_SEQL,
                                  scheme=_INTERNAL_NUMBERING_SCHEME.name)
    return seq_type_h, seq_type_l


[docs]def is_special_residue(res: structure._Residue, chain: structure._Chain,
                       res_type: SpecialResidueTypes, scheme: str) -> bool:
    """
    Return whether the supplied residue is a "special" residue of the specified
    type and numbering scheme.

    :param res: A residue
    :param chain: The chain that the residue is inside
    :param res_type: The special residue type
    :param scheme: The antibody numbering scheme
    """
    seq_type = seq_type_from_chain(chain, scheme)
    chain_type = chain_type_from_seq_type(seq_type)
    query_idx_local = _get_local_res_index(res, chain)
    query_idx_univ = _univ_res_idx_from_local(query_idx_local, seq_type,
                                              chain_type)
    res_zone = get_res_zones_by_res_type()[res_type]
    return res_zone.residueIsInZone(query_idx_univ, chain_type)


def _univ_res_idx_from_local(local_res_idx: int, seq_type: 'antibody.SeqType',
                             chain_type: ChainType) -> Optional[int]:
    """
    Return the internal universal residue index determined from the local index
    relative to the supplied `seq_type` or None if the chain type is invalid or
    if the local residue index is outside of the range supported by our
    reference sequences.

    :param local_res_idx: The index of a given residue inside of its
        corresponding chain.
    :param seq_type: the SeqType instance for the sequence from which the local
        residue index is derived.
    """
    if chain_type is ChainType.HEAVY:
        univ_indices = seq_type.refidx_vh
    elif chain_type is ChainType.LIGHT:
        univ_indices = seq_type.refidx_vl
    else:
        return None
    try:
        return univ_indices[local_res_idx]
    except IndexError:
        # Query seq contains more residues than the reference sequence supports
        return None


def _get_local_res_index(res: structure._Residue,
                         chain: structure._Chain) -> int:
    """
    Return the "local" residue index i.e. the index of the given residue inside
    of the supplied chain's residues.

    :param res: A residue
    :param chain: A chain that should contain the residue.
    """
    local_res_idx = None
    # Start at 1 because residues are 1-indexed
    for idx, chain_res in enumerate(
            structure.get_residues_by_connectivity(chain), start=1):
        if res == chain_res:
            local_res_idx = idx
            break
    if local_res_idx is None:
        raise ValueError('Expected the supplied residue to be a member of the '
                         'supplied chain, but it is not.')
    return local_res_idx


[docs]@lru_cache
def seq_type_from_chain(chain: structure._Chain,
                        scheme: str) -> 'antibody.SeqType':
    """
    Return a `SeqType` instance derived from `chain`. Cached due to `SeqType`
    instantiation.
    """
    seq = seq_from_chain(chain)
    return antibody.SeqType(seq, scheme=scheme)


[docs]@lru_cache
def chain_type_from_seq_type(seq_type: 'antibody.SeqType') -> ChainType:
    """
    Return the chain type corresponding to `seq_type`. Cached because this is
    expected to be called many times with identical inputs e.g. when checking
    if all residues of a given chain are in any of the special residue zones.
    """
    if seq_type.type.startswith(ChainType.HEAVY.value):
        return ChainType.HEAVY
    elif seq_type.type.startswith(ChainType.LIGHT.value):
        return ChainType.LIGHT
    return ChainType.NONE


[docs]def seq_from_chain(chain: structure._Chain) -> str:
    res_codes_3l = [
        res_code for res_code in structure.get_residues_by_connectivity(chain)
    ]
    res_codes_1l = [
        RESIDUE_MAP_3_TO_1_LETTER.get(res_code.pdbres.strip(), 'X')
        for res_code in res_codes_3l
    ]
    return "".join(res_codes_1l)


[docs]def reslist_between_schemes(reslist1, scheme1, scheme2):
    """
    Given a list of residue IDs in scheme1, return the corresponding residue ID list in scheme2

    :type reslist1: list
    :param: a list of residue IDs, must have H or L as chain name, e.g. [L:21, H:35B]

    :type scheme1: string
    :param: one of the supported numbering schemes

    :type scheme2: string
    :param: one of the supported numbering schemes

    :rtype: reslist2
    :return: a list of new residue IDs
    """

    # two reference sequences with maximum long CDR loops
    seqh = "QVQLVQSGAEVKKPGSSVKVSCKASGGTFGGSNYAINWVRQAPGQGLEWMGNIEPGGGGGGGGGGGGGGGGGGGGGGGGGYFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCARYFMSGGGGGGGGGGGGGGGGGGGGGGGYKHLSDYWGQGTLVTVSSASTK"
    seql = "DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTGGGGGGGGGGGGGGGGGGGGRKNYLAWYQQKPGQSPKLLIYWASTRESGVPDRFTGSGSGTDFTLTITSVQAEDLAVYYCKQSYNGGGGGGGGGGGGGGGGGGGGGGGGGGGLRTFGGGTKLEIK"
    hseqtype1 = antibody.SeqType(seqh, scheme1)
    hseqtype2 = antibody.SeqType(seqh, scheme2)
    lseqtype1 = antibody.SeqType(seql, scheme1)
    lseqtype2 = antibody.SeqType(seql, scheme2)

    reslist2 = []
    for res in reslist1:
        resid = res.replace(':', '')  # note that residue IDs have ":" here
        if resid[0] == 'H':
            seqtype1 = hseqtype1
            seqtype2 = hseqtype2
        elif resid[0] == 'L':
            seqtype1 = lseqtype1
            seqtype2 = lseqtype2
        else:
            continue
        try:
            idx = seqtype1.resid_list.index(resid)  # resid has no ":" here
            resid2 = seqtype2.resid_list[idx]
        except IndexError:
            continue
        res2 = resid2[0] + ':' + resid2[1:]
        reslist2.append(res2)
    return reslist2


def _reg_annotate_atoms(res_list, chainregions, abregion, propname):
    """
    helper function to annotate the actual atoms within a designated antibody
    region. For VL and VH regions, we ensure proper order of property input
    (smallest region is considered last).

    :param res_list: list of residue objects
    :type  res_list: List[structure.Structure]

    :param chainregions: range index of the designated antibody region
    :type  chainregions: dictionary with (from /to) elements

    :param abregion: antibody region
    :type  abregion: string

    :param propname: atom property name
    :type  propname: string

    """

    WATERS = ('HOH', 'SPC', 'WAT', 'DOD', 'H2O')

    subregions = []
    if abregion.startswith('VL') or abregion.startswith('VH'):
        subregions = REGION_CHILDREN[abregion[:2]]
        s = abregion[-1]
        if s in "012":
            s = '_' + s
            subregions = [a + s for a in subregions]

    idx = chainregions.get(abregion, [])
    if not idx or len(idx) != 2 or (idx[1] - idx[0]) < 3:
        return

    for i in range(idx[0], idx[1] + 1):
        res = res_list[i]
        rnm = res.pdbres.strip()
        if rnm in WATERS or RESIDUE_MAP_3_TO_1_LETTER.get(rnm, 'X') == 'X':
            continue

        for atm in res.atom:
            atm.property[propname] = abregion

    chainregions[abregion] = ()  # done - remove from dict
    for subreg in subregions:
        subidx = chainregions.get(subreg, [])
        if not subidx or len(subidx) != 2:
            continue

        for i in range(subidx[0], subidx[1] + 1):
            res = res_list[i]
            rnm = res.pdbres.strip()
            if rnm in WATERS or RESIDUE_MAP_3_TO_1_LETTER.get(rnm, 'X') == 'X':
                continue

            for atm in res.atom:
                atm.property[propname] = subreg
        chainregions[subreg] = ()


def _annotate_residue_atoms(res: structure._Residue, chain: structure._Chain,
                            scheme: str):
    """
    Annotate all atoms in the supplied residue with various properties.

    :param res: The residue which will have its atoms annotated.
    :param chain: The chain in which the residue can be found.
    :param scheme: The numbering scheme to use
    """
    antibody_category = 1  # TODO MAE-45744 use shared enum value
    is_special_res_partial = partial(is_special_residue,
                                     res=res,
                                     chain=chain,
                                     scheme=scheme)
    annotations_by_prop = {
        Properties.PROTEIN_CATEGORY: antibody_category,
        Properties.VERNIER_RES:
            is_special_res_partial(res_type=SpecialResidueTypes.VERNIER),
        Properties.CANONICAL_STRUCTURAL_RES: is_special_res_partial(
            res_type=SpecialResidueTypes.CANONICAL_STRUCTURAL),
        Properties.INTERFACE_RES:
            is_special_res_partial(res_type=SpecialResidueTypes.INTERFACE)
    }
    for atom in res.atom:
        atom.property.update(annotations_by_prop)


[docs]def annotate_ab_regions(st, scheme='Chothia'):
    """
    Annotate the given structure using atom-level properties to mark antibody
    regions.

    This is done by marking atoms by their minimum indivisible region
    classification. Lineage is standard and as described here and by the
    'REGION_CHILDREN' and 'REGION_DESCENDENTS' module level variables.

    Children of VL and VH regions are further subdivided to account for the
    cases of multispecifics such as L1_0, L1_1.

    :param st: antibody st (this is modified in place)
    :type  st: structure.Structure

    """

    # Identify regions and annotate
    region_propname = 's_bioluminate_antibody_region_id_{}'.format(scheme)

    for chain in st.chain:
        residues = [
            residue for residue in structure.get_residues_by_connectivity(chain)
        ]
        seqobj = seq_type_from_chain(chain, scheme=scheme)
        if not seqobj or seqobj.type == 'none':
            continue

        chainregions = seqobj.getRegions()

        # crawl up the hierarchies for VL subregions
        for reg in ('VL', 'VL_0', 'VL_1', 'VL_2'):
            _reg_annotate_atoms(residues, chainregions, reg, region_propname)

        # crawl up the hierarchies for VH subregions
        for reg in ('VH', 'VH_0', 'VH_1', 'VH_2'):
            _reg_annotate_atoms(residues, chainregions, reg, region_propname)

        # and now deal with the rest outside the Fv region
        for reg in chainregions.keys():
            _reg_annotate_atoms(residues, chainregions, reg, region_propname)

        for residue in residues:
            _annotate_residue_atoms(residue, chain, scheme)