# -*- coding: utf-8 -*-
"""
Annotations for biological sequences
Copyright Schrodinger, LLC. All rights reserved.
"""
import collections
import enum
import itertools
import math
import re
import warnings
from enum import Enum
from functools import cached_property
from Bio import SeqIO
from schrodinger.application.msv import utils as msv_utils
from schrodinger.application.msv.utils import get_rolling_average
from schrodinger.infra import util
from schrodinger.models import jsonable
from schrodinger.Qt import QtCore
from schrodinger.structutils import analyze
DEFAULT_WINDOW_PADDING = 4
_MODULE_NOT_YET_IMPORTED = object()
antibody = _MODULE_NOT_YET_IMPORTED
_SeqType = None
_disulfide_bond_title = "S-S bond"
_ssa_title = "SSA"
def _delayed_antibody_import():
"""
The antibody modules imports alignment, which imports this module. To avoid
a circular import, we delay importing antibody until it's needed. (See
`_AntibodyCDRFinder.__init__`.)
"""
global antibody
try:
from schrodinger.application.prime.packages import antibody
except ImportError:
# Prime is not installed. We can still create _AntibodyCDRFinder
# instances, but they won't find any CDRs.
antibody = None
else:
global _SeqType
class _SeqType(SeqTypeMixin, antibody.SeqType):
pass
LOGO_MAX_DIVERSITY = math.log(20.0, 2.0)
class _BindingSiteDistance(jsonable.JsonableEnum):
d3 = 3
d4 = 4
d5 = 5
d6 = 6
d7 = 7
d8 = 8
BindingSiteDistance = util.enum_speedup(_BindingSiteDistance)
BINDING_SITE = Enum('BINDING_SITE', ['CloseContact', 'FarContact', 'NoContact'])
AntibodyCDRLabel = Enum('AntibodyCDRLabel',
['NotCDR', 'L1', 'L2', 'L3', 'H1', 'H2', 'H3'])
AntibodyCDR = collections.namedtuple('AntibodyCDR', 'label start end')
Domains = Enum('Domains', ['Domain', 'NoDomain'])
AntibodyCDRScheme = util.enum_speedup(
jsonable.JsonableEnum(
'AntibodyCDRScheme',
['Chothia', 'Kabat', 'IMGT', 'EnhancedChothia', 'AHo']))
DEFAULT_ANTIBODY_SCHEME = AntibodyCDRScheme.Kabat
ANTIBODY_CDR_ITEMS = {
"Chothia": AntibodyCDRScheme.Chothia,
"Enhanced Chothia": AntibodyCDRScheme.EnhancedChothia,
"Kabat": AntibodyCDRScheme.Kabat,
"IMGT": AntibodyCDRScheme.IMGT,
"AHo": AntibodyCDRScheme.AHo
} # yapf: disable
[docs]@enum.unique
class KinaseConservation(jsonable.JsonableEnum):
VeryLow = "Very Low"
Low = "Low"
Medium = "Medium"
High = "High"
VeryHigh = "Very High"
[docs]@enum.unique
class KinaseFeatureLabel(jsonable.JsonableEnum):
GLYCINE_RICH_LOOP = "Glycine Rich Loop"
ALPHA_C = "Alpha-C"
GATE_KEEPER = "Gate Keeper"
HINGE = "Hinge"
LINKER = "Linker"
HRD = "HRD"
CATALYTIC_LOOP = "Catalytic Loop"
DFG = "DFG"
ACTIVATION_LOOP = "Activation Loop"
NO_FEATURE = "No Feature"
[docs]class Consensus(Enum):
not_conserved = " "
fully_conserved = "*"
strongly_conserved = ":"
weakly_conserved = "."
@property
def tooltip(self):
return self.name.replace("_", " ")
class _AnnotationEnum(jsonable.JsonableEnum):
"""
An enum class with a title for use in row headers. The title is based on
the enum name by default but can be overridden if desired.
"""
def __init__(self, _=None):
# if an Enum subclass defines an init the value of the enum will be
# passed to it
super(_AnnotationEnum, self).__init__()
self._title = None
self._tooltip = ''
@property
def title(self):
if self._title is not None: # could be a custom empty string
return self._title
else:
return self.name.capitalize().replace("_", " ")
@title.setter
def title(self, value):
self._title = value
@property
def tooltip(self):
# Default is no tooltip. May override with custom tooltip.
return self._tooltip
@tooltip.setter
def tooltip(self, value):
self._tooltip = value
class _SequenceAnnotationEnum(_AnnotationEnum):
"""
An enum class for annotations that apply to a single sequence (as opposed to
global annotations that apply to the alignment as a whole).
:ivar requires_structure: Does this annotation only apply to sequences that
have an associated structure?
:vartype requires_structure: bool
:ivar multi_value: Can this annotation have multiple values?
:vartype multi_value: bool
:ivar can_expand: Can this annotation be used to expand selection?
Should only be set to True for annotations with categorical values
(e.g. enum, string, or limited int values). Annotations may need
special handling in `getAnnotationValueForComparison` in sequence.py
:vartype can_expand: bool
"""
def __init__(self, _):
super(_SequenceAnnotationEnum, self).__init__()
self.requires_structure = False
self.multi_value = False
self.can_expand = False
[docs]class TupleWithRange(tuple):
@cached_property
def range(self):
"""
The range of data contianed in this tuple. Will return a tuple of
(minimum value or zero whichever is less, maximum value or zero
whichever is greater). `None` values will be ignored. If there are no
`None` values in this tuple, will return (0, 0).
:rtype: tuple(int or float, int or float)
"""
values = [val for val in self if val is not None]
values.append(0)
return min(values), max(values)
[docs]class AbstractSequenceAnnotations(QtCore.QObject):
"""
A base class for single-chain and combined-chain sequence annotations
:ivar titleChanged: A signal emitted after an annotation's title (row header)
changes.
:vartype titleChanged: `QtCore.pyqtSignal`
"""
titleChanged = QtCore.pyqtSignal()
# avoid circular refs between sequences and annotations
sequence = util.WeakRefAttribute()
[docs] def __init__(self, seq):
"""
:param seq: The sequence to store annotations for.
:type seq: sequence.Sequence
"""
super().__init__()
self.sequence = seq
[docs]class AbstractProteinSequenceAnnotationsMixin:
domainsChanged = QtCore.pyqtSignal()
invalidatedDomains = QtCore.pyqtSignal()
[docs] def __init__(self, *args, **kwargs):
"""
:param seq: The sequence to store annotations for.
:type seq: sequence.Sequence
"""
super().__init__(*args, **kwargs)
self._binding_sites = None
self._ligands = None
self._ligand_asls = None
self._domains = None
self._domains_parsed = False
@cached_property
def max_b_factor(self):
values = [val for val in self.b_factor if val is not None]
return max(values) if len(values) > 0 else 0.0
@cached_property
def min_b_factor(self):
values = [val for val in self.b_factor if val is not None]
return min(values) if len(values) > 0 else 0.0
[docs] @QtCore.pyqtSlot()
def invalidateMaxMinBFactor(self):
try:
del self.max_b_factor
except AttributeError:
pass
try:
del self.min_b_factor
except AttributeError:
pass
[docs] def getAntibodyCDR(self, col, scheme):
"""
Returns the antibody CDR information of the col'th index in the sequence
under a given antibody CDR numbering scheme.
:param col: index into the sequence
:type col: int
:param scheme: The antibody CDR numbering scheme to use
:type scheme: `AntibodyCDRScheme`
:returns: Antibody CDR label, start, and end positions
:rtype: `AntibodyCDR`, which is a named tuple of
(`AntibodyCDRLabel`, int, int) if col is in a CDR,
otherwise (AntibodyCDRLabel.NotCDR, None, None)
"""
raise NotImplementedError
[docs] def getAntibodyCDRs(self, scheme):
"""
Returns a list of antibody CDR information for the entire sequence.
:param scheme: The antibody CDR numbering scheme to use
:type scheme: AntibodyCDRScheme
:returns: A list of Antibody CDR labels, starts, and end positions
:rtype: list(AntibodyCDR)
"""
raise NotImplementedError
[docs] def isAntibodyChain(self):
"""
:returns: Whether the sequence described is an antibody chain
:rtype: bool
"""
raise NotImplementedError
[docs] def isAntibodyHeavyChain(self):
"""
:returns: Whether the sequence described is an antibody heavy chain
:rtype: bool
"""
raise NotImplementedError
[docs] def isAntibodyLightChain(self):
"""
:returns: Whether the sequence described is an antibody light chain
:rtype: bool
"""
raise NotImplementedError
@property
def binding_sites(self):
if self._binding_sites is None:
self._updateLigandContacts()
return self._binding_sites
@property
def ligands(self):
if self._ligands is None:
self._updateLigandContacts()
return self._ligands
@property
def ligand_asls(self):
if self._ligand_asls is None:
self._updateLigandContacts()
return self._ligand_asls
def _updateLigandContacts(self):
"""
Update the ligand contacts cache based on the sequence's structure
"""
raise NotImplementedError
@QtCore.pyqtSlot()
def _invalidateLigandContacts(self):
"""
Invalidates the cached ligand contact data.
"""
self._binding_sites = None
self._ligands = None
[docs] def setLigandDistance(self, distance):
"""
Updates the ligand distance and invalidates the cache
"""
raise NotImplementedError
@property
def domains(self):
if self._domains is None and self._domains_parsed:
seen = set()
domains = []
for res in self.sequence.residues():
if res.domains is None:
continue
for domain_name in res.domains:
if domain_name in seen:
continue
seen.add(domain_name)
domains.append(domain_name)
self._domains = domains
return self._domains
[docs] def getSSBondPartner(self, index):
"""
Return the residue's intra-sequence disulfide bond partner, if any.
If the residue is not involved in a disulfide bond, its partner has
been deleted, or its partner is in another sequence, it will return
None.
:param index: Index of the residue to check
:type index: int
:return: the other Residue in the disulfide bond or None
:rtype: schrodinger.protein.residue.Residue or None
"""
current_res = self.sequence[index]
cur_ss_bond = current_res.disulfide_bond
if cur_ss_bond is None or not cur_ss_bond.is_intra_sequence:
return None
res_pair = cur_ss_bond.res_pair
for res in res_pair:
if res != current_res:
return res
# Res is deleted
return None
[docs] def clearAllCaching(self):
self._invalidateLigandContacts()
self.invalidateMaxMinBFactor()
[docs] def getNumAnnValues(self, ann):
if not ann.multi_value:
return 1
if ann in (self.ANNOTATION_TYPES.binding_sites,
self.ANNOTATION_TYPES.kinase_conservation):
return len(self.ligands)
elif ann is self.ANNOTATION_TYPES.domains:
return len(self.domains) if self.domains else 0
else:
raise ValueError(f"The multi-row annotation {ann} lacks a "
"definition for how many rows it takes up.")
[docs]class SequenceAnnotations(AbstractSequenceAnnotations):
"""
Knows how to annotate a single-chain sequence
Annotations can be set at the level of the sequence as a whole, or be per
sequence element annotations. If an attribute is accessed on the
SequenceAnnotations object, the attribute is first looked for on the object
and if not found is assumed to be a per sequence element annotation. If the
elements in the sequence lack the attribute, an AttributeError will be
raised.
"""
def _getSequenceLengthAnnotation(self, annotation_name):
annotation = [
None if elem.is_gap else getattr(elem, annotation_name)
for elem in self.sequence
]
return annotation
def _setSequenceLengthAnnotation(self, annotation_name, values):
self._lengthCheck(values)
for el, val in zip(self.sequence, values):
if val is not None:
setattr(el, annotation_name, val)
def _lengthCheck(self, values):
if len(values) != len(self.sequence):
raise ValueError("Values supplied don't match the sequence length")
def __getattr__(self, attr):
"""
Returns a list of annotations, one for each residue in the sequence.
"""
return self._getSequenceLengthAnnotation(attr)
[docs]class ProteinSequenceAnnotations(AbstractProteinSequenceAnnotationsMixin,
SequenceAnnotations):
"""
Knows how to annotate a ProteinSequence
"""
annotationInvalidated = QtCore.pyqtSignal(object)
invalidatedLigandContacts = QtCore.pyqtSignal()
invalidatedMaxMinBFactor = QtCore.pyqtSignal()
@enum.unique
class ANNOTATION_TYPES(_SequenceAnnotationEnum):
# the enum values must not be changed for backwards compatibility.
# declaration order determines the order in which the annotations are
# displayed.
pairwise_constraints = 1
alignment_set = 2
resnum = 3
rescode = 4
disulfide_bonds = 5
helix_propensity = 6
beta_strand_propensity = 7
turn_propensity = 8
helix_termination_tendency = 9
exposure_tendency = 10
steric_group = 11
side_chain_chem = 12
hydrophobicity = 13
isoelectric_point = 14
b_factor = 15
window_hydrophobicity = 16
window_isoelectric_point = 17
secondary_structure = 18
domains = 20
antibody_cdr = 21
sasa = 22
pfam = 23
pred_disulfide_bonds = 24
pred_secondary_structure = 25
pred_accessibility = 26
pred_disordered = 27
pred_domain_arr = 28
proximity_constraints = 29
binding_sites = 19
kinase_conservation = 31
kinase_features = 30
gpcr_segment = 32
gpcr_generic_number = 33
ANNOTATION_TYPES = util.enum_speedup(ANNOTATION_TYPES)
RES_PROPENSITY_ANNOTATIONS = {
ANNOTATION_TYPES.beta_strand_propensity,
ANNOTATION_TYPES.exposure_tendency, ANNOTATION_TYPES.helix_propensity,
ANNOTATION_TYPES.helix_termination_tendency,
ANNOTATION_TYPES.side_chain_chem, ANNOTATION_TYPES.steric_group,
ANNOTATION_TYPES.turn_propensity
}
PRED_ANNOTATION_TYPES = {
ANNOTATION_TYPES.pred_disulfide_bonds,
ANNOTATION_TYPES.pred_secondary_structure,
ANNOTATION_TYPES.pred_accessibility,
ANNOTATION_TYPES.pred_disordered,
ANNOTATION_TYPES.pred_domain_arr,
}
ANNOTATION_TYPES.resnum.can_expand = True
ANNOTATION_TYPES.rescode.can_expand = True
ANNOTATION_TYPES.helix_propensity.can_expand = True
ANNOTATION_TYPES.beta_strand_propensity.can_expand = True
ANNOTATION_TYPES.turn_propensity.can_expand = True
ANNOTATION_TYPES.helix_termination_tendency.can_expand = True
ANNOTATION_TYPES.exposure_tendency.can_expand = True
ANNOTATION_TYPES.steric_group.can_expand = True
ANNOTATION_TYPES.side_chain_chem.can_expand = True
ANNOTATION_TYPES.secondary_structure.can_expand = True
ANNOTATION_TYPES.domains.can_expand = True
ANNOTATION_TYPES.antibody_cdr.can_expand = True
ANNOTATION_TYPES.pfam.can_expand = True
ANNOTATION_TYPES.pred_secondary_structure.can_expand = True
ANNOTATION_TYPES.pred_accessibility.can_expand = True
ANNOTATION_TYPES.pred_disordered.can_expand = True
ANNOTATION_TYPES.pred_domain_arr.can_expand = True
ANNOTATION_TYPES.binding_sites.can_expand = True
# ANNOTATION_TYPES.kinase_conservation.can_expand = True # TODO MSV-3355
ANNOTATION_TYPES.kinase_features.can_expand = True
ANNOTATION_TYPES.resnum.title = "Residue number"
ANNOTATION_TYPES.resnum.tooltip = "Residue Numbers"
ANNOTATION_TYPES.rescode.title = "Residue code"
ANNOTATION_TYPES.disulfide_bonds.title = _disulfide_bond_title
ANNOTATION_TYPES.disulfide_bonds.tooltip = 'Disulfide Bonds'
ANNOTATION_TYPES.secondary_structure.title = _ssa_title
ANNOTATION_TYPES.secondary_structure.tooltip = "Secondary Structure Assignment"
ANNOTATION_TYPES.window_hydrophobicity.title = 'Hydrophobicity'
ANNOTATION_TYPES.window_isoelectric_point.title = 'Isoelectric Point'
ANNOTATION_TYPES.b_factor.requires_structure = True
ANNOTATION_TYPES.b_factor.tooltip = 'B Factor'
ANNOTATION_TYPES.binding_sites.requires_structure = True
ANNOTATION_TYPES.binding_sites.multi_value = True
ANNOTATION_TYPES.binding_sites.tooltip = "Binding Site (Ligand Contacts)"
ANNOTATION_TYPES.kinase_conservation.title = "Kinase Binding Site Conservation"
ANNOTATION_TYPES.kinase_conservation.requires_structure = True
ANNOTATION_TYPES.kinase_conservation.multi_value = True
ANNOTATION_TYPES.kinase_features.title = "Kinase Features"
ANNOTATION_TYPES.domains.tooltip = "Domains"
ANNOTATION_TYPES.domains.multi_value = True
ANNOTATION_TYPES.antibody_cdr.title = "Antibody CDRs"
ANNOTATION_TYPES.pairwise_constraints.title = "Constraints"
ANNOTATION_TYPES.proximity_constraints.title = "Constraints"
ANNOTATION_TYPES.pred_disulfide_bonds.title = f"{_disulfide_bond_title} (P)"
ANNOTATION_TYPES.pred_disulfide_bonds.tooltip = "Disulfide Bonds Prediction"
ANNOTATION_TYPES.pred_disordered.title = "Disordered (P)"
ANNOTATION_TYPES.pred_disordered.tooltip = "Disordered Prediction"
ANNOTATION_TYPES.pred_secondary_structure.title = f"{_ssa_title} (P)"
ANNOTATION_TYPES.pred_secondary_structure.tooltip = "Secondary Structure Assignment Prediction"
ANNOTATION_TYPES.pred_domain_arr.title = "Domains (P)"
ANNOTATION_TYPES.pred_domain_arr.tooltip = "Domains Prediction"
ANNOTATION_TYPES.pred_accessibility.title = "Accessibility (P)"
ANNOTATION_TYPES.pred_accessibility.tooltip = "Accessibility Prediction"
ANNOTATION_TYPES.gpcr_segment.title = "GPCR Segment"
ANNOTATION_TYPES.gpcr_segment.tooltip = "GPCR Segment"
ANNOTATION_TYPES.gpcr_generic_number.title = "GPCR Generic Number"
ANNOTATION_TYPES.gpcr_generic_number.tooltip = "GPCR Generic Number"
[docs] def __init__(self, seq):
super().__init__(seq)
self.from_maestro = False # If seq has corresponding entry in Maestro.
self.maestro_entry_id = None
self.maestro_chain_name = None
self.chain_id = None
self._binding_site_residues = collections.defaultdict(set)
self._pred_disulfide_bonds = None
self._ligand_distance = 5
self._cdr_finder = None
self._hydrophobicity_window_padding = DEFAULT_WINDOW_PADDING
self._isoelectric_point_window_padding = DEFAULT_WINDOW_PADDING
residue_changed_slots = (self._invalidateLigandContacts,
self._renumberAntibodyCDRSlot,
self._invalidateSASA,
self.invalidateWindowHydrophobicity,
self.invalidateWindowIsoelectricPoint,
self.invalidateMaxMinBFactor,
self._invalidateDomains)
for slot in residue_changed_slots:
self.sequence.residuesChanged.connect(slot)
self.sequence.residuesAdded.connect(self._invalidateAntibodyCDRSlot)
self.sequence.residuesRemoved.connect(self._invalidateAntibodyCDRSlot)
[docs] @QtCore.pyqtSlot()
def invalidateMaxMinBFactor(self):
super().invalidateMaxMinBFactor()
self.invalidatedMaxMinBFactor.emit()
@cached_property
def window_hydrophobicity(self):
return self._getWindowAvgAnnotation("hydrophobicity",
self.hydrophobicity_window_padding)
@property
def hydrophobicity_window_padding(self):
return self._hydrophobicity_window_padding
@hydrophobicity_window_padding.setter
def hydrophobicity_window_padding(self, value):
self._hydrophobicity_window_padding = value
self.invalidateWindowHydrophobicity()
@property
def isoelectric_point_window_padding(self):
return self._isoelectric_point_window_padding
@property
def binding_site_residues(self):
"""
Binding site residues of the sequence as a map, with key being the
ligand name(str) and value is the set of residues(protein.Residue).
"""
if not self._binding_site_residues:
self._updateLigandContacts()
return self._binding_site_residues
@isoelectric_point_window_padding.setter
def isoelectric_point_window_padding(self, value):
self._isoelectric_point_window_padding = value
self.invalidateWindowIsoelectricPoint()
[docs] @QtCore.pyqtSlot()
def invalidateWindowHydrophobicity(self):
"""
Invalidate the cached window hydrophobicity data. Note that this method
is also called from the sequence when the window size changes.
"""
try:
del self.window_hydrophobicity
except AttributeError:
pass
self.annotationInvalidated.emit(
self.ANNOTATION_TYPES.window_hydrophobicity)
@cached_property
def window_isoelectric_point(self):
return self._getWindowAvgAnnotation(
"isoelectric_point", self.isoelectric_point_window_padding)
[docs] @QtCore.pyqtSlot()
def invalidateWindowIsoelectricPoint(self):
"""
Invalidate the cached window isoelectric point data. Note that this
method is also called from the sequence when the window size changes.
"""
try:
del self.window_isoelectric_point
except AttributeError:
pass
self.annotationInvalidated.emit(
self.ANNOTATION_TYPES.window_isoelectric_point)
@cached_property
def sasa(self):
try:
finder = _SASAFinder(self.sequence)
except ValueError:
vals = [None] * len(self.sequence)
else:
vals = finder.getSASAs()
return TupleWithRange(vals)
@QtCore.pyqtSlot()
def _invalidateSASA(self):
"""
Invalidates the cached SASA data.
The connected signal emits (int, int) and the slot decorator must match
the signal, but the method does not need to have the params.
"""
try:
del self.sasa
except AttributeError:
pass
self.annotationInvalidated.emit(self.ANNOTATION_TYPES.sasa)
def _getWindowAvgAnnotation(self, annotation_name, window_padding):
"""
Calculate values for the specified window averaged annotation for the
entire sequence.
:param annotation_name: name of annotation to average in a window
:type annotation_name: str
:param window_padding: number of values to average over, on both
the left and the right; window_padding=4 gives window size of 9
:type window_padding: int
:return: list containing sliding window averages of annotation
values specified by annotation_name
:rtype: list(float or None)
"""
anno_vals = [
getattr(res, annotation_name) if res.is_res else None
for res in self.sequence
]
if window_padding == 0:
return TupleWithRange(anno_vals)
anno_vals_wo_gaps = [val for val in anno_vals if val is not None]
avgs_wo_gaps = get_rolling_average(anno_vals_wo_gaps, window_padding)
if not avgs_wo_gaps:
# The sequence is smaller than the window size, so we can't
# calculate any average values
return TupleWithRange([None] * len(anno_vals))
# Figure out the index of the first residue that has a fully populated
# window (i.e. the first residue with `window_padding` non-gap residues
# before it)
res_count = 0
start_index = 0
for i, val in enumerate(anno_vals):
if val is not None:
res_count += 1
if res_count == window_padding:
start_index = i + 1
break
# copy values from avgs_wo_gaps into avgs_with_gaps, taking gap
# locations into account
avgs_index = 0
avgs_with_gaps = [None] * len(anno_vals)
for i, val in enumerate(anno_vals[start_index:], start=start_index):
if val is not None:
avgs_with_gaps[i] = avgs_wo_gaps[avgs_index]
avgs_index += 1
if avgs_index == len(avgs_wo_gaps):
# We've reached the last residue that has a fully poulated
# window. There are exactly `window_padding` non-gap
# residues afer this one, which we can confirm by
# uncommenting the assertion below. (This should normally be
# left commented out for performance reasons.)
# assert sum(1 for val in anno_vals[i + 1:] if val
# is not None) == window_padding
break
return TupleWithRange(avgs_with_gaps)
[docs] def getAntibodyCDR(self, col, scheme):
# See AbstractProteinSequenceAnnotationMixin for method documentation
for cdr in self.getAntibodyCDRs(scheme):
if cdr.start <= col <= cdr.end:
return cdr
return AntibodyCDR(label=AntibodyCDRLabel.NotCDR, start=None, end=None)
[docs] def getAntibodyCDRs(self, scheme):
# See AbstractProteinSequenceAnnotationMixin for method documentation
if not self.sequence.getGaplessLength():
return []
if self._cdr_finder is None:
self._cdr_finder = _AntibodyCDRFinder(self.sequence)
return self._cdr_finder.getCDRs(scheme)
[docs] def isAntibodyChain(self):
# See AbstractProteinSequenceAnnotationMixin for method documentation
if not self.sequence.getGaplessLength():
return []
if self._cdr_finder is None:
self._cdr_finder = _AntibodyCDRFinder(self.sequence)
return self._cdr_finder.isAntibodyChain()
[docs] def isAntibodyHeavyChain(self):
# See AbstractProteinSequenceAnnotationMixin for method documentation
if not self.isAntibodyChain():
return False
return self._cdr_finder.isAntibodyHeavyChain()
[docs] def isAntibodyLightChain(self):
# See AbstractProteinSequenceAnnotationMixin for method documentation
if not self.isAntibodyChain():
return False
return self._cdr_finder.isAntibodyLightChain()
@QtCore.pyqtSlot()
def _invalidateAntibodyCDRSlot(self):
"""
Invalidates the cached antibody CDR data.
"""
self._invalidateAntibodyCDRs(recalculate=True)
@QtCore.pyqtSlot()
def _renumberAntibodyCDRSlot(self):
"""
Renumbers the cached antibody CDR data.
"""
self._invalidateAntibodyCDRs(recalculate=False)
def _invalidateAntibodyCDRs(self, recalculate=True):
"""
Invalidate the Antibody CDR cache. If we've only added gaps to the
sequence, we only need to re-assign the CDR indexes, but
if we add or insert residues, we need to recalculate CDRs entirely.
:param recalculate: whether to recalculate CDRs entirely
:type recalculate: bool
"""
if recalculate:
self._cdr_finder = None
elif self._cdr_finder is not None:
self._cdr_finder.forceIndexReassignment()
[docs] def getSparseRescodes(self, modulo):
codes = []
for res in self.sequence:
if res.is_res and res.resnum and res.resnum % modulo == 0:
codes.append(res.rescode)
else:
codes.append(None)
return codes
[docs] def onStructureChanged(self):
self._invalidateLigandContacts()
self._invalidateAntibodyCDRs(recalculate=True)
self._invalidateSASA()
[docs] def setLigandDistance(self, distance):
# See AbstractProteinSequenceAnnotationsMixin for method documentation
if distance == self._ligand_distance:
return
self._ligand_distance = distance
self._invalidateLigandContacts()
@QtCore.pyqtSlot()
def _invalidateLigandContacts(self):
# See AbstractProteinSequenceAnnotationsMixin for method documentation
super()._invalidateLigandContacts()
self._binding_site_residues.clear()
self.invalidatedLigandContacts.emit()
def _updateLigandContacts(self):
# See AbstractProteinSequenceAnnotationsMixin for method documentation
try:
contacts_finder = _LigandContactsFinder(self.sequence)
except ValueError:
self._ligands = []
self._ligand_asls = []
self._binding_sites = [[] for _ in self.sequence]
return
far_lig_dist = self._ligand_distance + 1
close_lig_dist = self._ligand_distance - 1
ligands, lig_asls = contacts_finder.getLigandsWithin(far_lig_dist)
binding_sites = [
[BINDING_SITE.NoContact] * len(ligands) for _ in self.sequence
]
for i, lig in enumerate(ligands):
far_idxs = contacts_finder.resIndexesNear(lig, far_lig_dist)
close_idxs = contacts_finder.resIndexesNear(lig, close_lig_dist)
for idx in far_idxs:
binding_sites[idx][i] = BINDING_SITE.FarContact
self._binding_site_residues[lig].add(self.sequence[idx])
for idx in close_idxs:
binding_sites[idx][i] = BINDING_SITE.CloseContact
self._binding_site_residues[lig].add(self.sequence[idx])
self._ligand_asls = lig_asls
self._binding_sites = binding_sites
need_signal = (self._ligands != ligands)
self._ligands = ligands
if need_signal:
self.titleChanged.emit()
@QtCore.pyqtSlot()
def _invalidateDomains(self):
"""
Clears the domain annotation cache.
"""
if self._domains is not None:
self._domains = None
self.invalidatedDomains.emit()
[docs] def parseDomains(self, filename):
"""
Parse XML file from UniProt database to get domain information.
:param filename: the XML file to parse for domain information
:type filename: str
:return: a list of the domains (names) for the sequence in order
:rtype: list(str)
"""
domains = []
record = SeqIO.read(filename, "uniprot-xml")
residues_by_resnum = collections.defaultdict(list)
for res in self.sequence.residues():
residues_by_resnum[res.resnum].append(res)
for feature in record.features:
if feature.type != "domain":
continue
name = feature.qualifiers.get('description')
if name is None:
continue
location = feature.location
start = int(location.start)
end = int(location.end)
any_res_found = False
for domain_resnum in range(start, end + 1):
residues = residues_by_resnum.get(domain_resnum)
if residues is None:
continue
any_res_found = True
for res in residues:
if res.domains is None:
res.domains = {name}
else:
res.domains.add(name)
if any_res_found:
domains.append(name)
self._domains = domains
self._domains_parsed = True
self.domainsChanged.emit()
return domains
[docs] def resetAnnotation(self, ann):
"Force a reset of an annotation's cache."
if ann is self.ANNOTATION_TYPES.binding_sites:
self._invalidateLigandContacts()
elif ann is self.ANNOTATION_TYPES.antibody_cdr:
self._invalidateAntibodyCDRs()
elif ann is self.ANNOTATION_TYPES.window_hydrophobicity:
self.invalidateWindowHydrophobicity()
elif ann is self.ANNOTATION_TYPES.window_isoelectric_point:
self.invalidateWindowIsoelectricPoint()
elif ann is self.ANNOTATION_TYPES.sasa:
self._invalidateSASA()
elif ann is self.ANNOTATION_TYPES.domains:
self._invalidateDomains()
[docs] def clearAllCaching(self):
for ann in self.ANNOTATION_TYPES:
self.resetAnnotation(ann)
@property
def inscode(self):
return self._getSequenceLengthAnnotation('inscode')
@inscode.setter
def inscode(self, codes):
self._setSequenceLengthAnnotation('inscode', codes)
@property
def resnum(self):
return self._getSequenceLengthAnnotation('resnum')
@resnum.setter
def resnum(self, nums):
self._setSequenceLengthAnnotation('resnum', nums)
[docs] def getCDRResidueList(self, scheme):
"""
:return: List of CDR Residues.
:rtype: List[str]
"""
if self._cdr_finder is None:
self._cdr_finder = _AntibodyCDRFinder(self.sequence)
if scheme is not self._cdr_finder.scheme:
self._cdr_finder.updateScheme(scheme)
return self._cdr_finder.getResidueList()
[docs]class NucleicAcidSequenceAnnotations(ProteinSequenceAnnotations):
# For the alpha release, we only have "limited support" of nucleic acids,
# so this class is just a stub.
# TODO (MSV-1504): Create proper nucleic acid domain objects
[docs] def isAntibodyChain(self):
return False
###############################################################################
# Global (Alignment) level Annotations
###############################################################################
[docs]class ProteinAlignmentAnnotations(object):
"""
Knows how to annotate an alignment (a collection of aligned sequences)
"""
@enum.unique
class ANNOTATION_TYPES(_AnnotationEnum):
# the enum values must not be changed for backwards compatibility.
# declaration order determines the order in which the annotations are
# displayed.
indices = 1
mean_hydrophobicity = 2
mean_isoelectric_point = 3
consensus_symbols = 4
consensus_seq = 5
consensus_freq = 6
sequence_logo = 7
ANNOTATION_TYPES = util.enum_speedup(ANNOTATION_TYPES)
ANNOTATION_TYPES.mean_hydrophobicity.tooltip = 'Mean Hydrophobicity'
ANNOTATION_TYPES.mean_isoelectric_point.tooltip = 'Mean Isoelectric Point'
ANNOTATION_TYPES.consensus_symbols.tooltip = 'Consensus Symbols'
ANNOTATION_TYPES.consensus_seq.tooltip = 'Consensus Sequence'
ANNOTATION_TYPES.sequence_logo.tooltip = 'Sequence Logo'
ANNOTATION_TYPES.indices.title = ""
# avoid circular refs between alignments and annotations
alignment = util.WeakRefAttribute()
[docs] def __init__(self, aln):
"""
:param aln: `alignment.Alignment`
"""
self.alignment = aln
def _alnMeanAnnotation(self, annotation_name):
"""
returns: Per-column averages of annotation values specified by
annotation_name, or 0.0 if columns includes only gaps.
"""
columns = list(self.alignment.columns(omit_gaps=True, match_type=True))
means = []
for column in columns:
values_with_missing = (
getattr(res, annotation_name) for res in column)
values = [val for val in values_with_missing if val is not None]
divisor = float(len(values)) if values else 1.0
mean = sum(values) / divisor
means.append(mean)
return TupleWithRange(means)
@cached_property
def indices(self):
"""
A numbering of all the column indices in an alignment
"""
return tuple(range(1, self.alignment.num_columns + 1))
@cached_property
def mean_hydrophobicity(self):
"""
returns: A list of floats representing per-column averages of the
hydrophobicity of residues in the alignment
"""
return self._alnMeanAnnotation('hydrophobicity')
@cached_property
def mean_isoelectric_point(self):
"""
returns: A list of floats representing per-column averages of the
isoelectric point of residues in the alignment
"""
return self._alnMeanAnnotation('isoelectric_point')
@cached_property
def consensus_seq(self):
"""
Consensus sequence in the alignment. If there is more than one
highest freq. residue in the column, save all of them.
:return: consensus sequence
:rtype: list(list(schrodinger.protein.residue.Residue))
"""
seq = []
for most_common in self.alignment.getFrequencies(normalize=False):
if most_common:
_, max_freq = most_common[0]
cons = [res for res, freq in most_common if freq == max_freq]
else:
cons = []
seq.append(cons)
return seq
@cached_property
def consensus_freq(self):
"""
Returns the frequency of the consensus residue in each
alignment column as a list. Gaps are not used for calculation.
:return: consensus residue frequencies
:rtype: TupleWithRange(float)
"""
consensus_only = []
for column in self.alignment.getFrequencies():
if column:
freq_percentage = round(float(column[0][1] * 100), 2)
consensus_only.append(freq_percentage)
else: # gap-only columns
consensus_only.append(0.00)
return TupleWithRange(consensus_only)
@cached_property
def consensus_symbols(self):
"""
Consensus symbols in the alignment based on pre-defined residue sets,
same as in ClustalW
:return: consensus symbols for each alignment position
:type: A list of ConsensusSymbol enums.
"""
strong_sets = [
set("STA"),
set("NEQK"),
set("NHQK"),
set("NDEQ"),
set("QHRK"),
set("MILV"),
set("MILF"),
set("HY"),
set("FYW")
]
weak_sets = [
set("CSA"),
set("ATV"),
set("SAG"),
set("STNK"),
set("STPA"),
set("SGND"),
set("SNDEQK"),
set("NDEQHK"),
set("NEQHRK"),
set("FVLIM"),
set("HFY")
]
levels = []
for column in self.alignment.columns(omit_gaps=True, match_type=True):
level = Consensus.not_conserved
if column:
column_res = set(res.short_code for res in column)
if len(column_res) == 1:
# fully conserved to a single residue
level = Consensus.fully_conserved
elif any(column_res.issubset(strong) for strong in strong_sets):
# conserved to one of the strong sets
level = Consensus.strongly_conserved
elif any(column_res.issubset(weak) for weak in weak_sets):
# conserved to one of the weak sets
level = Consensus.weakly_conserved
levels.append(level)
return levels
@cached_property
def sequence_logo(self):
"""
Calculates normalized frequencies of individual amino acids per
alignment position, and overall estimate of column composition
diversity ('bits'). Bit values are weighted by the number of gaps in
the column.
Schneider TD, Stephens RM (1990). "Sequence Logos: A New Way to
Display Consensus Sequences". Nucleic Acids Res 18 (20): 6097–6100.
doi:10.1093/nar/18.20.6097
:return: the list of bits and frequencies (in decreasing order) of the
residues in each column of the alignment.
:rtype: list(tuple(float, tuple(tuple(str, float))))
"""
alignment = self.alignment
logo = []
num_seqs = len(alignment.getSeqsMatchingRefType())
for (frequencies,
column) in zip(alignment.getFrequencies(),
alignment.columns(omit_gaps=True, match_type=True)):
if frequencies:
# Calculate column uncertainty
uncertainty = 0.00
for (_, freq) in frequencies:
uncertainty -= freq * math.log(freq, 2.0)
# Bits represent amount of information at column, weighted by
# number of gaps.
bit_weight = len(column) / num_seqs
bits = (LOGO_MAX_DIVERSITY - uncertainty) * bit_weight
entry = (bits, frequencies)
else: # gap-only columns
entry = (LOGO_MAX_DIVERSITY, ())
logo.append(entry)
return logo
[docs] def clearAllCaching(self):
for attr in ("indices", "mean_hydrophobicity", "mean_isoelectric_point",
"consensus_seq", "consensus_freq", "consensus_symbols",
"sequence_logo"):
try:
delattr(self, attr)
except AttributeError:
pass
class _LigandContactsFinder(object):
"""
An class that finds the ligands that are close to a sequence, as well as the
residues that are close to those ligands. Note that this only makes sense
for sequences that have structures.
"""
seq = util.WeakRefAttribute()
def __init__(self, seq):
"""
:param seq: The sequence to find the ligand contacts of
:type seq: `schrodinger.protein.sequence.Sequence`
:raises ValueError: if the sequence has no associated structure
"""
ct = seq.getStructure()
if ct is None:
raise ValueError("Ligand contacts can only be found for "
"a sequence with a structure.")
self.seq = seq
self.ct = ct
self._all_ligands = analyze.find_ligands(self.ct)
self._lig_name_dict = {
self._makeLigandName(lig): lig for lig in self._all_ligands
}
self._res_dict = {(res.resnum, res.inscode): i
for (i, res) in enumerate(seq)
if res.is_res}
def _makeLigandName(self, lig):
"""
:param lig: the ligand to make a name for
:type lig: schrodinger.structutils.analyze.Ligand
:return: the name for the ligand
:rtype: string
"""
return make_ligand_name(self.ct, lig)
def getLigandsWithin(self, dist):
"""
Find all ligands that are within a certain distance of a sequence.
:param dist: The distance around the sequence to search for ligands
:type dist: int
:return: a tuple of ligand names and a tuple of ligand ASLs
:rtype: tuple[tuple[str], tuple[str]]
"""
ligand_names_and_asls = []
for lig in self._all_ligands:
query = ("(within {dist} ({ligand_asl})) "
"and (protein) "
"and (chain. {chain_id})").format(
ligand_asl=lig.ligand_asl,
dist=dist,
chain_id=self.seq.structure_chain)
if analyze.evaluate_asl(self.ct, query):
# lig.ligand_asl can be "too compact" (e.g. `m.n 2`) so
# construct a more human-readable ASL
atom = self.ct.atom[lig.atom_indexes[0]]
lig_asl = f"(res. {atom.resnum} and res.ptype {atom.pdbres})"
ligand_names_and_asls.append(
(self._makeLigandName(lig), lig_asl))
if not ligand_names_and_asls:
return (), ()
ligand_names_and_asls.sort()
# Split name-ASL tuples into names and ASLs
lig_names, lig_asls = zip(*ligand_names_and_asls)
return lig_names, lig_asls
def resIndexesNear(self, lig_name, dist):
"""
Find all residues that are within a certain distance of a ligand.
:param lig_name: the name of the ligand to search for nearby residues
:type lig_name: string
:param dist: the distance around the ligand to search for residues
:type dist: int
:return: indexes of the residues in the sequence that are close
to the ligand
:rtype: set(int)
"""
try:
lig = self._lig_name_dict[lig_name]
except KeyError:
raise KeyError("Ligand {lig_name} not found on structure.".format(
lig_name=lig_name))
# Find atom indexes in structure within `dist` angstroms of the ligand
# which are on the sequence's chain
query = ("(within {dist} ({ligand_asl})) "
"and (protein) "
"and (chain. {chain_id})").format(
ligand_asl=lig.ligand_asl,
dist=dist,
chain_id=self.seq.structure_chain)
nearby_atom_indexes = analyze.evaluate_asl(self.ct, query)
# Get residue indexes in the sequence corresponding to those atoms
res_indexes = set()
for atom_index in nearby_atom_indexes:
atom = self.ct.atom[atom_index]
res_key = (atom.resnum, atom.inscode)
# FIXME: (MSV-1371)
# Remove this check once residue insertion/deletion is
# synchronized between the MSV and the workspace.
if res_key not in self._res_dict:
continue
res_index = self._res_dict[res_key]
res_indexes.add(res_index)
return res_indexes
class _AntibodyCDRFinder(object):
"""
Class to help with finding CDRs ("complementary determining regions") in
an antibody chain. There are different numbering schemes for identifying
where the CDRs span. Converting between the schemes doesn't cost much time,
but determining the where the CDRs are in the first place is rather expensive
(~0.3s), so we have to be careful about caching things.
"""
seq = util.WeakRefAttribute()
def __init__(self, seq):
"""
:param seq: The sequence to find the CDRs on
:type seq: `schrodinger.protein.sequence.ProteinSequence`
"""
if antibody is _MODULE_NOT_YET_IMPORTED:
# in-function import to avoid circular import
_delayed_antibody_import()
if antibody is None:
warnings.warn('Prime not installed; antibody CDRs cannot be found!')
# define dummy attributes so that we can still access this object
# normally, but it won't report any CDRs
self.getCDRs = lambda scheme: []
self.forceIndexReassignment = lambda: None
self.isAntibodyChain = lambda: False
return
# normal instantiation when Prime and Clustal are present
self.seq = seq
self.scheme = AntibodyCDRScheme.Chothia
self.seq_type = _SeqType(self.seq, self.scheme.name)
self._cdrs = None
def getCDRs(self, scheme):
"""
Returns the antibody CDRs present on the sequence, numbered by the given
numbering scheme.
:param scheme: The antibody CDR numbering scheme to use
:type scheme: `AntibodyCDRScheme` enum
:returns: the antibody CDRs on the sequence
:rtype: list of `AntibodyCDR` if the sequence is an antibody chain,
otherwise an empty list
"""
if scheme is not self.scheme:
self.updateScheme(scheme)
if self._cdrs is None:
self._cdrs = self._extractCDRs()
return self._cdrs
def updateScheme(self, scheme):
"""
Update the numbering scheme by asking the antibody.SeqType "backend".
:param scheme: The antibody CDR numbering scheme to use
:type scheme: `AntibodyCDRScheme` enum
"""
self.seq_type.convertScheme(scheme.name)
self.scheme = scheme
self.forceIndexReassignment()
def forceIndexReassignment(self):
"""
Force a recalculation of the CDR start and end indices. This is required
when the numbering scheme changes and when gaps are inserted/removed.
"""
self._cdrs = None
def _extractCDRs(self):
"""
Extract the CDR information from the antibody.SeqType "backend" that
calculated them. Note that the calculated indexes ignore gaps, so we
need to translate them back into actual indexes into the sequence.
:returns: the antibody CDRs on the sequence
:rtype: list of `AntibodyCDR` if the sequence is an antibody chain,
otherwise an empty list
"""
if not self.seq:
# SeqType doesn't create attributes for blank sequences, so we'd get
# a traceback below
return []
index_map = [i for (i, res) in enumerate(self.seq) if res.is_res]
cdr_indexes = ((index_map[start], index_map[end])
for start, end in self.seq_type.cdr_index)
cdr_labels = self.seq_type.cdr_label
return [
AntibodyCDR(label=AntibodyCDRLabel[label], start=start, end=end)
for label, (start, end) in zip(cdr_labels, cdr_indexes)
]
def isAntibodyChain(self):
"""
:returns: Whether the sequence is an antibody chain
:rtype: bool
"""
return bool(self.seq_type.cdr_label)
def isAntibodyHeavyChain(self):
"""
:returns: Whether the sequence is an antibody heavy chain
:rtype: bool
"""
return self.seq_type.isHeavyChain()
def isAntibodyLightChain(self):
"""
:returns: Whether the sequence is an antibody light chain
:rtype: bool
"""
return self.seq_type.isLightChain()
def getResidueList(self):
return self.seq_type.resid_list
def _prep_seq_for_prime(seq):
"""
Convert a ProteinSequence to an uppercase gapless seq str
"""
seq_str = str(seq).replace(seq.gap_char, "")
seq_str = seq_str.upper()
return seq_str
[docs]class SeqTypeMixin:
"""
Mixin to customize antibody.SeqType for MSV2. See _delayed_antibody_import
for class declaration.
"""
_HEAVY_TYPES = {"heavy"}
_LIGHT_TYPES = {"light_lambda", "light_kappa"}
[docs] def __init__(self, seq, *args, **kwargs):
"""
:type seq: schrodinger.protein.sequence.ProteinSequence
"""
seq_str = _prep_seq_for_prime(seq)
super().__init__(seq_str, *args, **kwargs)
[docs] def isHeavyChain(self):
return self.type in self._HEAVY_TYPES
[docs] def isLightChain(self):
return self.type in self._LIGHT_TYPES
class _SASAFinder(object):
"""
Class to help with calculating the solvent-accessible surface area on each
residue in a sequence.
"""
seq = util.WeakRefAttribute()
def __init__(self, seq):
"""
:param seq: The sequence to find the residue SASAs of
:type seq: schrodinger.protein.sequence.ProteinSequence
:raises ValueError: if the sequence doesn't have a structure
"""
self.seq = seq
ct = seq.getStructure()
if ct is None:
raise ValueError("SASA can only be calculated for sequences "
"with structure")
self.ct = ct.chain[seq.chain].extractStructure()
def getSASAs(self):
"""
:returns: The SASA of each residue, by index in the sequence. For
gaps in the sequence or structureless residues, the SASA is None.
:rtype: list of (float or None)
"""
sasa_by_atom = analyze.calculate_sasa_by_atom(self.ct)
# Iterating through residues in the structure may not correspond to
# residues in the sequence (because of gaps, solvents, ligands, etc),
# so we key each residue's sasa by residue number and insertion code.
sasas_by_res = {}
for res in self.ct.residue:
key = (res.resnum, res.inscode)
sasa = sum(sasa_by_atom[a.index - 1] for a in res.atom)
sasas_by_res[key] = sasa
sasas = [None] * len(self.seq)
for i, res in enumerate(self.seq):
if res.hasStructure():
key = (res.resnum, res.inscode)
sasas[i] = sasas_by_res[key]
return sasas
[docs]class CombinedChainProteinSequenceAnnotations(
AbstractProteinSequenceAnnotationsMixin,
AbstractSequenceAnnotations,
metaclass=CombinedChainSequenceAnnotationMeta,
wraps=ProteinSequenceAnnotations,
cached_annotations=("sasa", "window_hydrophobicity",
"window_isoelectric_point", "domains"),
wrapped_properties=("hydrophobicity_window_padding",
"isoelectric_point_window_padding", "domains")):
"""
Sequence annotations for a `sequence.CombinedChainProteinSequence`.
Annotations will be fetched from the `ProteinSequenceAnnotations` objects
for each split-chain sequence.
"""
sequence = util.WeakRefAttribute()
[docs] def __init__(self, seq):
"""
:param seq: The sequence to store annotations for.
:type seq: sequence.CombinedChainProteinSequence
"""
super().__init__(seq)
for chain in seq.chains:
self._connectSignals(chain)
def _getSignalsAndSlots(self, chain):
return [
(chain.annotations.titleChanged, self.titleChanged),
(chain.annotations.invalidatedLigandContacts,
self._invalidateLigandContacts),
(chain.annotations.invalidatedMaxMinBFactor,
self.invalidateMaxMinBFactor),
(chain.annotations.invalidatedDomains, self.invalidatedDomains),
(chain.annotations.annotationInvalidated,
self._invalidateAnnotation),
]
def _connectSignals(self, chain):
for signal, slot in self._getSignalsAndSlots(chain):
signal.connect(slot)
def _disconnectSignals(self, chain):
for signal, slot in self._getSignalsAndSlots(chain):
signal.disconnect(slot)
[docs] def chainAdded(self, chain):
"""
Respond to a new chain being added to the sequence. The sequence is
responsible for calling this method whenever a chain is added.
:param chain: The newly added chain.
:type chain: sequence.ProteinSequence
"""
self._connectSignals(chain)
self._invalidateLigandContacts()
[docs] def chainRemoved(self, chain):
"""
Respond to a chain being removed from the sequence. The sequence is
responsible for calling this method whenever a chain is removed.
:param chain: The removed chain.
:type chain: sequence.ProteinSequence
"""
self._disconnectSignals(chain)
self._invalidateLigandContacts()
def _adjustCDR(self, cdr, offset):
"""
Adjust the start and end residue indices in the given antibody CDR.
:param cdr: The antibody CDR to adjust.
:type cdr: AntibodyCDR
:param offset: The value to offset the indices by.
:type offset: int
:return: The adjusted CDR.
:rtype: AntibodyCDR
"""
# if the label is NotCDR, both start and end will be None
if cdr.label is AntibodyCDRLabel.NotCDR or offset == 0:
return cdr
else:
return AntibodyCDR(cdr.label, cdr.start + offset, cdr.end + offset)
[docs] def getAntibodyCDR(self, col, scheme):
# See AbstractProteinSequenceAnnotationMixin for method documentation
seq, seq_index, offset = self.sequence.indexToSeqAndIndex(col)
cdr = seq.annotations.getAntibodyCDR(seq_index, scheme)
return self._adjustCDR(cdr, offset)
[docs] def getAntibodyCDRs(self, scheme):
# See AbstractProteinSequenceAnnotationMixin for method documentation
cdrs = []
for chain, offset in zip(self.sequence.chains,
self.sequence.chain_offsets):
for cur_cdr in chain.annotations.getAntibodyCDRs(scheme):
cdrs.append(self._adjustCDR(cur_cdr, offset))
return cdrs
[docs] def isAntibodyChain(self):
# See AbstractProteinSequenceAnnotationMixin for method documentation
return any(chain.annotations.isAntibodyChain()
for chain in self.sequence.chains)
[docs] def setLigandDistance(self, distance):
# See AbstractProteinSequenceAnnotationsMixin for method documentation
for chain in self.sequence.chains:
chain.annotations.setLigandDistance(distance)
def _updateLigandContacts(self):
# See AbstractProteinSequenceAnnotationsMixin for method documentation
all_ligs = set(
itertools.chain.from_iterable(
chain.annotations.ligands for chain in self.sequence.chains))
self._ligands = sorted(all_ligs)
if not all_ligs:
self._binding_sites = [[] for _ in range(len(self.sequence))]
return
lig_data = {lig: [] for lig in all_ligs}
for chain in self.sequence.chains:
remaining_ligs = set(all_ligs)
cur_ann = chain.annotations
binding_site_per_lig = list(zip(*cur_ann.binding_sites))
for lig, binding_site in zip(cur_ann.ligands, binding_site_per_lig):
remaining_ligs.remove(lig)
lig_data[lig].extend(binding_site)
for lig in remaining_ligs:
lig_data[lig].extend([BINDING_SITE.NoContact] * len(chain))
binding_sites = [lig_data[lig] for lig in self._ligands]
self._binding_sites = list(zip(*binding_sites))
def _invalidateAnnotation(self, ann):
"""
Invalidate any caching for the specified annotation.
:param ann: The annotation to invalidate caching for.
:type ann: ProteinSequenceAnnotations.ANNOTATION_TYPES
"""
try:
delattr(self, ann.name)
except AttributeError:
pass
[docs] def clearAllCaching(self):
super().clearAllCaching()
# _cached_annotations is populated in the metaclass based on the
# cached_annotations argument on the class declaration line
for ann in self._cached_annotations:
self._invalidateAnnotation(ann)
[docs]def make_ligand_name_atom(ct, atom_index):
"""
Make a unique, human-readable name for a ligand identified by atom index.
:param ct: Structure the ligand belongs to
:type ct: schrodinger.structure.Structure
:param atom_index: the atom index of the ligand to make a name for
:type atom_index: int
:return: The name for the ligand
:rtype: str
"""
lig_atom = ct.atom[atom_index]
return f'{lig_atom.chain}: {lig_atom.pdbres.strip()} {lig_atom.resnum:d}'
[docs]def make_ligand_name(ct, ligand):
"""
Make a unique, human-readable name for a ligand. This name matches the
ligand name in the structure hierarchy.
:param ct: Structure the ligand belongs to
:type ct: schrodinger.structure.Structure
:param ligand: the ligand to make a name for
:type ligand: schrodinger.structutils.analyze.Ligand
:return: The name for the ligand
:rtype: str
"""
return make_ligand_name_atom(ct, ligand.atom_indexes[0])
[docs]def parse_antibody_rescode(newcode):
"""
Extract the resnum and inscode from residue number as per the scheme.
If the inscode is a number it will be converted to alphabet.
eg: 'H101.1' -> '101A'.
Residues that are outside of the numbering scheme catalog (FV) or can
not be assigned properly, will have residue number as '-1'. eg: 'H-1'
:param newcode: Residue code by the Antibody CDR numbering scheme.
:type newcode: str
:return: new residue number and insertion code.
:rtype: tuple
:raises KeyError: if newcode doesn't follow the expected pattern.
"""
pattern = r'[A-Z_](-?\d+)\.?([A-Za-z]?[0-9]*)?'
match = re.fullmatch(pattern, newcode)
if match:
res_num, inscode = match.groups()
res_num = int(res_num)
if inscode:
# Convert number to corresponding letter: eg: 1->A, 2->B. If the
# number > 27 inscode will be None.
if inscode.isnumeric():
if int(inscode) < 27:
inscode = chr(ord('@') + int(inscode))
else:
inscode = None
res_num = None
else:
inscode = ' '
else:
raise KeyError(f'Unknown residue code "{newcode}"')
return res_num, inscode