"""
Implementation of ProteinSequence, Sequence, and StructureSequence class.
StructureSequence allows iteration over all sequences in a given protein CT,
and iteration over residues of each (in sequence order).
"""
import bisect
import collections
import copy
import dataclasses
import enum
import itertools
import re
import string
import types
import typing
from collections import namedtuple
from functools import partial
from typing import Dict
from past.utils import old_div
from Bio.Seq import translate
from schrodinger import structure
from schrodinger.application.msv import utils as msv_utils
from schrodinger.infra import mm
from schrodinger.models import json
from schrodinger.protein import annotation
from schrodinger.protein import constants
from schrodinger.protein import properties
from schrodinger.protein import residue
from schrodinger.protein.constants import AMINO_ACIDS_3_TO_1
from schrodinger.protein.properties import PropertyType
from schrodinger.Qt import QtCore
from schrodinger.structutils import analyze
from schrodinger.test import ioredirect
from schrodinger.ui.sequencealignment.sequence import delete_from_str
from schrodinger.ui.sequencealignment.structure_utils import calculate_sasa_dict
SEQ_ANNO_TYPES = annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES
_SecondaryStructure = namedtuple('_SecondaryStructure', ['limits', 'ssa_type'])
DISALLOWED_SEQ_CHARS = re.compile(r"[^a-zA-Z0-9~\.\-]")
[docs]def guess_seq_type(res_names):
"""
Takes an iterable of residue names and returns the appropriate sequence
class
:param res_names: An iterable of residue names. Note that all residue names
must be uppercase.
:type res_names: Iterable(str)
:return: The appropriate class for the input sequence
:rtype: Type[AbstractSingleChainSequence]
Note that we use a pretty simple heuristic here.
"""
res_names = set(res_names)
seq_classes = [RNASequence, DNASequence, ProteinSequence]
element_names = []
for seq_class in seq_classes:
eles = set(seq_class.ALPHABET.keys())
unk = seq_class._unknown_res_type
eles.add(unk.short_code)
eles.add(unk.long_code)
element_names.append(eles)
# Determine type by the highest number of matching pdbres names
match_counts = [
len(res_names & seq_elem_names) for seq_elem_names in element_names
]
if not any(match_counts):
return ProteinSequence
idxmax = match_counts.index(max(match_counts))
return seq_classes[idxmax]
[docs]def make_sequence(elements, *args, **kwargs):
"""
Guesses the appropriates Sequence type from the names of residues and
returns an instance with those residues.
:param elements: A list of strings to examine
:type elements: list(str)
:rtype: protein.Sequence
:return: An instance of the appropriate type
"""
SeqClass = guess_seq_type(elements)
return SeqClass(elements, *args, **kwargs)
[docs]class Sequence:
# We use this as a base class for both actual sequence objects and
# read-only sequence proxies. This is necessary for PyQt signal typing.
pass
[docs]class AbstractSequence(Sequence, QtCore.QObject):
"""
A base class for single-chain and combined-chain biological sequences.
:cvar ORIGIN: Possible sequence origins
:vartype ORIGIN: enum.Enum
:cvar AnnotationClass: Class to use for annotations
:vartype AnnotationClass: annotation.SequenceAnnotations
:cvar ElementClass: Class to use for elements
:vartype ElementClass: Type[residue.Residue]
:cvar ALPHABET: A mapping of string representations of elements to
element types. Concrete subclasses must define this.
:vartype ALPHABET: dict(str, residue.ElementType)
:cvar _gap_chars: A tuple of permissible gap characters in the element list;
the first item will be used for serialization.
:vartype _gap_chars: tuple(str)
:cvar _unknown_res_type: The type for an unknown residue
:vartype _unknown_res_type: residue.ElementType
:cvar residuesChanged: A signal emitted when sequence residues are changed.
Emitted with the indices of the first and last changed residues.
:vartype residuesChanged: QtCore.pyqtSignal
:cvar lengthAboutToChange: A signal emitted when the sequence length is
about to change. Emitted with the old and new lengths.
:vartype lengthAboutToChange: QtCore.pyqtSignal
:cvar lengthChanged: A signal emitted when the sequence length is changed.
Emitted with the old and new lengths.
:vartype lengthChanged: QtCore.pyqtSignal
:cvar nameChanged: A signal emitted when the sequence name is changed.
:vartype nameChanged: QtCore.pyqtSignal
:cvar visibilityChanged: A signal emitted when the visibility is changed.
:vartype visibilityChanged: QtCore.pyqtSignal
:cvar structureChanged: A signal emitted when the structure changes.
:vartype structureChanged: QtCore.pyqtSignal
:cvar annotationTitleChanged: A signal emitted when an annotation title
is changed.
:vartype annotationTitleChanged: QtCore.pyqtSignal
"""
ORIGIN = enum.Enum("ORIGIN", ["Maestro", "PyMOL"])
AnnotationClass = None
ElementClass = residue.Residue
ALPHABET = {}
_gap_char = "~"
_gap_chars = frozenset((_gap_char))
_unknown_res_type = residue.UNKNOWN
residuesRemoved = QtCore.pyqtSignal(set)
residuesAdded = QtCore.pyqtSignal(set)
residuesChanged = QtCore.pyqtSignal(int, int)
lengthAboutToChange = QtCore.pyqtSignal(int, int)
lengthChanged = QtCore.pyqtSignal(int, int)
nameChanged = QtCore.pyqtSignal()
visibilityChanged = QtCore.pyqtSignal()
structureChanged = QtCore.pyqtSignal()
annotationTitleChanged = QtCore.pyqtSignal()
pfamChanged = QtCore.pyqtSignal()
descriptorsCleared = QtCore.pyqtSignal()
# ==========================================================================
# Magic Methods
# ==========================================================================
[docs] def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.AnnotationClass:
self._annotations = self.AnnotationClass(self)
self._annotations.titleChanged.connect(self.annotationTitleChanged)
else:
self._annotations = None
self.residuesRemoved.connect(
self._invalidateDescriptorsIfResiduesChanged)
self.residuesAdded.connect(self._invalidateDescriptorsIfResiduesChanged)
self.structureChanged.connect(self._invalidateDescriptors)
self.clearDescriptors()
# ==========================================================================
# Residues
# ==========================================================================
[docs] def residues(self):
"""
Return an iterable of residues, ignoring gaps.
:return: Iterable of residues
:rtype: iter(Residue)
"""
for res in self:
if res.is_res:
yield res
[docs] def getNextResidue(self, res):
"""
Return the next residue in the sequence (ignoring gaps) or None if this
is the last residue.
:param res: A given residue in the sequence
:type res: schrodinger.protein.residue.Residue
:return: The previous residue in the sequence
:rtype: schrodinger.protein.residue.Residue
"""
indices = range(res.idx_in_seq + 1, len(self))
for position in indices:
if not self[position].is_gap:
return self[position]
[docs] def getPreviousResidue(self, res):
"""
Return the previous residue in the sequence (ignoring gaps) or None if
this is the first residue.
:param res: A given residue in the sequence
:type res: schrodinger.protein.residue.Residue
:return: The previous residue in the sequence
:rtype: `schrodinger.protein.residue.Residue`
"""
indices = range(res.idx_in_seq - 1, -1, -1)
for position in indices:
if not self[position].is_gap:
return self[position]
[docs] def iterNeighbors(self):
"""
Return an iterable of three element tuples consisting of
(prev_res, curr_res, next_res), ignoring gaps.
None is used for neighbors of first and last residues in the sequence,
and does not indicate gaps here.
:return: Iterable of 3-tuples, each element of the each tuple being
either a `schrodinger.protein.residue.Residue` or None
:rtype: iter(tuple(Residue or NoneType, Residue, Residue or NoneType))
"""
res_iter = self.residues()
try:
next_res = next(res_iter)
except StopIteration:
return
prev_res, cur_res = None, None
# Add None to the end to represent the neighbor of the last residue
for res in itertools.chain(res_iter, [None]):
prev_res, cur_res, next_res = cur_res, next_res, res
yield (prev_res, cur_res, next_res)
[docs] def index(self, res):
"""
Returns the index of the specified residue.
:param res: The residue to find
:type res: residue.Residue
:return: The index of the residue
:rtype: int
"""
raise NotImplementedError
[docs] def indices(self, residues):
"""
Returns the indices of all specified residues. Note that there is no
guarantee that the returned integers will be in the same order as the
input residues. (For combined-chain sequences, it's highly likely that
they *won't* be.)
:param res: The residues to find indices of
:type res: Iterable(residue.Residue)
:return: The indices of the residues
:rtype: list[int]
"""
raise NotImplementedError
[docs] def getRun(self, res):
"""
For a given residue or gap, return a set of all adjacent
element indices in the sequence that are also residues or gaps.
:param res: Residue to get the run of
:type res: residue.AbstractSequenceElement
:reuturn: Set of residue indices in the run
:rtype: set(int)
"""
res_idx = self.index(res)
run_idx = set([res_idx])
curr_idx = res_idx - 1
while (curr_idx >= 0 and self[curr_idx] and
self[curr_idx].is_gap == res.is_gap):
run_idx.add(curr_idx)
curr_idx -= 1
curr_idx = res_idx + 1
while (curr_idx < len(self) and self[curr_idx] and
self[curr_idx].is_gap == res.is_gap):
run_idx.add(curr_idx)
curr_idx += 1
return run_idx
[docs] def insertElements(self, index, elements):
"""
Insert a list of elements or sequence element into this sequence.
If `elements` is a string or iterable of strings, residue numbers will
be automatically assigned.
:param index: The index at which to insert elements
:type index: int
:param elements: A list of elements to insert
:type elements: iterable(self.ElementClass) or iterable(str)
"""
raise NotImplementedError
[docs] def mutate(self, start, end, elements):
"""
Mutate sequence elements starting at the given index to the provided
elements.
:param start: The index at which to start mutating
:type start: int
:param end: The index of the last mutated element (exclusive)
:type end: int
:param elements: The elements to which to mutate the sequence
:type elements: iterable(self.ElementClass) or iterable(str)
"""
raise NotImplementedError
[docs] def append(self, element):
"""
Appends an element to the sequence
:param element: The element to append to this sequence
:type: element: self.ElementClass or basestring
"""
raise NotImplementedError
[docs] def extend(self, elements):
"""
Extends the sequence with elements from an iterable
:param elements: The iterable containing elements with which to extend
this sequence
:type elements: iterable(self.ElementClass) or iterable(str)
"""
raise NotImplementedError
[docs] def getSubsequence(self, start, end):
"""
Return a sequence containing a subset of the elements in this one
:param start: The index at which the subsequence should start
:type start: int
:param end: The index at which the subsequence should end (exclusive)
:type end: int
:return: The requested subsequence
:rtype: AbstractSequence
"""
raise NotImplementedError
[docs] def removeElements(self, eles):
"""
Remove elements from the sequence.
:param eles: A list of elements to remove from the sequence.
:type eles: list(residue.AbstractSequenceElement)
:raise ValueError: If any of the given elements are not in the sequence.
"""
raise NotImplementedError
@property
def _res_positions(self):
"""
Return the residue positions in the sequence,
updating the cache if needed.
:return: The residue positions
:rtype: dict(`schrodinger.protein.residue.AbstractSequenceElement`,
tuple(int, int))
"""
if self._res_positions_cache is None:
self._updateResiduePositionCache()
return self._res_positions_cache
def _removeFromSequence(self, keep_func, start=0, end=None):
"""
Remove any residues not matching keep_func from the sequence
:param keep_func: A callable taking a residue and returning a bool
indicating whether to keep it in the sequence
:type keep_func: callable
:param start: The index at which to start filtering
:type start: int
:param end: The index at which to end filtering
:type end: int
:raises ValueError: In the event that invalid indices are specified
"""
raise NotImplementedError
# ==========================================================================
# Gaps
# ==========================================================================
@property
def gap_char(self):
# Make gap_char a read-only property to avoid string comparison errors
return self._gap_char
[docs] def getGaplessLength(self):
"""
:return: Length of this sequence ignoring gaps
:rtype: int
"""
raise NotImplementedError
[docs] def getGaps(self):
"""
:rtype: list(residue.Gap)
:return: The gaps in the sequence.
"""
return [res for res in self if res.is_gap]
[docs] def addGapsByIndices(self, gap_idxs):
"""
Add gaps to the sequence from a list of gap indices. Note that these
indices are based on numbering *after* the insertion. To insert gaps
using indices based on numbering before the insertion, see
`addGapsBeforeIndices`.
:param gap_idxs: A list of gap indices
:type gap_idxs: list(int)
"""
raise NotImplementedError
[docs] def validateGapIndices(self, gap_idxs):
"""
Make sure that the specified gap indices are valid input for
`addGapsByIndices`. If the indices are invalid, a `ValueError` will be
raised. Indices are considered invalid if:
- they refer to a position that's more than one residue past the end
of the sequence
- they are negative
:param gap_idxs: The gap indices to validate
:type gap_idxs: list[int]
"""
if any(idx < 0 for idx in gap_idxs):
raise ValueError("Negative gap index")
for i, cur_gap_idx in enumerate(sorted(gap_idxs)):
if cur_gap_idx > len(self) + i:
raise ValueError("Invalid gap index")
[docs] def addGapsBeforeIndices(self, indices):
"""
Add one gap to the alignment before each of the specified residue
positions. Note that these indices are based on numbering *before* the
insertion. To insert gaps using indices based on numbering after the
insertion, see `addGapsByIndices`.
:param indices: A list of indices to insert gaps before.
:type indices: list(int)
"""
if not indices:
return
indices = offset_indices(indices)
self.addGapsByIndices(indices)
[docs] def removeTerminalGaps(self):
"""
Remove gaps from the end of the sequence
"""
terminal_gaps = self.getTerminalGaps()
if terminal_gaps:
self.removeElements(terminal_gaps)
[docs] def getTerminalGaps(self):
"""
Return terminal gaps.
:return: A list of terminal gaps (in ascending index order)
:rtype: list(residue.Gap)
"""
last_res_idx = len(self) - 1
while last_res_idx >= 0 and self[last_res_idx].is_gap:
last_res_idx -= 1
return self[last_res_idx + 1:]
[docs] def getGapCount(self):
"""
:return: the number of gaps in the sequence
:rtype: int
"""
return len(self) - self.getGaplessLength()
[docs] def removeAllGaps(self):
"""
Remove gaps from the sequence
"""
self._removeFromSequence(lambda res: res.is_res, 0)
# ==========================================================================
# Annotations
# ==========================================================================
@property
def annotations(self):
return self._annotations
[docs] def getAnnotation(self, index, annotation):
"""
Returns the annotation at the specified index or None for a gap.
:raises ValueError: if the annotation is not available
"""
anno_name = annotation.name
if index < len(self):
element = self[index]
if element.is_res:
try:
return getattr(element, anno_name)
except AttributeError:
if (self.annotations is None or not isinstance(
annotation, self.annotations.ANNOTATION_TYPES)):
msg = "%s is not in annotation types" % anno_name
raise ValueError(msg)
# get sequence-level annotation
array = getattr(self.annotations, anno_name)
return array[index]
return None
[docs] def getAnnotationValueForComparison(self,
col,
anno,
ann_index=0,
cdr_scheme=None):
"""
Get an annotation value appropriate for determining whether two
residues have the same annotation value. e.g. for binding site, only
same-distance contacts of the same ligand should compare equal.
:param anno: Protein sequence annotation enum member
:type anno: annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES
:param ann_index: Annotation index for multi-value annotations
:type ann_index: int
:param cdr_scheme: CDR scheme for antibody annotation. (Will be ignored
if `anno` is not SEQ_ANNO_TYPES.antibody_cdr.)
:type cdr_scheme: annotation.AntibodyCDRScheme
:return: Annotation value for comparison or None if the corresponding
residue is a gap or has no value for this annotation
:rtype: object or None
"""
elem = self[col]
if not anno.can_expand or elem.is_gap:
return None
if anno.multi_value:
if anno is SEQ_ANNO_TYPES.domains:
if self.annotations.domains is None or elem.domains is None:
return None
domain = self.annotations.domains[ann_index]
if domain in elem.domains:
# Only return domain string if the current residue is part
# of this domain
return domain
return None
elif anno is SEQ_ANNO_TYPES.binding_sites:
# Return a tuple of (ligand name, contact_enum) so that only
# contacts of the same ligand match
if not self.annotations.ligands:
return None
lig = self.annotations.ligands[ann_index]
lig_contacts = self.getAnnotation(col, anno)
binding_site = lig_contacts[ann_index]
if binding_site is annotation.BINDING_SITE.NoContact:
return None
return (lig, binding_site)
else:
# TODO MSV-3355 test whether this works for kinase conservation
anno_val = self.getAnnotation(col, anno)
if not anno_val:
return None
return anno_val[ann_index]
elif anno is SEQ_ANNO_TYPES.antibody_cdr:
cdr_feature = self.annotations.getAntibodyCDR(col,
scheme=cdr_scheme)
cdr_label = cdr_feature.label
if cdr_label is annotation.AntibodyCDRLabel.NotCDR:
return None
return cdr_label
elif anno is SEQ_ANNO_TYPES.kinase_features:
kinase_feature = elem.kinase_features
if kinase_feature is annotation.KinaseFeatureLabel.NO_FEATURE:
return None
return kinase_feature
else:
return self.getAnnotation(col, anno)
[docs] def getNumAnnValues(self, ann):
return self.annotations.getNumAnnValues(ann)
[docs] def clearAllCaching(self):
"""
This method should be implemented in subclasses that cache any data.
"""
[docs] def setPfam(self, new_pfam, pfam_name):
residues = list(self.residues())
if len(new_pfam) != len(residues):
raise ValueError("Length of `new_pfam` must be equal to the number "
"of residues in the sequence."
f"Got: {len(new_pfam)} Expected: {len(residues)}")
for res, p, in zip(residues, new_pfam):
res.pfam = p
self.pfam_name = pfam_name
self.pfamChanged.emit()
[docs] def clearPfam(self):
for res in self.residues():
res.pfam = None
self.pfamChanged.emit()
[docs] def hasPfam(self):
return not all(p is None for p in self.annotations.pfam)
# ==========================================================================
# Structure
# ==========================================================================
[docs] def hasStructure(self):
"""
:return: Whether this sequence has an associated structure.
:rtype: bool
"""
raise NotImplementedError
[docs] def getStructure(self):
"""
:return: The associated structure. Will return None if there is no
associated structure.
:rtype: schrodinger.structure.Structure or NoneType
"""
raise NotImplementedError
[docs] def setStructure(self, struc):
"""
Set the associated structure. Can only be used on sequences with an
associated structure.
:param struc: The new structure for this sequence
:type struc: schrodinger.structure.Structure
:raise RuntimeError: If there's no structure associated with this
sequence object.
"""
raise NotImplementedError
# ==========================================================================
# Metrics
# ==========================================================================
def _getScore(self,
scoring_function,
other,
consider_gaps=True,
only_consider=None):
"""
Calculate and return the average score after applying a scoring function
to the aligned residues of this sequence and the other sequence.
:param scoring_function: A function that takes two residues and returns
a score.
:type scoring_function: callable(Residue, Residue)
:param other: A sequence to compare against
:type other: schrodinger.protein.sequence.Sequence
:param consider_gaps: Whether we should count gaps when we're
calculating the average score. Note that the scoring function is not
applied to gaps, they are only used in finding the N to divide the
total score by.
:type consider_gaps: bool
:param only_consider: Restrict the calculation to a subset of the
specified residues
:type: None or set
:return: The average score after applying the scoring function to
the aligned residues of this sequence and the other sequence.
:rtype: float
"""
total_score = 0.0
total_length = 0
for res1, res2 in zip(self, other):
if only_consider is not None:
if res1 not in only_consider:
continue
if res1.is_gap and res2.is_gap:
# Don't consider gap-only columns
continue
elif res1.is_res and res2.is_res:
total_score += scoring_function(res1, res2)
total_length += 1
elif consider_gaps:
total_length += 1
if total_length > 0:
return total_score / total_length
else:
return 0.0
[docs] def getIdentity(self, other, consider_gaps=True, only_consider=None):
"""
Return a float scoring the identity between the sequence and another
sequence, assuming that they're already aligned
:param other: A sequence to compare against
:type other: schrodinger.protein.sequence.Sequence
:param consider_gaps: Whether we should count gaps when we're
calculating the average score.
:type consider_gaps: bool
:return: The sequence identity score (between 0.0 and 1.0)
:rtype: float
"""
return self._getScore(self.ElementClass.getIdentity,
other,
consider_gaps,
only_consider=only_consider)
[docs] def getSimilarity(self, other, consider_gaps=True, only_consider=None):
"""
Return a float score of the similarity count between the sequence and
another sequence, assuming that they're already aligned.
:param other: A sequence to compare against
:type other: schrodinger.protein.sequence.Sequence
:param consider_gaps: Whether we should count gaps when we're
calculating the average score.
:type consider_gaps: bool
:return: The sequence similarity score (between 0.0 and 1.0)
:rtype: float
"""
return self._getScore(self.ElementClass.getBinarySimilarity,
other,
consider_gaps,
only_consider=only_consider)
[docs] def getConservation(self, other, consider_gaps=True, only_consider=None):
"""
Return a float scoring the homology conservation between the sequence
and another sequence, assuming that they're already aligned.
The homology criterion is based on "side chain chemistry" descriptor
matching.
:param other: A sequence to compare against
:type other: schrodinger.protein.sequence.Sequence
:param consider_gaps: Whether we should count gaps when we're
calculating the average score.
:type consider_gaps: bool
:return: The sequence conservation score (between 0.0 and 1.0)
:rtype: float
"""
return self._getScore(self.ElementClass.getConservation,
other,
consider_gaps,
only_consider=only_consider)
[docs] def getSimilarityScore(self, other, consider_gaps=True, only_consider=None):
"""
Return the total score of similarity between the sequence and a
other sequence, assuming that they're already aligned.
:param other: A sequence to compare against
:type other: schrodinger.protein.sequence.Sequence
:param consider_gaps: Ignored because the similarity with a gap is 0.0
:type consider_gaps: bool
:param only_consider: A set of residues to restrict attention to
:type only_consider: set or None
:return: The total sequence similarity score
:rtype: int
"""
def include_in_calculation(pair):
res1, res2 = pair
if res1.is_gap or res2.is_gap:
return False
if only_consider is not None and res1 not in only_consider:
return False
return True
residue_pairs = zip(self, other)
residue_pairs = (
pair for pair in residue_pairs if include_in_calculation(pair))
similarities = (
self.ElementClass.getSimilarity(*pair) for pair in residue_pairs)
return int(sum(similarities))
[docs] def getStructureResForRes(self, res):
"""
:param res: Residue to get structure residue for
:type res: residue.Residue
:return: Structure residue or None if no matching residue is found
:rtype: schrodinger.structure._Residue or NoneType
"""
raise NotImplementedError
[docs] def structuredResidueCount(self):
"""
Get the number of residues in this sequence with an associated
structured residue.
:rtype: int
"""
if not self.hasStructure():
return 0
return sum(
1 for elem in self if (not elem.is_gap and not elem.seqres_only))
[docs] def hasStructuredResidues(self):
"""
Return whether this sequence has any structured residues. This method
is equivalent to `bool(seq.structuredResidueCount())` but doesn't
(typically) require iterating through the entire sequence.
:rtype: bool
"""
return (self.hasStructure() and
any(not (elem.is_gap or elem.seqres_only) for elem in self))
[docs] def getProperty(self, seq_prop):
"""
Get the sequence's value corresponding to the given SequenceProperty
object
:param seq_prop: The object describing the sequence property
:type seq_prop: schrodinger.protein.properties.SequenceProperty
:return: The value of the sequence property
:rtype: float or None
"""
if seq_prop.property_type == PropertyType.StructureProperty:
st = self.getStructure()
if st is not None:
return st.property.get(seq_prop.property_name)
else:
descriptors = self._descriptors[seq_prop.property_source]
return descriptors.get(seq_prop.property_name)
[docs] def updateDescriptors(self, descriptors, property_source):
"""
Updates the descriptor dicts with new descriptor values
:param descriptors: A dict mapping descriptor names to their values
:type descriptors: dict[str, float]
:param property_source: The source of the descriptors
:type property_source: properties.PropertySource
"""
self._descriptors[property_source].update(descriptors)
[docs] def clearDescriptors(self):
self._descriptors = {
properties.PropertySource.Sequence: dict(),
properties.PropertySource.Structure: dict(),
properties.PropertySource.Residue: dict(),
}
self.descriptorsCleared.emit()
@property
def descriptors(self):
return self._descriptors
[docs] def hasDescriptors(self):
for descriptors in self._descriptors.values():
if descriptors:
return True
return False
def _invalidateDescriptorsIfResiduesChanged(self, elements):
if not self.hasDescriptors():
return
if any(elem.is_res for elem in elements):
self.clearDescriptors()
def _invalidateDescriptors(self):
if self.hasDescriptors():
self.clearDescriptors()
# To convince Hypothesis that Sequence is a sequence (MSV-1966)
collections.abc.Sequence.register(AbstractSequence)
[docs]class AbstractSingleChainSequence(AbstractSequence):
"""
Base class for single-chain biological sequences
Note: Protein-specific functionality should go in ProteinSequence.
:cvar sequenceCopied: A signal emitted when this sequence is copied.
Emitted with the sequence being copied and the newly created copy.
This signal is used by the structure model to make sure that the newly
created copy is kept in sync with the structure.
:vartype sequenceCopied: QtCore.pyqtSignal
"""
sequenceCopied = QtCore.pyqtSignal(object, object)
# ==========================================================================
# Magic Methods
# ==========================================================================
[docs] def __init__(self,
elements="",
name="",
origin=None,
entry_id=None,
entry_name="",
pdb_id="",
chain="",
structure_chain=None,
long_name="",
resnums=None):
"""
Make a sequence object from a list of strings and/or `self.ElementClass`
Strings are converted to `self.ElementClass` using a mapping of strings
to element types.
:param elements: An iterable of elements making up the sequence.
`None` are interpretted as gaps.
:type elements: iterable(self.ElementClass or None) or
iterable(str or None)
:param name: The name of the sequence (possibly shortened), used for
display purposes. For a FASTA sequence, this should be a short
identifier such as the Uniprot ID.
:type name: str
:param origin: A piece of metadata indicating where the sequence came
from
:type origin: Sequence.ORIGIN or None
:param entry_id: An entry associated with the sequence, if any
:type: str
:param entry_name: An entry name associated with the sequence, if any
:type: str
:param pdb_id: An id associated with the sequence, if any
:type: str
:param chain: The chain to which the sequence belongs
:type chain: str
:param structure_chain: The chain of the structure this sequence
is associated with. This is usually the same as `chain` if the
sequence has a structure but isn't necessarily.
:type structure_chain: str
:param long_name: The full name of the sequence. For a FASTA
sequence, this should be the full FASTA header.
:type long_name: str
:param resnums: Residue numbers to assign to `elements`. If not given,
residues will be numbered starting from one. Regardless of what's
given here, no residue numbering will occur if `elements` is an
iterable of `ElementClass` and any element already has a residue
number set. If this iterable is shorter than `elements`, additional
numbers will be generated by incrementing the last number present.
:type resnums: Iterable(int)
"""
if elements is None:
raise ValueError(
"Cannot pass value of NoneType to protein sequence")
if not self.ALPHABET:
cls_name = self.__class__.__name__
raise NotImplementedError(
f"{cls_name} cannot be instantiated directly.")
super().__init__()
self.name = name
self._origin = None
self.origin = origin
self.entry_name = entry_name
if entry_id is not None:
assert str(entry_id)
self.entry_id = entry_id
self.long_name = long_name
self._chain = chain
self.structure_chain = structure_chain
self.pdb_id = pdb_id
self._visibility = None
# The _get_structure and _set_structure values will be overwritten by the
# structure model if this sequence is associated with a structure.
self._get_structure = None
self._set_structure = None
self._residue_map = {}
self._structure_res_map = None
if not resnums:
# Make sure that we set residue numbers if the input was strings
resnums = (1,)
self._sequence = self._makeSeqElements(elements, resnums)
self._cached_str = None
self._res_positions_cache = None
self._gapless_length = None
self.residuesChanged.connect(self._clearCachedStr)
self.lengthChanged.connect(self._clearCachedStr)
def __str__(self):
if self._cached_str is None:
self._cached_str = "".join(
self._gap_char if res.is_gap else res.type.short_code
for res in self._sequence)
return self._cached_str
[docs] def __len__(self):
return len(self._sequence)
def __iter__(self):
return iter(self._sequence)
def __repr__(self):
"""
Returns a representation of the instance that can be pasted into a repl
"""
class_open = self.__class__.__name__ + "("
arguments = ", ".join(self._getReprArguments())
seq_rep = [class_open, arguments, ")"]
return "".join(seq_rep)
[docs] def __contains__(self, item):
return item in self._res_positions
def __bool__(self):
return bool(self._sequence)
def __getitem__(self, index):
try:
return self._sequence[index]
except IndexError:
msg = ("index (%d) out of range (sequence length is only %d)" %
(index, len(self)))
msg += "\n" + str(self)
raise IndexError(msg)
def __delitem__(self, index):
elements = self[index] if isinstance(index, slice) else [self[index]]
self.removeElements(elements)
def __copy__(self):
raise TypeError("Cannot make shallow copy. Must call deepcopy")
def __deepcopy__(self, memo):
copied_elements = copy.deepcopy(self._sequence, memo)
copied_descriptors = copy.deepcopy(self._descriptors, memo)
SequenceClass = self.__class__
new_seq = SequenceClass(copied_elements,
name=self.name,
origin=self.origin,
entry_id=self.entry_id,
chain=self.chain,
structure_chain=self.structure_chain,
long_name=self.long_name)
new_seq._descriptors = copied_descriptors
intra_ss_bonds = (
bond for bond in self.disulfide_bonds if bond.is_intra_sequence)
for res1, res2 in intra_ss_bonds:
residue.add_disulfide_bond(new_seq[res1.idx_in_seq],
new_seq[res2.idx_in_seq])
intra_ss_bonds = (bond for bond in self.pred_disulfide_bonds
if bond.is_intra_sequence)
for res1, res2 in intra_ss_bonds:
residue.add_disulfide_bond(new_seq[res1.idx_in_seq],
new_seq[res2.idx_in_seq],
known=False)
# Note that this method does not copy _get_structure and _set_structure.
# Instead we emit sequenceCopied. If this sequence is being monitored
# by a structure model, then the structure model is responsible for
# setting these attributes properly in new_seq.
self.sequenceCopied.emit(self, new_seq)
return new_seq
@QtCore.pyqtSlot()
def _clearCachedStr(self):
self._cached_str = None
def _getReprArguments(self):
"""
Generate the arguments to add to the repr string.
:return: A list of argument strings
:rtype: list(str)
"""
elements = "[%s]" % ",".join(self._resFormatCode(res) for res in self)
name = "name='%s'" % re.escape(self.name)
origin = "origin=%s" % self.origin
entry_id = "entry_id='%s'" % self.entry_id
chain = "chain='%s'" % self.chain
return [elements, name, origin, entry_id, chain]
def _resFormatCode(self, res):
"""
Returns a representation of a residue for __repr__
"""
if res.is_gap:
return "'%s'" % self.gap_char
return "'%s'" % res
# ==========================================================================
# Construction
# ==========================================================================
[docs] @classmethod
def makeSeqElement(cls, element):
"""
:param element: A sequence element or string representation thereof
:type element: str or cls.ElementClass
:return: sequence element
:rtype: cls.ElementClass
"""
# TODO Once MSV-1504 is implemented, type checking should be stricter
# to avoid accidental mixing of amino acid and nucleotide residues
if isinstance(element, residue.AbstractSequenceElement):
return element
elif isinstance(element, residue.CombinedChainResidueWrapper):
return element.split_res
elif element is None or element.strip() in cls._gap_chars:
return residue.Gap()
element = element.upper().strip()
element_type = cls.ALPHABET.get(element)
if element_type is None:
element_type = cls._unknown_res_type
if len(element) == 3:
# Make custom unknown residue type for 3-character unknowns
element_type = element_type.makeVariant(element)
res = cls.ElementClass(element_type)
return res
def _makeSeqElement(self, element):
"""
Helper function to call self.makeSeqElement and set Residue.sequence.
If the sequence has a structure, then newly generated elements will have
their `seqres_only` attribute default to True.
"""
res = self.makeSeqElement(element)
res.sequence = self
if isinstance(element, str) and self.hasStructure():
res.seqres_only = True
return res
def _makeSeqElements(self, elements, resnums=()):
"""
:param elements: iterable of sequence elements
:type elements: iterable(self.ElementClass or None) or
iterable(str or None)
:param resnums: Residue numbers to assign to `elements`. These numbers
will be ignored if `elements` is an iterable of `ElementClass` and
any element already has a residue number set. If this iterable is
shorter than `elements`, additional numbers will be generated by
incrementing the last number present.
:type resnums: Iterable(int)
:return: list(self.ElementClass)
:rtype: list
.. note::
The performance of this method is critical for quickly loading
sequences into the MSV. As such, it has been written to optimize
performance over code reuse. (In other words, there's a good reason
that this method doesn't just call `_makeSeqElement`.)
"""
if not elements:
return []
if isinstance(elements, str):
# remove all whitespace and other disallowed characters
elements = DISALLOWED_SEQ_CHARS.sub("", elements)
elements = elements.upper()
return self._makeSeqElementsFromStr(elements, resnums)
try:
elements = list(elements)
except TypeError:
# TODO: Remove this. See MSV-2794.
elements = [elements]
# Figure out the type of the contents of elements. Since Nones are
# allowed for gaps, we need to find the first non-None element
elem = None
for elem in elements:
if elem is not None:
break
if elem is None:
# elements is all Nones
return [residue.Gap() for _ in elements]
elif isinstance(elem, str):
# convert elements to uppercase and remove spaces
elements = [
None if e is None else e.upper().strip() for e in elements
]
return self._makeSeqElementsFromStr(elements, resnums)
elif isinstance(elem, residue.AbstractSequenceElement):
return self._makeSeqElementsFromElements(elements, resnums)
elif isinstance(elem, residue.CombinedChainResidueWrapper):
# convert combined-chain elements to split-chain elements
elements = [None if e is None else e.split_res for e in elements]
return self._makeSeqElementsFromElements(elements, resnums)
else:
raise TypeError("Unrecognized type for element:", type(elem))
def _makeSeqElementsFromStr(self, elements, resnums=()):
"""
:param elements: iterable of sequence elements
:type elements: iterable(str or None)
:param resnums: Residue numbers to assign to `elements`. If this
iterable is shorter than `elements`, additional numbers will be
generated by incrementing the last number present.
:type resnums: Iterable(int)
:return: list(self.ElementClass)
:rtype: list
"""
seq_elements = [None] * len(elements)
has_struc = self.hasStructure()
resnum = 1 if resnums else None
for i, (elem, input_resnum) in enumerate(
itertools.zip_longest(elements, resnums)):
if input_resnum is not None:
resnum = input_resnum
if elem is None or elem in self._gap_chars:
new_seq_elem = residue.Gap()
else:
elem_type = self.ALPHABET.get(elem, self._unknown_res_type)
# Do not use keyword arguments here. Function calls with
# only positional arguments are faster.
new_seq_elem = self.ElementClass(elem_type, None, resnum,
has_struc)
if resnums:
resnum += 1
new_seq_elem.sequence = self
seq_elements[i] = new_seq_elem
return seq_elements
def _makeSeqElementsFromElements(self, elements, resnums):
"""
:param elements: iterable of sequence elements
:type elements: iterable(self.ElementClass or None)
:param resnums: Residue numbers to assign to `elements`. These numbers
will be ignored if a residue number is set for any element in
`elements. If this iterable is shorter than `elements`, additional
numbers will be generated by incrementing the last number present.
:type resnums: Iterable(int)
:return: list(self.ElementClass)
:rtype: list
"""
if resnums and any(elem.hasSetResNum()
for elem in elements
if elem is not None and not elem.is_gap):
# ignore resnums if any of the input elements have residue numbers
# set already
resnums = ()
resnum = 1 if resnums else None
for i, (elem, input_resnum) in enumerate(
itertools.zip_longest(elements, resnums)):
if elem is None:
elem = residue.Gap()
elements[i] = elem
elem.sequence = self
if resnums and not elem.is_gap:
if input_resnum is not None:
resnum = input_resnum
elem.resnum = resnum
resnum += 1
return elements
[docs] @classmethod
def isValid(cls, elements):
"""
:param elements: An iterable of string representations of elements
making up the sequence
:type elements: iterable(str) or str
:return: Tuple indicating whether valid and a set of invalid characters,
if any
:rtype: tuple(bool, set(str))
"""
raise NotImplementedError
# ==========================================================================
# Sequence-Level Properties
# ==========================================================================
@property
def name(self):
return self._name
@name.setter
def name(self, new_name):
"""
Set the name on the instance and emit a notification
:param name: The new name for the sequence
:type name: str
:raises: ValueError if name is not str
"""
if not isinstance(new_name, str):
raise ValueError(f"name must be str, not {type(new_name)}")
# Strip out non-ASCII characters
new_name = new_name.encode('ascii', 'ignore').decode('ascii')
self._name = new_name
self.nameChanged.emit()
@property
def chain(self):
return self._chain
@chain.setter
def chain(self, name):
self._chain = name
self.nameChanged.emit()
@property
def fullname(self):
"""
:return: a formatted name + optional chain name for the sequence
:rtype: str
"""
if self.chain:
return "%s_%s" % (self._name, self.chain)
return self._name
@property
def visibility(self):
return self._visibility
@visibility.setter
def visibility(self, value):
if self._visibility != value:
self._visibility = value
self.visibilityChanged.emit()
# ==========================================================================
# Residues
# ==========================================================================
[docs] def index(self, res, ignore_gaps=False):
"""
Returns the index of the specified residue
:param res: The residue to find
:type res: residue.Residue
:param ignore_gaps: Whether the index returned should ignore gaps in
the sequence or not.
:type ignore_gaps: bool
:raises: A ValueError if the residue is not present
:rtype: int
:return: The index of the residue
"""
if res not in self._res_positions:
err_msg = f"Specified residue {res} not found in sequence {self}"
raise ValueError(err_msg)
idx = 1 if ignore_gaps else 0
return self._res_positions[res][idx]
[docs] def indices(self, residues, ignore_gaps=False):
"""
Returns the indices of all specified residues. Note that there is no
guarantee that the returned integers will be in the same order as the
input residues. (For combined-chain sequences, it's highly likely that
they *won't* be.)
:param res: The residues to find indices of
:type res: Iterable(residue.Residue)
:param ignore_gaps: Whether the indices returned should ignore gaps in
the sequence or not.
:type ignore_gaps: bool
:return: The indices of the residues
:rtype: list[int]
"""
if self._res_positions_cache is None:
self._updateResiduePositionCache()
idx = 1 if ignore_gaps else 0
return [self._res_positions_cache[res][idx] for res in residues]
[docs] def insertElements(self, index, elements):
# See parent class for method documentation
try:
elements = list(elements)
except TypeError:
# TODO: Remove this. See MSV-2794.
elements = [elements]
assign_numbers = all(isinstance(c, str) for c in elements)
elements = self._makeSeqElements(elements)
if assign_numbers:
next_res = None
prev_res = None
true_index = min(index, max(0, len(self) - 1))
prev_indices = range(true_index - 1, -1, -1)
for position in prev_indices:
elem = self[position]
if elem.is_res and elem.hasSetResNum():
prev_res = elem
break
next_indices = range(true_index, len(self))
for position in next_indices:
elem = self[position]
if elem.is_res and elem.hasSetResNum():
next_res = elem
break
assign_residue_numbers(elements,
start_res=prev_res,
end_res=next_res)
old_length = len(self._sequence)
new_length = old_length + len(elements)
self.lengthAboutToChange.emit(old_length, new_length)
self._sequence[index:index] = elements
self._addNewResiduesToMap(elements)
self.lengthChanged.emit(old_length, new_length)
self.residuesAdded.emit(set(elements))
self.residuesChanged.emit(index, old_length)
[docs] def mutate(self, start, end, elements):
# See parent class for method documentation
elements = self._makeSeqElements(elements)
old_length = len(self._sequence)
length_change = len(elements) - (end - start)
if length_change != 0:
new_length = old_length + length_change
self.lengthAboutToChange.emit(old_length, new_length)
removed = self._sequence[start:end]
for ele in removed:
if ele.is_res:
if ele.disulfide_bond is not None:
residue.remove_disulfide_bond(ele.disulfide_bond)
if ele.pred_disulfide_bond is not None:
residue.remove_disulfide_bond(ele.pred_disulfide_bond)
ele.sequence = None
self._sequence[start:end] = elements
self._removeResiduesFromMap(removed)
self._addNewResiduesToMap(elements)
if length_change != 0:
self.lengthChanged.emit(old_length, new_length)
last_changed_res_index = min(old_length, new_length)
self.residuesChanged.emit(start, last_changed_res_index - 1)
else:
self.residuesChanged.emit(start, end - 1)
self.residuesRemoved.emit(set(removed))
self.residuesAdded.emit(set(elements))
[docs] def append(self, element):
# See parent class for method documentation
element = self._makeSeqElement(element)
self.extend([element])
[docs] def extend(self, elements):
# See parent class for method documentation
old_length = len(self._sequence)
new_length = old_length + len(elements)
self.lengthAboutToChange.emit(old_length, new_length)
elements = self._makeSeqElements(elements)
self._sequence.extend(elements)
self.lengthChanged.emit(old_length, new_length)
self.residuesChanged.emit(old_length, new_length)
self.residuesAdded.emit(set(elements))
[docs] def getSubsequence(self, start, end):
# See parent class for method documentation
SequenceClass = self.__class__
return SequenceClass(copy.copy(res) for res in self[start:end])
[docs] def removeElements(self, eles):
# See parent class for method documentation
if len(eles) == 0:
return
elif any(ele not in self for ele in eles):
err_msg = ('Elements must be contained by sequence in order to be '
'removed')
raise ValueError(err_msg)
start_idx_removed = min(self.index(ele) for ele in eles)
new_length = len(self) - len(eles)
old_length = len(self)
self.lengthAboutToChange.emit(old_length, new_length)
self._removeResiduesFromMap(eles)
self._sequence = [ele for ele in self._sequence if ele not in eles]
for ele in eles:
if ele.is_res:
if ele.disulfide_bond is not None:
residue.remove_disulfide_bond(ele.disulfide_bond)
if ele.pred_disulfide_bond is not None:
residue.remove_disulfide_bond(ele.pred_disulfide_bond)
ele.sequence = None
self.lengthChanged.emit(old_length, new_length)
self.residuesChanged.emit(start_idx_removed, new_length - 1)
self.residuesRemoved.emit(set(eles))
def _replaceAllElements(self, elements):
"""
Replace _sequence entirely with the supplied elements
:param elements: Elements for the sequence
:type elements: iterable(self.ElementClass) or iterable(str)
"""
new_length = len(elements)
old_length = len(self)
announce_length_change = new_length != old_length
if announce_length_change:
self.lengthAboutToChange.emit(old_length, new_length)
elements = self._makeSeqElements(elements)
self._sequence = elements
if announce_length_change:
self.lengthChanged.emit(old_length, new_length)
self.residuesChanged.emit(0, old_length)
def _removeFromSequence(self, keep_func, start=0, end=None):
"""
Remove any residues not matching keep_func from the sequence
:param keep_func: A callable taking a residue and returning a bool
indicating whether to keep it in the sequence
:type keep_func: callable
:param start: The index at which to start filtering
:type start: int
:param end: The index at which to end filtering
:type end: int
:raises ValueError: In the event that invalid indices are specified
"""
old_length = len(self)
end = end if end is not None else old_length
if end > old_length or start < 0:
msg = ("Invalid indices specified (start=%d, end=%d) for a sequence"
"of length %d" % (start, end, old_length))
raise ValueError(msg)
new_sequence = self._sequence[0:start]
for res in self._sequence[start:end]:
if keep_func(res):
new_sequence.append(res)
else:
res.sequence = None
new_sequence.extend(self._sequence[end:])
self._replaceAllElements(new_sequence)
def _updateResiduePositionCache(self):
"""
Update the cache of residues to their index within the sequence.
The residue-index cache is stored as a mapping between residues
and two indexes: 1) the index of the residue in the sequence including
gaps and 2) the index of the residue in the sequence not including
gaps. Gaps store `None` for their gapless indexes. Put more concisely,
the residue-index cache maps:
residue.AbstractSequenceElement -> tuple(gapped_index, gapless_index)
"""
self._res_positions_cache = {}
gapless_idx = 0
for gapped_idx, res in enumerate(self):
if res.is_res:
self._res_positions_cache[res] = (gapped_idx, gapless_idx)
gapless_idx += 1
else:
self._res_positions_cache[res] = (gapped_idx, None)
self._gapless_length = gapless_idx
@QtCore.pyqtSlot()
def _invalidateResiduePositionCache(self):
"""
Invalidate the cache of residues to their index within the sequence
The connected signal emits no arguments and the slot decorator must
match the signal, but the method does not need to have the params.
Note that we do not actually calculate the indices until they are
needed; this happens in _updateResiduePositionCache.
"""
self._res_positions_cache = None
self._gapless_length = None
# ==========================================================================
# Gaps
# ==========================================================================
[docs] def getGaplessLength(self):
# See parent class for method documentation
if self._gapless_length is None:
self._updateResiduePositionCache()
return self._gapless_length
[docs] def addGapsByIndices(self, gap_idxs):
# See parent class for method documentation
if not gap_idxs:
return
self.validateGapIndices(gap_idxs)
gap_idxs = set(gap_idxs)
old_length = len(self)
new_length = len(self) + len(gap_idxs)
self.lengthAboutToChange.emit(old_length, new_length)
_sequence = iter(self._sequence)
new_sequence = []
for i in range(new_length):
next_element = None
if i not in gap_idxs:
next_element = next(_sequence, None)
if next_element is None:
next_element = residue.Gap()
next_element.sequence = self
new_sequence.append(next_element)
assert len(new_sequence) == new_length, "Sanity check"
self._sequence = new_sequence
self.lengthChanged.emit(old_length, new_length)
self.residuesChanged.emit(min(gap_idxs), old_length - 1)
# ==========================================================================
# Predictions
# ==========================================================================
[docs] def setSSA(self, new_ssa):
residues = list(self.residues())
if len(new_ssa) != len(residues):
raise ValueError('New ssa must be a list of equal length '
'to the number of residues')
for res, ssa in zip(residues, new_ssa):
res.secondary_structure = ssa
self._invalidateSecondaryStructure()
self.secondaryStructureChanged.emit()
[docs] def setSSAPredictions(self, pred):
if len(pred) != len(self):
raise ValueError('Prediction must be a list of equal length '
'to the sequence')
for res, ssa in zip(self, pred):
res.pred_secondary_structure = ssa
self._invalidateSecondaryStructure()
self._invalidatePredictionFlags()
self.predictionsChanged.emit()
[docs] def setDisorderedRegionsPredictions(self, pred):
if len(pred) != len(self):
raise ValueError('Prediction must be a list of equal length '
'to the sequence')
for res, dis in zip(self, pred):
res.pred_disordered = dis
self._invalidatePredictionFlags()
self.predictionsChanged.emit()
[docs] def setDomainArrangementPredictions(self, pred):
if len(pred) != len(self):
raise ValueError('Prediction must be a list of equal length '
'to the sequence')
for res, dom in zip(self, pred):
res.pred_domain_arr = dom
self._invalidatePredictionFlags()
self.predictionsChanged.emit()
[docs] def setSolventAccessibilityPredictions(self, pred):
if len(pred) != len(self):
raise ValueError('Prediction must be a list of equal length '
'to the sequence')
for res, acc in zip(self, pred):
res.pred_accessibility = acc
self._invalidatePredictionFlags()
self.predictionsChanged.emit()
[docs] def deletePrediction(self, prediction_type):
if prediction_type is SEQ_ANNO_TYPES.pred_disulfide_bonds:
for res in self:
bond = res.pred_disulfide_bond
if bond is not None:
residue.remove_disulfide_bond(bond)
self.predictionsChanged.emit()
return
empty_pred = [None] * len(self)
if prediction_type is SEQ_ANNO_TYPES.pred_secondary_structure:
self.setSSAPredictions(empty_pred)
elif prediction_type is SEQ_ANNO_TYPES.pred_accessibility:
self.setSolventAccessibilityPredictions(empty_pred)
elif prediction_type is SEQ_ANNO_TYPES.pred_disordered:
self.setDisorderedRegionsPredictions(empty_pred)
elif prediction_type is SEQ_ANNO_TYPES.pred_domain_arr:
self.setDomainArrangementPredictions(empty_pred)
else:
raise ValueError(f"{prediction_type} is not a prediction")
[docs] def deleteAllPredictions(self):
all_predictions = [
SEQ_ANNO_TYPES.pred_disulfide_bonds,
SEQ_ANNO_TYPES.pred_secondary_structure,
SEQ_ANNO_TYPES.pred_accessibility, SEQ_ANNO_TYPES.pred_disordered,
SEQ_ANNO_TYPES.pred_domain_arr
]
for prediction in all_predictions:
self.deletePrediction(prediction)
# ==========================================================================
# Structure
# ==========================================================================
@property
def origin(self):
"""
:return: A piece of metadata indicating where the sequence came from
:rtype origin: `Sequence.ORIGIN` or None
"""
return self._origin
@origin.setter
def origin(self, new_origin):
"""
:param new_origin: A piece of metadata indicating where the sequence came
from
:type new_origin: `Sequence.ORIGIN` or None
"""
if new_origin is None or new_origin in self.ORIGIN:
self._origin = new_origin
else:
origins = "".join(self.ORIGIN.names())
msg = "Sequence origin be one of the following values: %s" % origins
raise ValueError(msg)
[docs] def hasStructure(self):
# See AbstractSequence for documentation
return self._get_structure is not None
[docs] def getStructure(self):
# See AbstractSequence for documentation
if self._get_structure is not None:
return self._get_structure()
else:
return None
[docs] def setStructure(self, struc):
# See AbstractSequence for documentation
if self._set_structure is None:
raise RuntimeError("No structure associated with this sequence.")
else:
self._set_structure(struc)
[docs] def onStructureChanged(self):
self.annotations.onStructureChanged()
self.structureChanged.emit()
# This map is lazily regenerated when calling `getStructureResForRes`
self._structure_res_map = None
[docs] def setResidueMap(self, residue_map):
"""
Set a new mapping between ResidueKey and structured residues.
Note: the only intended user of this method is
`schrodinger.application.msv.seqio.StructureConverter`, where the
ResidueKey is computed from the `structure._Residue` used to create the
`residue.Residue`. If the sequence has a structure, the map can be
generated using `generateResidueMap`.
:param residue_map: Mapping between residue key and Residue
:type residue_map: dict(residue.ResidueKey, residue.Residue)
"""
self._residue_map = residue_map
[docs] def generateResidueMap(self):
"""
Create residue map based on current structured residues.
Note: this method requires `self.hasStructure()` to be True and
`self.entry_id` to be set. If this sequence was produced by
`schrodinger.application.msv.seqio.StructureConverter`, there should
already be a residue map and this method does not need to be called.
:raise RuntimeError: If sequence has no structure or entry id
"""
if not self.hasStructure() or self.entry_id is None:
raise RuntimeError("No structure associated with this sequence.")
residue_map = {res.getKey(): res for res in self if res.hasStructure()}
self.setResidueMap(residue_map)
def _addNewResiduesToMap(self, elements):
"""
Private method to add new residues to `self._residue_map`.
Residues must already belong to the sequence and the sequence must have
`entry_id` and structure set.
:param elements: Sequence elements being added (gaps will be ignored)
:type elements: list[residue.Residue]
"""
for ele in elements:
if ele.hasStructure():
res_key = ele.getKey()
self._residue_map[res_key] = ele
def _removeResiduesFromMap(self, elements):
"""
Private method to remove residues from `self._residue_map`.
Residues must still belong to the sequence and the sequence must have
`entry_id` and structure set.
:param elements: Sequence elements being removed (gaps will be ignored)
:type elements: list[residue.Residue]
"""
for ele in elements:
if ele.hasStructure():
res_key = ele.getKey()
self._residue_map.pop(res_key, None)
def _getStructureResMap(self):
"""
:return: Mapping between residue key and structure residue
:rtype: dict(residue.ResidueKey, schrodinger.structure._Residue)
:raise RuntimeError: If sequence has no structure
"""
if not self.hasStructure():
raise RuntimeError("No structure associated with this sequence.")
if self._structure_res_map is None:
structure_res_map = {}
get_sres_key = partial(residue.get_structure_residue_key,
entry_id=self.entry_id)
st = self.getStructure()
try:
chain = st.chain[self.structure_chain]
except KeyError:
pass
else:
for sres in chain.residue:
sres_key = get_sres_key(sres)
structure_res_map[sres_key] = sres
self._structure_res_map = structure_res_map
return self._structure_res_map
def _getStructureResByKey(self, res_key):
"""
:param res_key: Residue key: (entry_id, chain, resnum, inscode)
:type res_key: residue.ResidueKey
:return: Structure residue or None if no matching residue is found
:rtype: schrodinger.structure._Residue or NoneType
:raise RuntimeError: If sequence has no structure
"""
sres_map = self._getStructureResMap()
return sres_map.get(res_key)
[docs] def getResByKey(self, res_key):
"""
:param res_key: Residue key: (entry_id, chain, resnum, inscode)
:type res_key: residue.ResidueKey
:return: Residue matching key or None if no matching residue is found
:rtype: residue.Residue or NoneType
:raise RuntimeError: If sequence has no structure
"""
if not self.hasStructure():
raise RuntimeError("No structure associated with this sequence.")
return self._residue_map.get(res_key)
[docs] def getStructureResForRes(self, res):
# See parent class for method documentation
if not self.hasStructure():
return None
res_key = res.getKey()
sres = self._getStructureResByKey(res_key)
return sres
[docs]class AbstractProteinSequenceMixin:
"""
A mixin for code shared between split-chain and combined-chain protein
sequences.
"""
disulfideBondsCacheCleared = QtCore.pyqtSignal()
predictionsChanged = QtCore.pyqtSignal()
secondaryStructureChanged = QtCore.pyqtSignal()
kinaseFeaturesChanged = QtCore.pyqtSignal()
kinaseConservationChanged = QtCore.pyqtSignal()
@property
def ALPHABET(self):
# This instance property is needed because metaclasses are excluded
# from attribute lookup
return type(self).ALPHABET
# ==========================================================================
# Magic Methods
# ==========================================================================
[docs] def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._secondary_structures = None
self._pred_secondary_structures = None
self._disulfide_bonds = None
self._pred_disulfide_bonds = None
self.pfam_name = None
self._invalidatePredictionFlags()
# ==========================================================================
# Annotations
# ==========================================================================
@property
def disulfide_bonds(self):
"""
:return: A sorted tuple of the valid disulfide bonds.
:rtype: tuple(residue.DisulfideBond)
"""
raise NotImplementedError
@property
def secondary_structures(self):
"""
A list of _SecondaryStructure namedtuples containing the type of
secondary structure and where the secondary structures begin and end.
:return: A list of namedtuples containing an SS_TYPE from
schrodinger.structure and the residue indexes marking the limits
of the secondary structure.
:rtype: list(_SecondaryStructure)
"""
if self._secondary_structures is None:
self._updateSecondaryStructure()
return self._secondary_structures
def _updateSecondaryStructure(self):
"""
Calculates and caches tuples describing the sequence secondary
structures. The tuples contain the SSA type and their limits (ie
where the structure begins and ends)
"""
raise NotImplementedError
[docs] def clearDisulfideBondsCache(self):
self._disulfide_bonds = None
self._pred_disulfide_bonds = None
self.disulfideBondsCacheCleared.emit()
[docs] def isKinaseChain(self) -> bool:
raise NotImplementedError
# ==========================================================================
# Predictions
# ==========================================================================
@QtCore.pyqtSlot()
def _invalidatePredictionFlags(self):
self._has_ssa_prediction = None
self._has_dis_predictions = None
self._has_dom_prediction = None
self._has_acc_predictions = None
[docs] def hasDisorderedRegionsPredictions(self):
if self._has_dis_predictions is None:
self._has_dis_predictions = any(
r.pred_disordered is not None for r in self.residues())
return self._has_dis_predictions
[docs] def hasDisulfideBondPredictions(self):
return bool(self.pred_disulfide_bonds)
[docs] def hasDomainArrangementPredictions(self):
if self._has_dom_prediction is None:
self._has_dom_prediction = any(
r.pred_domain_arr is not None for r in self.residues())
return self._has_dom_prediction
[docs] def hasSolventAccessibility(self):
if self._has_acc_predictions is None:
self._has_acc_predictions = any(
r.pred_accessibility is not None for r in self.residues())
return self._has_acc_predictions
[docs] def hasSSAPredictions(self):
if self._has_ssa_prediction is None:
secondary_strucs = self.pred_secondary_structures
if (len(secondary_strucs) == 1 and
secondary_strucs[0].ssa_type is None):
self._has_ssa_prediction = False
else:
self._has_ssa_prediction = True
return self._has_ssa_prediction
@property
def pred_secondary_structures(self):
if self._pred_secondary_structures is None:
self._updatePredSecondaryStructure()
return self._pred_secondary_structures
[docs]class ProteinSequence(json.JsonableClassMixin,
AbstractProteinSequenceMixin,
AbstractSingleChainSequence,
metaclass=ProteinSequenceMeta):
"""
A single-chain protein sequence.
:cvar secondaryStructureCacheCleared: A signal emitted when the secondary
structure cache has been cleared. Used to keep the
`CombinedChainProteinSequence` cache in sync. If listening for changes
in the secondary structure values, use `secondaryStructureChanged`
instead.
"""
AnnotationClass = annotation.ProteinSequenceAnnotations
ElementClass = residue.Residue
_gap_chars = constants.GAP_CHARS
secondaryStructureCacheCleared = QtCore.pyqtSignal()
# ==========================================================================
# Magic Methods
# ==========================================================================
[docs] def __init__(self,
elements="",
name="",
origin=None,
entry_id=None,
entry_name="",
pdb_id="",
chain="",
structure_chain=None,
long_name="",
resnums=(),
disulfide_bonds=None,
pred_disulfide_bonds=None):
"""
See `AbstractSingleChainSequence` for additional documentation.
:param disulfide_bonds: A list of pairs of residue indices to link via
disulfide bonds.
:type disulfide_bonds: Iterable(tuple(int, int))
:param pred_disulfide_bonds: A list of pairs of residue indices to link
via predicted disulfide bonds.
:type pred_disulfide_bonds: Iterable(tuple(int, int))
"""
super().__init__(elements,
name=name,
origin=origin,
entry_id=entry_id,
entry_name=entry_name,
pdb_id=pdb_id,
chain=chain,
structure_chain=structure_chain,
long_name=long_name,
resnums=resnums)
self._is_kinase_cons_annotated = False
self._is_kinase_annotated = False
self.residuesChanged.connect(self.kinaseFeaturesChanged)
self.residuesChanged.connect(self._invalidateResiduePositionCache)
self.lengthChanged.connect(self._invalidateResiduePositionCache)
# Set up protein secondary structure slots
self.residuesChanged.connect(self._invalidateSecondaryStructure)
self.lengthChanged.connect(self._invalidateSecondaryStructure)
self.residuesChanged.connect(self._invalidatePredictionFlags)
self.lengthChanged.connect(self._invalidatePredictionFlags)
self._pattern_seq_dict = None
self.residuesChanged.connect(self._invalidatePatternSeqDict)
if disulfide_bonds is not None:
for i1, i2 in disulfide_bonds:
residue.add_disulfide_bond(self[i1], self[i2])
if pred_disulfide_bonds is not None:
for i1, i2 in pred_disulfide_bonds:
residue.add_disulfide_bond(self[i1], self[i2], known=False)
def _getReprArguments(self):
# See AbstractSingleChainSequence for method documentation
arguments = super()._getReprArguments()
if self.disulfide_bonds:
arg = ("disulfide_bonds=" +
self._bondIndicesForRepr(self.disulfide_bonds))
arguments.append(arg)
if self.pred_disulfide_bonds:
arg = ("pred_disulfide_bonds=" +
self._bondIndicesForRepr(self.pred_disulfide_bonds))
arguments.append(arg)
return arguments
def _bondIndicesForRepr(self, disulfide_bonds):
"""
Generate a string describing the residue indices involved in the given
disulfide bonds.
:param disulfide_bonds: The bond to describe.
:type disulfide_bonds: list(residue.DisulfideBond)
:return: A string describing the disulfide bonds.
:rtype: str
"""
intra_seq_bonds = [
bond for bond in disulfide_bonds
if bond.isValid() and bond.is_intra_sequence
]
indices = [(res1.idx_in_seq, res2.idx_in_seq)
for (res1, res2) in intra_seq_bonds]
indices.sort()
return "[%s]" % ", ".join(f"({i1}, {i2})" for (i1, i2) in indices)
def _resFormatCode(self, res):
if res.is_gap:
return super()._resFormatCode(res)
return "'%s'" % res.long_code
# ==========================================================================
# Serialization
# ==========================================================================
[docs] def toJsonImplementation(self):
intraseq_bonds = (
bond for bond in self.disulfide_bonds if bond.is_intra_sequence)
disulfide_bonds = [
[res1.idx_in_seq, res2.idx_in_seq] for res1, res2 in intraseq_bonds
]
intraseq_pred_bonds = (bond for bond in self.pred_disulfide_bonds
if bond.is_intra_sequence)
pred_disulfide_bonds = [[res1.idx_in_seq, res2.idx_in_seq]
for res1, res2 in intraseq_pred_bonds]
return {
'residues': self._sequence,
'name': self.name,
'chain': self.chain,
'structure_chain': self.structure_chain,
'disulfide_bonds': disulfide_bonds,
'pred_disulfide_bonds': pred_disulfide_bonds,
'long_name': self.long_name,
'entry_id': self.entry_id,
'descriptors': {
source.name: descriptor
for source, descriptor in self.descriptors.items()
},
}
[docs] @classmethod
def fromJsonImplementation(cls, json_obj):
residues = []
residues_str = []
is_kinase_cons_annotated = False
is_kinase_annotated = False
for ele in json_obj['residues']:
try:
elem = residue.Gap.fromJson(ele)
except ValueError:
elem = residue.Residue.fromJson(ele)
if elem._kinase_conservation:
is_kinase_cons_annotated = True
if elem.kinase_features is not None:
is_kinase_annotated = True
residues.append(elem)
residues_str.append(str(elem))
SeqType = guess_seq_type(residues_str)
rehydrated_seq = SeqType(residues,
entry_id=json_obj['entry_id'],
structure_chain=json_obj['structure_chain'])
rehydrated_seq._is_kinase_cons_annotated = is_kinase_cons_annotated
rehydrated_seq._is_kinase_annotated = is_kinase_annotated
for res1_idx, res2_idx in json_obj['disulfide_bonds']:
residue.add_disulfide_bond(rehydrated_seq[res1_idx],
rehydrated_seq[res2_idx])
for res1_idx, res2_idx in json_obj['pred_disulfide_bonds']:
residue.add_disulfide_bond(rehydrated_seq[res1_idx],
rehydrated_seq[res2_idx],
known=False)
rehydrated_seq.name = json_obj['name']
rehydrated_seq.long_name = json_obj['long_name']
rehydrated_seq.chain = json_obj['chain']
descriptors_from_json = json_obj.get('descriptors')
if descriptors_from_json is not None:
for source, descriptors in descriptors_from_json.items():
src = properties.PropertySource[source]
rehydrated_seq.updateDescriptors(descriptors, src)
return rehydrated_seq
[docs] @json.adapter(version=47007)
def adapter47007(cls, json_dict):
plain_seq = cls()
json_dict['entry_id'] = plain_seq.entry_id
json_dict['structure_chain'] = plain_seq.structure_chain
return json_dict
[docs] @json.adapter(version=48003)
def adapter48002(cls, json_dict):
json_dict['long_name'] = json_dict.pop('title')
return json_dict
[docs] @json.adapter(version=52065)
def adapter52065(cls, json_dict):
json_dict['kinase_features'] = None
return json_dict
# ==========================================================================
# Construction
# ==========================================================================
[docs] @classmethod
def isValid(cls, elements):
"""
:param elements: An iterable of string representations of elements
making up the sequence
:type elements: iterable(str) or str
:return: Tuple indicating whether valid and a set of invalid characters,
if any
:rtype: tuple(bool, set(str))
"""
valid_chars = set(cls.ALPHABET) | set(cls._gap_chars)
invalid_chars = set(elements) - valid_chars
return not invalid_chars, invalid_chars
# ==========================================================================
# Annotations
# ==========================================================================
[docs] def renumberResidues(self, new_rescode_map):
"""
Renumber residues in this sequence given a dictionary mapping old
rescodes to the new desired rescodes.
:type new_rescode_map: dict[tuple(int, str), tuple(int, str)]
"""
min_res_idx = None
max_res_idx = None
for idx, res in enumerate(self):
if res.is_res and res.getChainKey() in new_rescode_map:
res.resnum, res.inscode = new_rescode_map[res.getChainKey()]
if min_res_idx is None:
min_res_idx = idx
max_res_idx = idx
if min_res_idx is not None:
self.residuesChanged.emit(min_res_idx, max_res_idx)
[docs] def isKinaseChain(self):
return any(
res.is_res and (res.kinase_features or res.kinase_conservation)
for res in self)
@property
def is_kinase_annotated(self):
return self._is_kinase_annotated
[docs] def setKinaseFeatures(self,
feature_map: Dict[residue.Residue,
annotation.KinaseFeatureLabel]):
"""
:param feature_map: Map of residue to kinase feature label
"""
for res, feature in feature_map.items():
if res not in self:
raise ValueError(
'All residues in feature_map must belong to this sequence')
res.kinase_features = feature
self.kinaseFeaturesChanged.emit()
self._is_kinase_annotated = True
@property
def is_kinase_cons_annotated(self):
return self._is_kinase_cons_annotated
[docs] def setKinaseConservation(self, cons_map, lig_asl):
"""
:param cons_map: Map of residue to conservation
:type cons_map: dict[residue.Residue, annotation.KinaseConservation]
:param lig_asl: ASL of the associated ligand
:type lig_asl: str
"""
try:
lig_index = self.annotations.ligand_asls.index(lig_asl)
except ValueError:
return
for res, cons in cons_map.items():
res.kinase_conservation[lig_index] = cons
self.kinaseConservationChanged.emit()
self._is_kinase_cons_annotated = True
@property
def disulfide_bonds(self):
# See AbstractProteinSequenceMixin for documentation
if self._disulfide_bonds is None:
bonds = set(res.disulfide_bond for res in self if res.is_res and
res.disulfide_bond and res.disulfide_bond.isValid())
key = lambda bond: min(
self.index(res) for res in bond if res in self)
self._disulfide_bonds = tuple(sorted(bonds, key=key))
return self._disulfide_bonds
@QtCore.pyqtSlot()
def _invalidateSecondaryStructure(self):
"""
Invalidates the stored indices of start and end positions of secondary
structure assignment blocks in the sequence.
Note that we do not actually calculate the indices until they are
needed; this happens in _updateSecondaryStructure.
"""
self._secondary_structures = None
self._pred_secondary_structures = None
self.secondaryStructureCacheCleared.emit()
def _updateSecondaryStructure(self):
# See AbstractProteinSequenceMixin for documentation
self._secondary_structures = []
if len(self) == 0:
return
ssa = self._calculateSecondaryStructuresList(
ssa_attr='secondary_structure')
self._secondary_structures = ssa
def _calculateSecondaryStructuresList(self, ssa_attr):
"""
Calculate and return a list of `_SecondaryStructure` tuples describing
the types and ranges of secondary structures in the sequence.
:param ssa_attr: The attribute name to access the secondary structure
on a residue.
:type ssa_attr: str
:rtype : list[_SecondaryStructure]
"""
start_idxs = set()
end_idxs = set()
for prev_res, curr_res, next_res in self.iterNeighbors():
# Note: iterNeighbors uses `None` to indicate that neighbor does
# not exist
curr_ssa = getattr(curr_res, ssa_attr)
if (prev_res is None or getattr(prev_res, ssa_attr) != curr_ssa):
start_idxs.add(self.index(curr_res))
if (next_res is None or getattr(next_res, ssa_attr) != curr_ssa):
end_idxs.add(self.index(curr_res))
start_idxs = sorted(start_idxs)
end_idxs = sorted(end_idxs)
all_ss = []
for start_idx, end_idx in zip(start_idxs, end_idxs):
ss = _SecondaryStructure((start_idx, end_idx),
getattr(self[start_idx], ssa_attr))
all_ss.append(ss)
return all_ss
# ==========================================================================
# Predictions
# ==========================================================================
@property
def pred_disulfide_bonds(self):
# See AbstractProteinSequenceMixin for documentation
if self._pred_disulfide_bonds is None:
bonds = list(
set(res.pred_disulfide_bond
for res in self
if res.is_res and res.pred_disulfide_bond and
res.pred_disulfide_bond.isValid()))
key = lambda bond: min(
self.index(res) for res in bond if res in self)
self._pred_disulfide_bonds = tuple(sorted(bonds, key=key))
return self._pred_disulfide_bonds
def _updatePredSecondaryStructure(self):
# See AbstractProteinSequenceMixin for documentation
self._pred_secondary_structures = []
if len(self) == 0:
return
ssa = self._calculateSecondaryStructuresList(
ssa_attr='pred_secondary_structure')
self._pred_secondary_structures = ssa
# ==========================================================================
# Structure
# ==========================================================================
[docs] def removeStructurelessResidues(self, start=0, end=None):
"""
Remove any structureless residues
:param start: The index at which to start filtering structureless
residues.
:type start: int
:param end: The index at which to end filtering
:type end: int
"""
def keep_func(res):
return (res.hasStructure() or res.is_gap or
not res.sequence.hasStructure())
self._removeFromSequence(keep_func, start, end)
# ==========================================================================
# Patterns
# ==========================================================================
[docs] def encodeForPatternSearch(self,
with_ss=False,
with_flex=False,
with_asa=False):
"""
Convert to sequence dict expected by `find_generalized_pattern`.
:param with_ss: Whether to include secondary structure information.
:type with_ss: bool
:param with_flex: Whether to include flexibility information.
:type with_flex: bool
:param with_asa: Whether to include accessible surface area
information.
:type with_asa: bool
:rtype: dict
:return: dictionary of sequence data
"""
# TODO make cache aware of structure flags
if self._pattern_seq_dict is not None:
return self._pattern_seq_dict
_aa_list = []
if with_ss:
_ss_list = []
if with_asa:
std_sasa_dict = calculate_sasa_dict()
_asa_list = []
seq_dict = {
"res_ids": [],
}
def is_positive(number):
return number is not None and number > 0
if with_flex:
_flex_list = []
pos_b_factors = [
res.b_factor
for res in self
if res.is_res and is_positive(res.b_factor)
]
avg_b_factor = sum(pos_b_factors) / len(pos_b_factors) \
if pos_b_factors else None
for i, res in enumerate(self):
if res.is_gap:
continue
# amino acid sequence
short_code = (res.short_code if res.short_code
in AMINO_ACIDS_3_TO_1.values() else "X")
_aa_list.append(short_code)
# residue numbering
seq_dict["res_ids"].append(res.rescode.strip())
# secondary structure
if with_ss:
ss_code = ' '
if res.secondary_structure is structure.SS_HELIX:
ss_code = 'h'
elif res.secondary_structure is structure.SS_STRAND:
ss_code = 'e'
_ss_list.append(ss_code)
# flexibility
if with_flex:
flex_code = ' '
# Consider flexible if b-factor is above average
if (is_positive(res.b_factor) and is_positive(avg_b_factor) and
res.b_factor > avg_b_factor):
flex_code = 'f'
_flex_list.append(flex_code)
# solvent_accessibility
if with_asa:
asa_code = ' '
# Consider exposed if ASA is greater than standard ASA
std_sasa = std_sasa_dict.get(res.long_code)
res_sasa = self.annotations.sasa[i]
if (std_sasa is not None and res_sasa is not None and
res_sasa > std_sasa):
asa_code = 'e'
_asa_list.append(asa_code)
seq_dict["amino_acids"] = "".join(_aa_list)
empty_seq_str = " " * len(seq_dict["res_ids"])
seq_dict["secondary_structure"] = ("".join(_ss_list)
if with_ss else empty_seq_str)
seq_dict["flexibility"] = ("".join(_flex_list)
if with_flex else empty_seq_str)
seq_dict["solvent_accessibility"] = ("".join(_asa_list)
if with_asa else empty_seq_str)
self._pattern_seq_dict = seq_dict
return self._pattern_seq_dict
@QtCore.pyqtSlot()
def _invalidatePatternSeqDict(self):
"""
Invalidates the stored sequence dictionary for pattern search.
"""
self._pattern_seq_dict = None
# ==========================================================================
# Caching
# ==========================================================================
[docs] def clearAllCaching(self):
self.annotations.clearAllCaching()
self.clearDisulfideBondsCache()
self._invalidateSecondaryStructure()
self._invalidateResiduePositionCache()
self._invalidatePatternSeqDict()
self._invalidatePredictionFlags()
[docs] def setGPCRAnnotations(self, gpcr_annotation_map):
for res, gpcr_anno in gpcr_annotation_map.items():
res.gpcr_segment = gpcr_anno['gpcr_segment']
res.gpcr_generic_number = gpcr_anno['gpcr_generic_number']
[docs]class NucleicAcidSequence(ProteinSequence, metaclass=NucleicSequenceMeta):
# We only have "limited support" of nucleic acids, so this class is just a
# stub.
#
# TODO (MSV-1504): Create proper nucleic acid domain objects
AnnotationClass = annotation.NucleicAcidSequenceAnnotations
ElementClass = residue.Nucleotide
_unknown_res_type = residue.UNKNOWN_NA
ALPHABET = None # Cannot directly instantiate a NucleicAcidSequence
@property
def is_kinase_cons_annotated(self):
return True
@property
def is_kinase_annotated(self):
return True
[docs] def isKinaseChain(self):
return False
[docs] def getTranslation(self):
"""
Get a translated sequence. This method uses BioPython's translate
method to convert a nucleic acid sequence into an amino acid sequence
:return: A translated protein sequence. The name and chain from the
nucleic acid sequence are copied over
:rtype: ProteinSequence
"""
seq_str = str(self).replace(self.gap_char, '')
extra = len(seq_str) % 3
if extra:
# Pad to a multiple of 3
seq_str += "N" * (3 - extra)
bp_amino_str = translate(seq_str, stop_symbol=self.gap_char)
protein_seq = ProteinSequence(bp_amino_str,
name=self.name,
chain=self.chain)
return protein_seq
[docs]class DNASequence(NucleicAcidSequence):
ALPHABET = types.MappingProxyType(residue.DNA_ALPHABET)
[docs]class RNASequence(NucleicAcidSequence):
ALPHABET = types.MappingProxyType(residue.RNA_ALPHABET)
[docs]@dataclasses.dataclass
class GapRegion:
"""
Container for information about gaps to add to or remove from the start and
end of a chain
"""
from_start: int
from_end: int
GapRegionsForChains = typing.List[GapRegion]
GapRegionsForSeqs = typing.List[GapRegionsForChains]
[docs]class CombinedChainProteinSequence(
AbstractProteinSequenceMixin,
AbstractSequence,
metaclass=CombinedChainSequenceMeta,
wraps=ProteinSequence,
wrapped_constants=("ElementClass", "_gap_chars"),
wrapped_properties=("origin", "entry_name", "entry_id", "long_name",
"pdb_id", "visibility", "name"),
wrapped_getters=("getStructure", "hasStructure"),
wrapped_setters=("setStructure",)):
"""
A sequence that contains multiple chains from the same protein. Instances
of this class do not directly contain any residues themselves and instead
wrap one or several `ProteinSequence` objects.
:note: `CombinedChainProteinSequence.visibility` properly reports entry
inclusion state, but it may not correctly report entry visibility (e.g.
partially visible vs. fully visible). The MSV structure icons only
report inclusion state and the visibility of included entries isn't
reported anywhere in the panel, though, so this limitation doesn't have
any impact on functionality.
"""
AnnotationClass = annotation.CombinedChainProteinSequenceAnnotations
# ==========================================================================
# Magic Methods
# ==========================================================================
[docs] def __init__(self, seqs):
"""
:param seqs: A list of the split-chain sequences to wrap.
:type seqs: list(ProteinSequence)
"""
if not seqs:
raise ValueError("ExtendedProteinSequence requires at least one "
"sequence.")
self._seqs = list(seqs)
self._updateSeqOffsets()
self._updateSeqCaches()
super().__init__()
for seq in seqs:
self._connectSignals(seq)
def __deepcopy__(self, memo):
copied_chains = copy.deepcopy(self._seqs, memo)
return self.__class__(copied_chains)
def __getitem__(self, index):
if isinstance(index, slice):
if index.step not in (None, 1):
raise ValueError(f"{self.__class__.__name__} does not support "
"slices with steps.")
elif index.start == index.stop:
return []
if index.start is None:
start_seq_num = 0
start_index = 0
else:
start_seq_num, start_index = self._indexToSeqIndexAndIndex(
index.start)
if index.stop is None:
end_seq_num = len(self._seqs) - 1
end_index = len(self._seqs[-1]) - 1
else:
end_seq_num, end_index = self._indexToSeqIndexAndIndex(
index.stop - 1)
if start_seq_num == end_seq_num:
residues = self._seqs[start_seq_num][start_index:end_index + 1]
else:
residues = self._seqs[start_seq_num][start_index:]
for seq_num in range(start_seq_num + 1, end_seq_num):
residues.extend(self._seqs[seq_num][:])
residues.extend(self._seqs[end_seq_num][:end_index + 1])
return [
residue.CombinedChainResidueWrapper(res, self)
for res in residues
]
else:
seq, index_in_seq, _ = self.indexToSeqAndIndex(index)
res = seq[index_in_seq]
return residue.CombinedChainResidueWrapper(res, self)
def __iter__(self):
for res in itertools.chain.from_iterable(self._seqs):
yield residue.CombinedChainResidueWrapper(res, self)
[docs] def __len__(self):
return self._seq_offsets[-1] + len(self._seqs[-1])
def __str__(self):
return "".join(str(seq) for seq in self._seqs)
def __repr__(self):
chain_reprs = ", ".join(map(repr, self._seqs))
return f"{self.__class__.__name__}([{chain_reprs}])"
def __hash__(self):
return hash(self.chains)
def __eq__(self, other):
if isinstance(other, CombinedChainProteinSequence):
return self.chains == other.chains
else:
return False
def _getSignalsAndSlots(self, seq):
"""
Return signals and slots for signals emitted by the given split-chain
sequences.
:param seq: The split-chain sequence
:type seq: ProteinSequence
:return: A list of tuples of (signal, slot)
:rtype: list(tuple(QtCore.pyqtBoundSignal, method)
"""
return [
(seq.residuesChanged, self._onResiduesChanged),
(seq.lengthAboutToChange, self._onLengthAboutToChange),
(seq.lengthChanged, self._onLengthChanged),
(seq.nameChanged, self.nameChanged),
(seq.visibilityChanged, self.visibilityChanged),
(seq.structureChanged, self.structureChanged),
(seq.annotationTitleChanged, self.annotationTitleChanged),
(seq.disulfideBondsCacheCleared, self.clearDisulfideBondsCache),
(seq.secondaryStructureCacheCleared,
self._clearSecondaryStructureCache)
] # yapf: disable
def _connectSignals(self, seq):
for signal, slot in self._getSignalsAndSlots(seq):
signal.connect(slot)
def _disconnectSignals(self, seq):
for signal, slot in self._getSignalsAndSlots(seq):
signal.disconnect(slot)
# ==========================================================================
# Sequence-Level Properties
# ==========================================================================
@property
def fullname(self):
return f"{self.name}_All"
# ==========================================================================
# Residues
# ==========================================================================
[docs] def index(self, res):
chain = res.split_sequence
try:
chain_index = self._chain_indices[chain]
except KeyError:
err_msg = f"Specified residue {res} not found in sequence {self}"
raise ValueError(err_msg)
offset = self._seq_offsets[chain_index]
return offset + chain.index(res.split_res)
[docs] def indices(self, residues):
"""
Returns the indices of all specified residues. Note that the returned
integers will likely *not* be in the same order as the input residues.
:param res: The residues to find indices of
:type res: Iterable(residue.CombinedChainResidueWrapper)
:return: The indices of the residues
:rtype: list[int]
"""
by_chain = {}
for cur_res in residues:
split_res = cur_res.split_res
by_chain.setdefault(split_res.sequence, []).append(split_res)
indices = []
for chain, cur_residues in by_chain.items():
offset = self.offsetForChain(chain)
cur_indices = chain.indices(cur_residues)
indices.extend(i + offset for i in cur_indices)
return indices
[docs] def insertElements(self, index, elements):
# See parent class for method documentation
chain, chain_index, _ = self.indexToSeqAndIndex(index)
chain.insertElements(chain_index, elements)
[docs] def mutate(self, start, end, elements):
"""
Mutate sequence elements. See parent class for additional method
documentation.
:raise MultipleChainsError: If the specified residue range spans
multiple chains.
"""
chain, start_chain_index, end_chain_index = \
self._getMutationChainIndices(start, end)
chain.mutate(start_chain_index, end_chain_index, elements)
[docs] def assertCanMutateResidues(self, start, end):
"""
Make sure that we can mutate the specified residues. If not, raise an
exception.
:param start: The index at which to start mutating
:type start: int
:param end: The index of the last mutated element (exclusive)
:type end: int
:raise MultipleChainsError: If the specified residue range spans
multiple chains.
"""
self._getMutationChainIndices(start, end)
[docs] def append(self, element):
# See parent class for method documentation
self._seqs[-1].append(element)
[docs] def extend(self, elements):
# See parent class for method documentation
self._seqs[-1].extend(elements)
[docs] def getSubsequence(self, start, end):
"""
Return a sequence containing a subset of the elements in this one. Note
that the new sequence will be a split-chain sequence and will ignore
any chain breaks present in the requested subset of elements.
:param start: The index at which the subsequence should start
:type start: int
:param end: The index at which the subsequence should end (exclusive)
:type end: int
:return: The requested subsequence
:rtype: ProteinSequence
"""
return ProteinSequence(
copy.copy(res.split_res) for res in self[start:end])
[docs] def removeElements(self, eles):
# See parent class for method documentation
by_chain = {}
for cur_ele in eles:
by_chain.setdefault(cur_ele.split_sequence,
[]).append(cur_ele.split_res)
if not set(by_chain.keys()).issubset(self._seqs):
raise ValueError("Elements must be contained by sequence in order "
"to be removed")
for chain, cur_eles in by_chain.items():
chain.removeElements(cur_eles)
# note that the sequence offsets will be updated in response to the
# lengthChanged signal, so we don't need to explicitly call
# _updateSeqOffsets here
def _onResiduesChanged(self, start, end):
chain = self.sender()
chain_i = self._seqs.index(chain)
offset = self._seq_offsets[chain_i]
self.residuesChanged.emit(start + offset, end + offset)
def _onLengthAboutToChange(self, old_chain_len, new_chain_len):
old_combined_len = len(self)
new_combined_len = old_combined_len + new_chain_len - old_chain_len
self.lengthAboutToChange.emit(old_combined_len, new_combined_len)
def _onLengthChanged(self, old_chain_len, new_chain_len):
self._updateSeqOffsets()
new_combined_len = len(self)
old_combined_len = new_combined_len - new_chain_len + old_chain_len
self.lengthChanged.emit(old_combined_len, new_combined_len)
def _removeFromSequence(self, keep_func, start=0, end=None):
# See parent class for method documentation
if end is None:
end = len(self)
for chain, offset in zip(self._seqs, self._seq_offsets):
if end < offset or start >= offset + len(chain):
continue
if start <= offset:
cur_start = 0
else:
cur_start = start - offset
if end >= offset + len(chain):
cur_end = None
else:
cur_end = end - offset
chain._removeFromSequence(keep_func, cur_start, cur_end)
[docs] def getStructureResForRes(self, res):
# See parent class for method documentation
chain = res.split_sequence
if chain not in self._seqs:
raise ValueError("Residue not part of this sequence")
if not chain.hasStructure():
return None
return chain.getStructureResForRes(res.split_res)
# ==========================================================================
# Gaps
# ==========================================================================
[docs] def getGaplessLength(self):
# See parent class for method documentation
return sum(chain.getGaplessLength() for chain in self._seqs)
[docs] def addGapsByIndices(self, gap_idxs):
# See parent class for method documentation
# Adding a gap is ambiguous at chain boundaries. We always add to
# the beginning of the later sequence.
chain_i = 0
gaps_by_chain = [[] for _ in range(len(self._seqs))]
num_chains = len(self._seqs)
for gap_i, cur_index in enumerate(sorted(gap_idxs)):
while chain_i < num_chains:
adjusted = cur_index - gap_i - self._seq_offsets[chain_i]
# figure out the index of last allowable gap for this chain
last_allowable_gap = len(self._seqs[chain_i])
if chain_i < num_chains - 1:
last_allowable_gap -= 1
if adjusted > last_allowable_gap:
chain_i += 1
continue
else:
gaps_by_chain[chain_i].append(adjusted)
break
else:
raise ValueError("Invalid gap index")
for split_seq, gaps in zip(self._seqs, gaps_by_chain):
if gaps:
split_seq.addGapsBeforeIndices(gaps)
# note that the sequence offsets will be updated in response to the
# lengthChanged signal, so we don't need to explicitly call
# _updateSeqOffsets here
[docs] def addGapsToChainStartsAndEnds(self, gaps: GapRegionsForChains):
"""
Add the specified numbers of gaps to the starts and ends of each chain.
:param gaps: The numbers of gaps to add
"""
assert len(self._seqs) == len(gaps)
for chain, gap_region in zip(self._seqs, gaps):
start_gaps = [residue.Gap() for _ in range(gap_region.from_start)]
chain.insertElements(0, start_gaps)
end_gaps = [residue.Gap() for _ in range(gap_region.from_end)]
chain.extend(end_gaps)
[docs] def removeGapsFromChainStartsAndEnds(self, gaps: GapRegionsForChains):
"""
Remove the specified numbers of gaps from the starts and ends of each
chain.
:param gaps: The numbers of gaps to remove
"""
assert len(self._seqs) == len(gaps)
elems = self._getGapsFromChainStartsAndEnds(gaps)
elems = [
residue.CombinedChainResidueWrapper(gap, self) for gap in elems
]
self.removeElements(elems)
[docs] def validateGapsToRemoveFromChainStartAndEnds(self,
gaps: GapRegionsForChains):
"""
Make sure that we can remove the specified numbers of gaps from the
starts and ends of each chain.
:param gaps: The numbers of gaps to remove
:raise AssertionError: If some of the sequence elements to be removed
aren't actually gaps.
"""
self._getGapsFromChainStartsAndEnds(gaps)
def _getGapsFromChainStartsAndEnds(self, gaps: GapRegionsForChains):
"""
Fetch the sequence elements representing the specified numbers of gaps
from the starts and ends of each chain.
:param gaps: The numbers of gaps to remove
:raise AssertionError: If some of the sequence elements to be removed
aren't actually gaps.
"""
elems = []
for chain, gap_region in zip(self._seqs, gaps):
elems.extend(chain[0:gap_region.from_start])
if gap_region.from_end:
elems.extend(chain[-gap_region.from_end:])
assert all(e.is_gap for e in elems)
return elems
# ==========================================================================
# Annotations
# ==========================================================================
@property
def disulfide_bonds(self):
"""
:return: A sorted tuple of the valid disulfide bonds.
:rtype: tuple(residue.CombinedChainDisulfideBond)
"""
if self._disulfide_bonds is None:
bonds = {
residue.CombinedChainDisulfideBond(cur_bond, self)
for chain in self._seqs
for cur_bond in chain.disulfide_bonds
}
self._disulfide_bonds = tuple(
sorted((bond for bond in bonds if bond.isValid()),
key=lambda bond: min(self.index(res) for res in bond)))
return self._disulfide_bonds
def _updateSecondaryStructure(self):
# See AbstractProteinSequenceMixin for documentation
self._secondary_structures = []
for chain, offset in zip(self._seqs, self._seq_offsets):
for split_ss in chain.secondary_structures:
split_start, split_end = split_ss.limits
combined_start = split_start + offset
combined_end = split_end + offset
combined_ss = _SecondaryStructure(
(combined_start, combined_end), split_ss.ssa_type)
self._secondary_structures.append(combined_ss)
def _clearSecondaryStructureCache(self):
self._secondary_structures = None
[docs] def isKinaseChain(self):
return any(chain.isKinaseChain() for chain in self.chains)
# ==========================================================================
# Predictions
# ==========================================================================
@property
def pred_disulfide_bonds(self):
# See AbstractProteinSequenceMixin for documentation
if self._pred_disulfide_bonds is None:
bonds = {
residue.CombinedChainDisulfideBond(cur_bond, self)
for chain in self._seqs
for cur_bond in chain.pred_disulfide_bonds
}
key = lambda bond: min(
self.index(res) for res in bond if res in self)
self._pred_disulfide_bonds = tuple(sorted(bonds, key=key))
return self._pred_disulfide_bonds
def _updatePredSecondaryStructure(self):
# See AbstractProteinSequenceMixin for documentation
self._pred_secondary_structures = []
for chain, offset in zip(self._seqs, self._seq_offsets):
for split_ss in chain.pred_secondary_structures:
split_start, split_end = split_ss.limits
combined_start = split_start + offset
combined_end = split_end + offset
combined_ss = _SecondaryStructure(
(combined_start, combined_end), split_ss.ssa_type)
self._pred_secondary_structures.append(combined_ss)
# ==========================================================================
# Chains and Sequences
# ==========================================================================
def _updateSeqOffsets(self):
"""
Update `self._seq_offsets`, which gives the residue index for the start
of each chain. Should be called when `self._seqs` changes.
"""
self._seq_offsets = [0] + list(
itertools.accumulate(len(seq) for seq in self._seqs[:-1]))
self._ro_seq_offsets = tuple(self._seq_offsets)
def _updateSeqCaches(self):
"""
Update the read-only copy of the sequence and sequence offsets. These
copies are used to ensure that clients of this class can't modify
internal data.
"""
self._ro_seqs = tuple(self._seqs)
self._chain_indices = types.MappingProxyType(
{seq: i for i, seq in enumerate(self._seqs)})
def _indexToSeqIndexAndIndex(self, index):
"""
Convert a combined-chain residue index to a sequence index and a residue
index within the specified sequence.
:param index: A valid combined-chain residue index
:type index: int
:return: The split-chain sequence index and residue index
:rtype: tuple(int, int)
"""
if index < 0:
index = len(self) + index
seq_index = bisect.bisect_right(self._seq_offsets, index) - 1
index_in_seq = index - self._seq_offsets[seq_index]
return seq_index, index_in_seq
[docs] def indexToSeqAndIndex(self, index):
"""
Convert a combined-chain residue index to a split-chain sequence and a
residue index within the specified sequence.
:param index: A valid combined-chain residue index
:type index: int
:return: A tuple of
- the split-chain sequence
- residue index
- the starting index of the split-chain sequence
:rtype: tuple(ProteinSequence, int, int)
"""
seq_index, index_in_seq = self._indexToSeqIndexAndIndex(index)
return self._seqs[seq_index], index_in_seq, self._seq_offsets[seq_index]
@property
def chain(self):
return "All"
@property
def chains(self):
return self._ro_seqs
@property
def chain_offsets(self):
return self._ro_seq_offsets
[docs] def hasChain(self, chain_name):
"""
Does this sequence contain a chain with the specified name?
:param chain_name: The chain name to check
:type chain_name: str
:rtype: bool
"""
return any(chain_name == seq.chain for seq in self._seqs)
[docs] def addChain(self, seq):
"""
Add a new chain to this sequence.
:param seq: The chain to add
:type seq: ProteinSequence
"""
old_len = len(self)
new_len = old_len + len(seq)
self.lengthAboutToChange.emit(old_len, new_len)
self._seqs.append(seq)
self._updateSeqOffsets()
self._updateSeqCaches()
self.clearDisulfideBondsCache()
self._connectSignals(seq)
self._clearSecondaryStructureCache()
self.annotations.chainAdded(seq)
self.lengthChanged.emit(old_len, new_len)
[docs] def removeChain(self, seq):
"""
Remove a chain from this sequence. Note that you should not remove the
last chain; instead, remove this sequence from the alignment.
:param seq: The chain to remove
:type seq: ProteinSequence
"""
if len(self._seqs) == 1:
raise ValueError(f"Cannot remove last chain from a "
f"{self.__class__.__name__}")
index = self._seqs.index(seq)
old_len = len(self)
new_len = old_len - len(seq)
self.lengthAboutToChange.emit(old_len, new_len)
del self._seqs[index]
self._updateSeqOffsets()
self._updateSeqCaches()
self._disconnectSignals(seq)
self._clearSecondaryStructureCache()
self.clearDisulfideBondsCache()
self.annotations.chainRemoved(seq)
self.lengthChanged.emit(old_len, new_len)
[docs] def removeChains(self, seqs):
"""
Remove multiple chains from this sequence. Note that you should not all
chain from a combine-chain sequence; instead, remove the sequence from
the alignment.
:param seqs: The chains to remove
:type seqs: list[ProteinSequence]
"""
for seq in seqs:
self.removeChain(seq)
[docs] def insertElementByChain(self, index, chain, element):
"""
Add the given element to the specified chain of a sequence.
:param index: The index to insert the element at. Note that this is the
index in the sequence, not the chain.
:type index: int
:param chain: The chain to insert into
:type chain: sequence.ProteinSequence
:param element: The element to insert
:type element: str or residue.AbstractSequenceElement
"""
chain_index = self._chain_indices[chain]
index_in_chain = index - self._seq_offsets[chain_index]
assert 0 <= index_in_chain <= len(chain)
chain.insertElements(index_in_chain, [element])
[docs] def offsetForChain(self, chain):
"""
Get the combined-chain residue index for the first residue of the
specified chain.
:param chain: The chain
:type chain: ProteinSequence
:return: The offset
:rtype: int
"""
chain_index = self._chain_indices[chain]
return self._seq_offsets[chain_index]
def _getMutationChainIndices(self, start, end):
"""
Get the chain and chain indices referred to by the specified combined-
chain indices.
:param start: The index at which to start mutating
:type start: int
:param end: The index of the last mutated element (exclusive)
:type end: int
:return: A tuple of
- the chain that the indices refer to
- The start index within the chain
- The end index within the chain
:rtype: tuple(sequence.ProteinSequence, int, int)
:raise MultipleChainsError: If the specified residue range spans
multiple chains.
"""
start_chain, start_chain_index, _ = self.indexToSeqAndIndex(start)
# since end is exclusive, we want the chain of the previous residue
if end > start:
end -= 1
offset = 1
elif end < start:
raise RuntimeError("End must be greater than or equal to start")
else:
offset = 0
end_chain, end_chain_index, _ = self.indexToSeqAndIndex(end)
end_chain_index += offset
if start_chain != end_chain:
raise MultipleChainsError("Cannot mutate multiple chains.")
return start_chain, start_chain_index, end_chain_index
# ==========================================================================
# Caching
# ==========================================================================
[docs] def clearAllCaching(self):
# we intentionally don't clear the read-only sequence and sequence-
# offset caches, since those aren't regenerated as needed.
self.annotations.clearAllCaching()
self._clearSecondaryStructureCache()
self.clearDisulfideBondsCache()
for chain in self._seqs:
chain.clearAllCaching()
[docs]class MultipleChainsError(ValueError):
"""
An exception raised when the specified indices span multiple chains but the
operation can only be carried out on a single chain.
"""
# This class intentionally left blank
[docs]class StructureSequence(structure._AtomCollection):
"""
Class representing a sequence of protein residues.
"""
def _getResidueIterator(self):
return structure._structure._get_residues_by(self)
residue = property(
_getResidueIterator,
doc="Returns residue iterator for all residues in the sequence")
[docs]def get_structure_sequences(st):
"""
Iterates over all sequences in the given structure.
"""
mm.mmpdb_initialize(mm.error_handler)
new_indices = mm.mmpdb_get_sequence_order(st.handle,
mm.MMPDB_SEQUENCE_ORDER)
mm.mmpdb_terminate()
curr_sequence_atoms = []
prev_res_atom = None # An atom of the previous residue
prev_residue_id = (None, None, None)
# Skip atom 0, as it is simply a place holder, because atoms are 1-indexed.
for ai in new_indices[1:]:
atom = st.atom[ai]
residue_id = (atom.resnum, atom.chain, atom.inscode)
# Check to see whether we reached the end of this sequence
if residue_id != prev_residue_id and prev_res_atom is not None and not mm.mmct_res_connected(
st.handle, prev_res_atom, ai):
# This is the start of a new sequence
yield StructureSequence(st, curr_sequence_atoms)
curr_sequence_atoms = []
prev_res_atom = ai
prev_residue_id = residue_id
# Whether new sequence or not, we need to update these:
curr_sequence_atoms.append(ai)
if curr_sequence_atoms:
yield StructureSequence(st, curr_sequence_atoms)
[docs]def find_generalized_pattern(sequence_list, pattern, validate_pattern=False):
"""
Finds a generalized sequence pattern within specified sequences.
NOTE: The search is performed in the forward direction only.
:param sequence_list: list of sequence dictionaries to search.
:type sequence_list: list(dict)
Each sequence is a dictionary including amino acid strings
and associated data. The amino acid string should be ungapped.
The dictionary should include 'amino_acids' key and optionally
'secondary_structure', 'solvent_accessibility' and 'flexibility'
keys:
- sequence is expressed as uppercase 1-letter code:
sequence['amino_acids'] = 'ACDEFGHIKLMNPQRSTVWY'
(sequence in 1-letter code, mandatory)
- secondary structure usese 'h', 'e' and ' ' symbols:
sequence['secondary_structure'] = 'hhhhh eeeee '
- solvent accessibility uses 'e' (exposed) and ' ' (buried)
symbols:
sequence['solvent_accessibiliy' = 'eeeee eeeee '
- flexibility uses 'f' (flexible) and ' ' (not flexible)
symbols:
sequence['flexibility'] = ' fffff fffff'
:type pattern: str
:param pattern: Pattern defined using extended PROSITE syntax.
- standard IUPAC one-letter codes are used for all amino acids
- each element in a pattern is separated using '-' symbol
- symbol 'x' is used for position where any amino acid is accepted
- ambiguities are listed using the acceptable amino acids between
square brackets, e.g. [ACT] means Ala, Cys or Thr
- amino acids not accepted for a given position are indicated
by listing them between curly brackets, e.g. {GP} means 'not Gly
and not Pro'
- repetition is indicated using parentheses, e.g. A(3) means
Ala-Ala-Ala, x(2,4) means between 2 to 4 any residues
- the following lowercase characters can be used as additional
flags:
- 'x' means any amino acid
- 'a' means acidic residue: [DE]
- 'b' means basic residue: [KR]
- 'o' means hydrophobic residue: [ACFILPWVY]
- 'p' means aromatic residue: [WYF]
- 's' means solvent exposed residue
- 'h' means helical residue
- 'e' means extended residue
- 'f' means flexible residue
- Each position can optionally by followed by @<res_num> expression
that will match the position with a given residue number.
- Entire pattern can be followed by :<index> expression that defines
a 'hotspot' in the pattern. When the hotspot is defined, only
a single residue corresponding to (pattern_match_start+index-1)
will be returned as a match. The index is 1-based and can be used
to place the hotspot outside of the pattern (can also be
a negative number).
Pattern examples:
- N-{P}-[ST] : Asn-X-Ser or Thr (X != Pro)
- N[sf]-{P}[sf]-[ST][sf] : as above, but all residues flexible
OR solvent exposed
- Nsf-{P}sf-[ST]sf : as above, but all residues flexible
AND solvent exposed
- Ns{f} : Asn solvent exposed AND not in flexible region
- N[s{f}] : Asn solvent exposed OR not in flexible region
- [ab]{K}{s}f : acidic OR basic, with exception of Lys,
flexible AND not solvent exposed
- Ahe : Ala helical AND extended - no match possible
- A[he] : Ala helical OR extended
- A{he} : Ala coiled chain conformation (not helical nor extended)
- [ST] : Ser OR Thr
- ST : Ser AND Thr - no match possible
:type validate_pattern: boolean
:param validate_pattern: If True, the function will validate the pattern
without performing the search (the sequences parameter will be
ignored) and return True if the pattern is valid, or False
otherwise. The default is False.
:rtype: list of lists of integer tuples or False if the pattern is invalid
:return: False if the specified input pattern was incorrect.
Otherwise, it returns a list of lists of matches for each
input sequence. Each match is a (start, end) tuple where
start and end are matching sequence positions.
"""
'''
FIXME This will modify the user's input. Not good. Intead we can expect
the user to do this themselves.
if invert and sequence_list:
for k in sequences.values():
sequences[k] = sequences[k][::-1]
'''
# Cast unicode pattern to str to avoid b''.translate() exception
pattern = str(pattern)
# Remove color designator if present
if '#' in pattern:
pattern = pattern[:pattern.find('#')]
hotspot = None
# Extract a hotspot marker
if ':' in pattern:
pos = pattern.find(':')
try:
hotspot = int(pattern[pos + 1:])
except ValueError:
hotspot = None
else:
pattern = pattern[:pos]
# Split the pattern by dash symbols
plist = [p for p in pattern.split('-') if p]
pattern = ""
idlist = []
# Extract residue number matches
for index, p in enumerate(plist):
_id = ""
match = re.match(r"""
(.*) # everything before the @
@(\d+[a-zA-Z]?) # the @ and residue number with optional insertion code
(.*)$ # everything after the residue number
""",
p,
flags=re.VERBOSE)
if match:
# Fail if a residue number match is requested but
# the sequence list does not have residue number information
if sequence_list and not sequence_list[0].get("res_ids"):
return False
p = match.group(1) + match.group(3)
_id = match.group(2)
else:
_id = ""
if p == "":
p = "x"
plist[index] = p
idlist.append(_id)
replace_wildcards = lambda x: x.replace('a', 'ED'). \
replace('b', 'KR').replace('o', 'WILYFVPCA').replace('p', 'WYF')
# Convert the pattern to a regular expression (a bit clunky code)
for p in plist:
s = p.split('(')
p = s[0]
pattern += '('
# Extract included, excluded and mandatory amino acids
incl_aa = delete_from_str(''.join(re.findall(r"\[[abopA-Z]+\]", p)),
'[]')
excl_aa = delete_from_str(''.join(re.findall(r"\{[A-Zabop]+\}", p)),
'{}')
mand_aa = delete_from_str(p, incl_aa + excl_aa + "[]{}efhs")
incl_aa = replace_wildcards(incl_aa)
excl_aa = replace_wildcards(excl_aa)
if 'x' in incl_aa + mand_aa:
pattern += '.'
else:
if 'x' in excl_aa:
# Exclude everything
return False
if len(mand_aa) > 1:
# Wrong pattern, no match possible
return False
if mand_aa:
if mand_aa in "abop":
incl_aa = replace_wildcards(mand_aa)
else:
pattern += mand_aa
if incl_aa or excl_aa:
pattern += '['
if incl_aa:
pattern += delete_from_str(incl_aa, excl_aa)
else:
pattern += '^' + excl_aa
pattern += ']'
# Extract included, excluded and mandatory flags
incl_flag = delete_from_str(''.join(re.findall(r"\[[efhs\{\}]+\]", p)),
'[]{}')
excl_flag = delete_from_str(''.join(re.findall(r"\{[efhs\[\]]+\}", p)),
'{}[]')
mand_flag = [
c for c in delete_from_str(p, incl_flag + excl_flag + "[]{}")
if c in "efhs"
]
incl_flag = delete_from_str(incl_flag, excl_flag)
if 'e' in mand_flag and 'h' in mand_flag:
# Can't be simultaneously helix and extended
return False
p = ""
for f in ['eh', 's', 'f']:
p = ""
for c in f:
if c in mand_flag:
p = c
elif c in incl_flag:
if '[' in p:
p = '[' + c + p[1:]
else:
p = '[' + c + ']'
elif c in excl_flag:
if '[' in p:
p = '[^' + c + p[2:]
else:
p = '[^' + c + ']'
if not p:
p = '.'
pattern += p
pattern += '-)'
# Add number of pattern repetitions
if len(s) > 1:
pattern += '{' + s[1].replace(')', '}')
# Compile the pattern
try:
pattern = re.compile(pattern)
except Exception:
# Syntax error: quit
return False
if validate_pattern:
return True
results = []
# Use the pattern to search the sequences
for sequence in sequence_list:
match_list = []
if 'amino_acids' in sequence:
length = len(sequence['amino_acids'])
if 'secondary_structure' not in sequence:
sequence['secondary_structure'] = ' ' * length
if 'solvent_accessibility' not in sequence:
sequence['solvent_accessibility'] = ' ' * length
else:
# replace "e" for "exposed" with "s" for "solvent-exposed"
sequence['solvent_accessibility'] = sequence[
'solvent_accessibility'].replace('e', 's')
if 'flexibility' not in sequence:
sequence['flexibility'] = ' ' * length
# Convert the sequence to flat format: Ahsf-
# (5 characters per residue)
seq_string = ''.join([
''.join(s) for s in
zip(sequence['amino_acids'], sequence['secondary_structure'],
sequence['solvent_accessibility'], sequence['flexibility'],
'-' * length)
])
# Build list of matches
match_list = []
match = pattern.search(seq_string)
while match:
start = match.start()
end = match.end()
# Now test residue IDs
has_match = True
if 'res_ids' in sequence and any(idlist):
res_ids = sequence['res_ids']
for index, id in enumerate(idlist):
if id and id != res_ids[old_div(start, 5) + index]:
has_match = False
break
if has_match:
if hotspot is None:
match_list.append((old_div(start, 5), old_div(end, 5)))
else:
match_list.append(((old_div(start, 5)) + hotspot - 1,
(old_div(start, 5)) + hotspot - 1))
# Why is the second item needed?
# It is only needed because MSV expects a range
# of residues returned for each match, so it knows
# start and end position. If the hotspot is defined,
# start == end. Thus MSV doesn't have to know
# if the pattern included hotspot definition or not.
# If the caller knows the hotspot was defined,
# it can simply ignore the second value.
match = pattern.search(seq_string, start + 1)
# Append the result even if it's []
results.append(match_list)
return results
[docs]def convert_structure_sequence_for_pattern_search(seq, sasa_by_atom=None):
"""
Converts a StructureSequence object to dictionary required
by find_generalized_pattern function. Because the conversion
can be time consuming, it should be done once per sequence.
Optionally a list of atom SASAs for each atom in the CT can be specified.
If it's not specified, it will get calculated by calling
analyze.calculate_sasa_by_atom().
:type seq: `StructureSequence`
:param seq: StructureSequence object
:type sasa_by_atom: list
:param sasa_by_atom: list of atom SASAs
:rtype: dict
:return: Dictionary of sequence information
"""
valid_amino_acids = list(constants.AMINO_ACIDS_3_TO_1)
seq_dict = {}
seq_dict["amino_acids"] = ""
seq_dict["secondary_structure"] = ""
seq_dict["flexibility"] = ""
seq_dict["solvent_accessibility"] = ""
seq_dict["res_ids"] = []
if len(seq.residue) == 0:
return None
avg_b_factor = 0.0
n = 0
for res in seq.residue:
b_factor = res.temperature_factor
if b_factor > 0:
avg_b_factor += b_factor
n += 1
if n:
avg_b_factor /= n
# Calculates the std SASAs. The calculations are made only the first time
# this function is called:
std_sasa_dict = calculate_sasa_dict()
# Calculate SASA only once for all atoms (and only if it wasn't specified):
if sasa_by_atom is None:
sasa_by_atom = analyze.calculate_sasa_by_atom(seq._st)
for res in seq.residue:
# sequence
code = 'X'
if res.pdbres[:3] in valid_amino_acids:
code = constants.AMINO_ACIDS_3_TO_1[res.pdbres[:3]]
seq_dict["amino_acids"] += code
# residue numbers
res_id = "%s%s" % (res.resnum, res.inscode.strip())
seq_dict["res_ids"].append(res_id)
# secondary structure
code = ' '
if res.secondary_structure == 1:
code = 'h'
elif res.secondary_structure == 2:
code = 'e'
seq_dict["secondary_structure"] += code
# flexibility
code = ' '
if res.temperature_factor > avg_b_factor:
code = 'f'
seq_dict["flexibility"] += code
# solvent accessibility
code = ' '
# sasa = analyze.calculate_sasa(seq._st, res.getAtomIndices())
sasa = sum([sasa_by_atom[a.index - 1] for a in res.atom])
if res.pdbres[:3] in std_sasa_dict:
sasa /= std_sasa_dict[res.pdbres[:3]]
if sasa > 1.0:
code = 'e'
seq_dict["solvent_accessibility"] += code
return seq_dict
[docs]def find_pattern(seq, pattern):
"""
Find pattern matches in a specified StructureSequence object.
Returns a list of matching positions.
:type seq: `StructureSequence`
:param seq: StructureSequence object
:type pattern: string
:param pattern: Sequence pattern. The syntax is described in
find_generalized_pattern.
:rtype: list of lists of integer tuples or None
:return: None if the specified input pattern was incorrect.
Otherwise, it returns a list of lists of matches for each
residue position in the input structure. Each match is a
(start, end) tuple where start and end are matching sequence
positions. If 'hotspot' is specified then start = end.
"""
seq_dict = convert_structure_sequence_for_pattern_search(seq)
return find_generalized_pattern([seq_dict], pattern)
[docs]def assign_residue_numbers(residues, start_res=None, end_res=None):
"""
Assign residue numbers to the given residues based on the residues before
and after
:param residues: Residues that need numbering. Will be modified in-place.
:type residues: list[residue.Residue]
:param start_res: Previous residue. Pass None if the residues are N-terminal
:type start_res: residue.Residue or NoneType
:param end_res: Next residue. Pass None if the residues are C-terminal
:type end_res: residue.Residue or NoneType
"""
# Make shallow copy without gaps
residues = [elem for elem in residues if elem.is_res]
start_num = None
resnum_and_inscode = None
num_to_renumber = len(residues)
if start_res is None:
if end_res is None:
# Both are None - number starting from 1
start_num = 1
else:
# Number N-terminal residues
start_num = end_res.resnum - num_to_renumber
elif end_res is None or (end_res.resnum - start_res.resnum - 1 >=
num_to_renumber):
# Residues are C-terminal or there are enough missing residue numbers
# that each residue can have its own number without the need for
# insertion codes.
start_num = start_res.resnum + 1
else:
# There are more residues than residue numbers, so we need
# to use insertion codes.
resnum_and_inscode = gen_resnums_and_inscodes(start_res.resnum,
start_res.inscode,
end_res.resnum,
end_res.inscode)
if len(resnum_and_inscode) < num_to_renumber:
# There are more residues than we can possibly fit even
# using every number + insertion code combination, so
# pad with Nones
nones = [None] * (num_to_renumber - len(resnum_and_inscode))
resnum_and_inscode.extend(nones)
if resnum_and_inscode is not None:
for resnum, res in zip(resnum_and_inscode, residues):
if resnum is None:
inscode = None
else:
resnum, inscode = resnum
res.resnum = resnum
res.inscode = inscode
else:
for idx, res in enumerate(residues, start=start_num):
res.resnum = idx
res.inscode = None
[docs]def gen_resnums_and_inscodes(start_resnum, start_inscode, end_resnum,
end_inscode):
"""
Create a list of all residue numbers/insertion code combinations
possible between the given endpoints. If the ending residue number and
insertion code are less than or equal to the starting residue number and
insertion code, then an empty list will be returned.
:param start_resnum: The starting residue number.
:type start_resnum: int
:param start_inscode: The starting insertion code.
:type start_inscode: str
:param end_resnum: The ending residue number.
:type end_resnum: int
:param end_inscode: The ending insertion code.
:type end_inscode: str
:return: A list of residue numbers and insertion codes
:rtype: list[tuple[int, str]]
"""
# Make sure that the insertion codes are ASCII letters or blanks
start_inscode = start_inscode.upper() or " "
end_inscode = end_inscode.upper() or " "
for resnum, inscode in ((start_resnum, start_inscode), (end_resnum,
end_inscode)):
if inscode not in string.ascii_uppercase + " ":
raise IOError("Invalid insertion code for residue %s%s." %
(resnum, inscode))
if end_resnum < start_resnum or (end_resnum == start_resnum and
(end_inscode <= start_inscode or
end_inscode == " ")):
# the end point is before the start point
return []
cur_resnum = start_resnum
cur_inscode = start_inscode
resnum_and_inscode = []
while True:
if cur_inscode == " ":
cur_inscode = "A"
elif cur_inscode == "Z":
cur_inscode = " "
cur_resnum += 1
else:
cur_inscode = chr(ord(cur_inscode) + 1)
if cur_resnum == end_resnum and cur_inscode == end_inscode:
return resnum_and_inscode
resnum_and_inscode.append((cur_resnum, cur_inscode))
def _get_protein_sequence(chain):
"""
Converts a protein chain to a ProteinSequence object.
:param chain: Protein chain to get the sequence of.
:type chain: `structure._Chain`
:return: A protein sequence.
:rtype: `ProteinSequence`
"""
sequence = ProteinSequence([res.getCode() for res in chain.residue])
sequence.chain = chain.name
return sequence
[docs]def get_pairwise_sequence_similarity(chain1,
chain2,
consider_gap=True,
method='muscle'):
"""
Given two single chain sequences, align them,
and return sequence similarity among them.
:param chain1: The first sequence chain.
:type chain1: `structure._Chain`
:param chain2: The second sequence chain.
:type chain2: `structure._Chain`
:param consider_gap: Whether or not to consider gaps in the alignment,
default to true.
:type consider_gap: bool
:param method: Which alignment method to use ('muscle' or 'clustalw')
:type method: string
:return: Sequence similarity of the alignment of the two.
:rtype: float, between 0.0 and 1.0
"""
with ioredirect.IOSilence():
aln = align_from_chains([chain1, chain2], method=method)
return aln[0].getSimilarity(aln[1], consider_gap)
[docs]def create_alignment_from_chains(chains):
"""
Return `ProteinAlignment` object comprised of two chains
:param chains: Chains to be aligned
:type chains: iterable(structure._Chain)
"""
# Dev note: Consider moving this to a ProteinSequence class method
# To avoid a circular import
from schrodinger.protein.alignment import ProteinAlignment
aln = ProteinAlignment()
for chain in chains:
seq = _get_protein_sequence(chain)
aln.addSeq(seq)
return aln
[docs]def align_alignment(aln, second_aln=None, method='muscle'):
"""
Perform alignment from an ProteinAlignment object
:param aln: Alignment data
:type aln: `ProteinAlignment`
:param method: Which method/program to use
:type method: string
:return: Aligned sequences
:rtype: `ProteinAlignment`
"""
if method == 'muscle':
from schrodinger.protein.tasks import muscle
job = muscle.MuscleJob(aln, second_aln=second_aln)
new_aln = job.run()
elif method == 'clustalw':
from schrodinger.protein.tasks import clustal
if second_aln:
raise NotImplementedError("""
ClustalW2 alignment of alignments (profiling)
is not supported directly via this API. Consider using
MUSCLE instead (method='muscle')
""")
job = clustal.ClustalJob(aln)
new_aln = job.run()
else:
raise ValueError("Unsupported alignment method: {}".format(method))
return new_aln
[docs]def align_from_chains(chains, method='muscle'):
"""
Perform alignment on a series of chains
:param chains: Chains to be aligned
:type chains: iterable(structure._Chain)
:param method: Which method/program to use (choices 'muscle', 'clustalw')
:type method: string
:return: Aligned sequences
:rtype: `ProteinAlignment`
"""
aln = create_alignment_from_chains(chains)
return align_alignment(aln, method=method)
[docs]def get_aligned_residues(st1, st2, method='muscle'):
"""
This generator will yield 2 structure._Residue objects - one from each
structure - for each position in aligned sequences.
:param st1: First structure.
:type st1: `structure.Structure`
:param st2: Second structure
:type st2: `structure.Structure`
:return: Generates tuples of 2 residues that align at each position.
:rtype: generator(structure._Residue or None, structure._Residue or None)
:raises ValueError: if structures don't have equivalent chains.
"""
if st1.chain.chainNames() != st2.chain.chainNames():
raise ValueError("Input structures must have same chains")
for chain1, chain2 in zip(st1.chain, st2.chain):
alignment = align_from_chains([chain1, chain2], method=method)
residues1 = list(chain1.residue)
residues2 = list(chain2.residue)
for seq_res1, seq_res2 in zip(*alignment):
st_res1 = residues1[
seq_res1.gapless_idx_in_seq] if seq_res1.is_res else None
st_res2 = residues2[
seq_res2.gapless_idx_in_seq] if seq_res2.is_res else None
yield st_res1, st_res2
[docs]def get_aligned_structure_residues(sts, method='muscle'):
"""
This generator will yield 2 structure._Residue objects - one from each
structure for each position in aligned sequences.
:param sts: Structures to align
:type sts: list(structure.Structure)
:return: Generates lists of residues that align at each position.
:rtype: generator(list[structure._Residue or None])
"""
# To avoid a circular import
from schrodinger.protein.alignment import ProteinAlignment
aln = ProteinAlignment()
for st in sts:
seq = ProteinSequence([res.getCode() for res in st.residue])
# sequence.chain = chain.name
aln.addSeq(seq)
alignment = align_alignment(aln, method=method)
residues_by_st = [list(st.residue) for st in sts]
for aligned_residues in zip(*alignment):
residues = [
residues_by_st[i][seq_res.gapless_idx_in_seq]
if seq_res.is_res else None
for i, seq_res in enumerate(aligned_residues)
]
yield residues
[docs]def offset_indices(indices):
"""
Offset insertion indices based on numbering before insertion to reflect
numbering after insertion.
For example, [1, 1, 2, 3, 5, 8] would be changed to [1, 2, 4, 6, 9, 13]
:type indices: list[int]
"""
sorted_indices = sorted(indices)
for i, cur_index in enumerate(sorted_indices):
sorted_indices[i] = cur_index + i
return sorted_indices