Source code for schrodinger.protein.sequence

"""
Implementation of ProteinSequence, Sequence, and StructureSequence class.

StructureSequence allows iteration over all sequences in a given protein CT,
and iteration over residues of each (in sequence order).
"""

import bisect
import collections
import copy
import dataclasses
import enum
import itertools
import re
import string
import types
import typing
from collections import namedtuple
from functools import partial
from typing import Dict

from past.utils import old_div

from Bio.Seq import translate

from schrodinger import structure
from schrodinger.application.msv import utils as msv_utils
from schrodinger.infra import mm
from schrodinger.models import json
from schrodinger.protein import annotation
from schrodinger.protein import constants
from schrodinger.protein import properties
from schrodinger.protein import residue
from schrodinger.protein.constants import AMINO_ACIDS_3_TO_1
from schrodinger.protein.properties import PropertyType
from schrodinger.Qt import QtCore
from schrodinger.structutils import analyze
from schrodinger.test import ioredirect
from schrodinger.ui.sequencealignment.sequence import delete_from_str
from schrodinger.ui.sequencealignment.structure_utils import calculate_sasa_dict

SEQ_ANNO_TYPES = annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES
_SecondaryStructure = namedtuple('_SecondaryStructure', ['limits', 'ssa_type'])
DISALLOWED_SEQ_CHARS = re.compile(r"[^a-zA-Z0-9~\.\-]")


[docs]def guess_seq_type(res_names): """ Takes an iterable of residue names and returns the appropriate sequence class :param res_names: An iterable of residue names. Note that all residue names must be uppercase. :type res_names: Iterable(str) :return: The appropriate class for the input sequence :rtype: Type[AbstractSingleChainSequence] Note that we use a pretty simple heuristic here. """ res_names = set(res_names) seq_classes = [RNASequence, DNASequence, ProteinSequence] element_names = [] for seq_class in seq_classes: eles = set(seq_class.ALPHABET.keys()) unk = seq_class._unknown_res_type eles.add(unk.short_code) eles.add(unk.long_code) element_names.append(eles) # Determine type by the highest number of matching pdbres names match_counts = [ len(res_names & seq_elem_names) for seq_elem_names in element_names ] if not any(match_counts): return ProteinSequence idxmax = match_counts.index(max(match_counts)) return seq_classes[idxmax]
[docs]def make_sequence(elements, *args, **kwargs): """ Guesses the appropriates Sequence type from the names of residues and returns an instance with those residues. :param elements: A list of strings to examine :type elements: list(str) :rtype: protein.Sequence :return: An instance of the appropriate type """ SeqClass = guess_seq_type(elements) return SeqClass(elements, *args, **kwargs)
[docs]class Sequence: # We use this as a base class for both actual sequence objects and # read-only sequence proxies. This is necessary for PyQt signal typing. pass
[docs]class AbstractSequence(Sequence, QtCore.QObject): """ A base class for single-chain and combined-chain biological sequences. :cvar ORIGIN: Possible sequence origins :vartype ORIGIN: enum.Enum :cvar AnnotationClass: Class to use for annotations :vartype AnnotationClass: annotation.SequenceAnnotations :cvar ElementClass: Class to use for elements :vartype ElementClass: Type[residue.Residue] :cvar ALPHABET: A mapping of string representations of elements to element types. Concrete subclasses must define this. :vartype ALPHABET: dict(str, residue.ElementType) :cvar _gap_chars: A tuple of permissible gap characters in the element list; the first item will be used for serialization. :vartype _gap_chars: tuple(str) :cvar _unknown_res_type: The type for an unknown residue :vartype _unknown_res_type: residue.ElementType :cvar residuesChanged: A signal emitted when sequence residues are changed. Emitted with the indices of the first and last changed residues. :vartype residuesChanged: QtCore.pyqtSignal :cvar lengthAboutToChange: A signal emitted when the sequence length is about to change. Emitted with the old and new lengths. :vartype lengthAboutToChange: QtCore.pyqtSignal :cvar lengthChanged: A signal emitted when the sequence length is changed. Emitted with the old and new lengths. :vartype lengthChanged: QtCore.pyqtSignal :cvar nameChanged: A signal emitted when the sequence name is changed. :vartype nameChanged: QtCore.pyqtSignal :cvar visibilityChanged: A signal emitted when the visibility is changed. :vartype visibilityChanged: QtCore.pyqtSignal :cvar structureChanged: A signal emitted when the structure changes. :vartype structureChanged: QtCore.pyqtSignal :cvar annotationTitleChanged: A signal emitted when an annotation title is changed. :vartype annotationTitleChanged: QtCore.pyqtSignal """ ORIGIN = enum.Enum("ORIGIN", ["Maestro", "PyMOL"]) AnnotationClass = None ElementClass = residue.Residue ALPHABET = {} _gap_char = "~" _gap_chars = frozenset((_gap_char)) _unknown_res_type = residue.UNKNOWN residuesRemoved = QtCore.pyqtSignal(set) residuesAdded = QtCore.pyqtSignal(set) residuesChanged = QtCore.pyqtSignal(int, int) lengthAboutToChange = QtCore.pyqtSignal(int, int) lengthChanged = QtCore.pyqtSignal(int, int) nameChanged = QtCore.pyqtSignal() visibilityChanged = QtCore.pyqtSignal() structureChanged = QtCore.pyqtSignal() annotationTitleChanged = QtCore.pyqtSignal() pfamChanged = QtCore.pyqtSignal() descriptorsCleared = QtCore.pyqtSignal() # ========================================================================== # Magic Methods # ==========================================================================
[docs] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.AnnotationClass: self._annotations = self.AnnotationClass(self) self._annotations.titleChanged.connect(self.annotationTitleChanged) else: self._annotations = None self.residuesRemoved.connect( self._invalidateDescriptorsIfResiduesChanged) self.residuesAdded.connect(self._invalidateDescriptorsIfResiduesChanged) self.structureChanged.connect(self._invalidateDescriptors) self.clearDescriptors()
# ========================================================================== # Residues # ==========================================================================
[docs] def residues(self): """ Return an iterable of residues, ignoring gaps. :return: Iterable of residues :rtype: iter(Residue) """ for res in self: if res.is_res: yield res
[docs] def getNextResidue(self, res): """ Return the next residue in the sequence (ignoring gaps) or None if this is the last residue. :param res: A given residue in the sequence :type res: schrodinger.protein.residue.Residue :return: The previous residue in the sequence :rtype: schrodinger.protein.residue.Residue """ indices = range(res.idx_in_seq + 1, len(self)) for position in indices: if not self[position].is_gap: return self[position]
[docs] def getPreviousResidue(self, res): """ Return the previous residue in the sequence (ignoring gaps) or None if this is the first residue. :param res: A given residue in the sequence :type res: schrodinger.protein.residue.Residue :return: The previous residue in the sequence :rtype: `schrodinger.protein.residue.Residue` """ indices = range(res.idx_in_seq - 1, -1, -1) for position in indices: if not self[position].is_gap: return self[position]
[docs] def iterNeighbors(self): """ Return an iterable of three element tuples consisting of (prev_res, curr_res, next_res), ignoring gaps. None is used for neighbors of first and last residues in the sequence, and does not indicate gaps here. :return: Iterable of 3-tuples, each element of the each tuple being either a `schrodinger.protein.residue.Residue` or None :rtype: iter(tuple(Residue or NoneType, Residue, Residue or NoneType)) """ res_iter = self.residues() try: next_res = next(res_iter) except StopIteration: return prev_res, cur_res = None, None # Add None to the end to represent the neighbor of the last residue for res in itertools.chain(res_iter, [None]): prev_res, cur_res, next_res = cur_res, next_res, res yield (prev_res, cur_res, next_res)
[docs] def index(self, res): """ Returns the index of the specified residue. :param res: The residue to find :type res: residue.Residue :return: The index of the residue :rtype: int """ raise NotImplementedError
[docs] def indices(self, residues): """ Returns the indices of all specified residues. Note that there is no guarantee that the returned integers will be in the same order as the input residues. (For combined-chain sequences, it's highly likely that they *won't* be.) :param res: The residues to find indices of :type res: Iterable(residue.Residue) :return: The indices of the residues :rtype: list[int] """ raise NotImplementedError
[docs] def getRun(self, res): """ For a given residue or gap, return a set of all adjacent element indices in the sequence that are also residues or gaps. :param res: Residue to get the run of :type res: residue.AbstractSequenceElement :reuturn: Set of residue indices in the run :rtype: set(int) """ res_idx = self.index(res) run_idx = set([res_idx]) curr_idx = res_idx - 1 while (curr_idx >= 0 and self[curr_idx] and self[curr_idx].is_gap == res.is_gap): run_idx.add(curr_idx) curr_idx -= 1 curr_idx = res_idx + 1 while (curr_idx < len(self) and self[curr_idx] and self[curr_idx].is_gap == res.is_gap): run_idx.add(curr_idx) curr_idx += 1 return run_idx
[docs] def insertElements(self, index, elements): """ Insert a list of elements or sequence element into this sequence. If `elements` is a string or iterable of strings, residue numbers will be automatically assigned. :param index: The index at which to insert elements :type index: int :param elements: A list of elements to insert :type elements: iterable(self.ElementClass) or iterable(str) """ raise NotImplementedError
[docs] def mutate(self, start, end, elements): """ Mutate sequence elements starting at the given index to the provided elements. :param start: The index at which to start mutating :type start: int :param end: The index of the last mutated element (exclusive) :type end: int :param elements: The elements to which to mutate the sequence :type elements: iterable(self.ElementClass) or iterable(str) """ raise NotImplementedError
[docs] def append(self, element): """ Appends an element to the sequence :param element: The element to append to this sequence :type: element: self.ElementClass or basestring """ raise NotImplementedError
[docs] def extend(self, elements): """ Extends the sequence with elements from an iterable :param elements: The iterable containing elements with which to extend this sequence :type elements: iterable(self.ElementClass) or iterable(str) """ raise NotImplementedError
[docs] def getSubsequence(self, start, end): """ Return a sequence containing a subset of the elements in this one :param start: The index at which the subsequence should start :type start: int :param end: The index at which the subsequence should end (exclusive) :type end: int :return: The requested subsequence :rtype: AbstractSequence """ raise NotImplementedError
[docs] def removeElements(self, eles): """ Remove elements from the sequence. :param eles: A list of elements to remove from the sequence. :type eles: list(residue.AbstractSequenceElement) :raise ValueError: If any of the given elements are not in the sequence. """ raise NotImplementedError
@property def _res_positions(self): """ Return the residue positions in the sequence, updating the cache if needed. :return: The residue positions :rtype: dict(`schrodinger.protein.residue.AbstractSequenceElement`, tuple(int, int)) """ if self._res_positions_cache is None: self._updateResiduePositionCache() return self._res_positions_cache def _removeFromSequence(self, keep_func, start=0, end=None): """ Remove any residues not matching keep_func from the sequence :param keep_func: A callable taking a residue and returning a bool indicating whether to keep it in the sequence :type keep_func: callable :param start: The index at which to start filtering :type start: int :param end: The index at which to end filtering :type end: int :raises ValueError: In the event that invalid indices are specified """ raise NotImplementedError # ========================================================================== # Gaps # ========================================================================== @property def gap_char(self): # Make gap_char a read-only property to avoid string comparison errors return self._gap_char
[docs] def getGaplessLength(self): """ :return: Length of this sequence ignoring gaps :rtype: int """ raise NotImplementedError
[docs] def getGaps(self): """ :rtype: list(residue.Gap) :return: The gaps in the sequence. """ return [res for res in self if res.is_gap]
[docs] def addGapsByIndices(self, gap_idxs): """ Add gaps to the sequence from a list of gap indices. Note that these indices are based on numbering *after* the insertion. To insert gaps using indices based on numbering before the insertion, see `addGapsBeforeIndices`. :param gap_idxs: A list of gap indices :type gap_idxs: list(int) """ raise NotImplementedError
[docs] def validateGapIndices(self, gap_idxs): """ Make sure that the specified gap indices are valid input for `addGapsByIndices`. If the indices are invalid, a `ValueError` will be raised. Indices are considered invalid if: - they refer to a position that's more than one residue past the end of the sequence - they are negative :param gap_idxs: The gap indices to validate :type gap_idxs: list[int] """ if any(idx < 0 for idx in gap_idxs): raise ValueError("Negative gap index") for i, cur_gap_idx in enumerate(sorted(gap_idxs)): if cur_gap_idx > len(self) + i: raise ValueError("Invalid gap index")
[docs] def addGapsBeforeIndices(self, indices): """ Add one gap to the alignment before each of the specified residue positions. Note that these indices are based on numbering *before* the insertion. To insert gaps using indices based on numbering after the insertion, see `addGapsByIndices`. :param indices: A list of indices to insert gaps before. :type indices: list(int) """ if not indices: return indices = offset_indices(indices) self.addGapsByIndices(indices)
[docs] def removeTerminalGaps(self): """ Remove gaps from the end of the sequence """ terminal_gaps = self.getTerminalGaps() if terminal_gaps: self.removeElements(terminal_gaps)
[docs] def getTerminalGaps(self): """ Return terminal gaps. :return: A list of terminal gaps (in ascending index order) :rtype: list(residue.Gap) """ last_res_idx = len(self) - 1 while last_res_idx >= 0 and self[last_res_idx].is_gap: last_res_idx -= 1 return self[last_res_idx + 1:]
[docs] def getGapCount(self): """ :return: the number of gaps in the sequence :rtype: int """ return len(self) - self.getGaplessLength()
[docs] def removeAllGaps(self): """ Remove gaps from the sequence """ self._removeFromSequence(lambda res: res.is_res, 0)
# ========================================================================== # Annotations # ========================================================================== @property def annotations(self): return self._annotations
[docs] def getAnnotation(self, index, annotation): """ Returns the annotation at the specified index or None for a gap. :raises ValueError: if the annotation is not available """ anno_name = annotation.name if index < len(self): element = self[index] if element.is_res: try: return getattr(element, anno_name) except AttributeError: if (self.annotations is None or not isinstance( annotation, self.annotations.ANNOTATION_TYPES)): msg = "%s is not in annotation types" % anno_name raise ValueError(msg) # get sequence-level annotation array = getattr(self.annotations, anno_name) return array[index] return None
[docs] def getAnnotationValueForComparison(self, col, anno, ann_index=0, cdr_scheme=None): """ Get an annotation value appropriate for determining whether two residues have the same annotation value. e.g. for binding site, only same-distance contacts of the same ligand should compare equal. :param anno: Protein sequence annotation enum member :type anno: annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES :param ann_index: Annotation index for multi-value annotations :type ann_index: int :param cdr_scheme: CDR scheme for antibody annotation. (Will be ignored if `anno` is not SEQ_ANNO_TYPES.antibody_cdr.) :type cdr_scheme: annotation.AntibodyCDRScheme :return: Annotation value for comparison or None if the corresponding residue is a gap or has no value for this annotation :rtype: object or None """ elem = self[col] if not anno.can_expand or elem.is_gap: return None if anno.multi_value: if anno is SEQ_ANNO_TYPES.domains: if self.annotations.domains is None or elem.domains is None: return None domain = self.annotations.domains[ann_index] if domain in elem.domains: # Only return domain string if the current residue is part # of this domain return domain return None elif anno is SEQ_ANNO_TYPES.binding_sites: # Return a tuple of (ligand name, contact_enum) so that only # contacts of the same ligand match if not self.annotations.ligands: return None lig = self.annotations.ligands[ann_index] lig_contacts = self.getAnnotation(col, anno) binding_site = lig_contacts[ann_index] if binding_site is annotation.BINDING_SITE.NoContact: return None return (lig, binding_site) else: # TODO MSV-3355 test whether this works for kinase conservation anno_val = self.getAnnotation(col, anno) if not anno_val: return None return anno_val[ann_index] elif anno is SEQ_ANNO_TYPES.antibody_cdr: cdr_feature = self.annotations.getAntibodyCDR(col, scheme=cdr_scheme) cdr_label = cdr_feature.label if cdr_label is annotation.AntibodyCDRLabel.NotCDR: return None return cdr_label elif anno is SEQ_ANNO_TYPES.kinase_features: kinase_feature = elem.kinase_features if kinase_feature is annotation.KinaseFeatureLabel.NO_FEATURE: return None return kinase_feature else: return self.getAnnotation(col, anno)
[docs] def getNumAnnValues(self, ann): return self.annotations.getNumAnnValues(ann)
[docs] def clearAllCaching(self): """ This method should be implemented in subclasses that cache any data. """
[docs] def setPfam(self, new_pfam, pfam_name): residues = list(self.residues()) if len(new_pfam) != len(residues): raise ValueError("Length of `new_pfam` must be equal to the number " "of residues in the sequence." f"Got: {len(new_pfam)} Expected: {len(residues)}") for res, p, in zip(residues, new_pfam): res.pfam = p self.pfam_name = pfam_name self.pfamChanged.emit()
[docs] def clearPfam(self): for res in self.residues(): res.pfam = None self.pfamChanged.emit()
[docs] def hasPfam(self): return not all(p is None for p in self.annotations.pfam)
# ========================================================================== # Structure # ==========================================================================
[docs] def hasStructure(self): """ :return: Whether this sequence has an associated structure. :rtype: bool """ raise NotImplementedError
[docs] def getStructure(self): """ :return: The associated structure. Will return None if there is no associated structure. :rtype: schrodinger.structure.Structure or NoneType """ raise NotImplementedError
[docs] def setStructure(self, struc): """ Set the associated structure. Can only be used on sequences with an associated structure. :param struc: The new structure for this sequence :type struc: schrodinger.structure.Structure :raise RuntimeError: If there's no structure associated with this sequence object. """ raise NotImplementedError
# ========================================================================== # Metrics # ========================================================================== def _getScore(self, scoring_function, other, consider_gaps=True, only_consider=None): """ Calculate and return the average score after applying a scoring function to the aligned residues of this sequence and the other sequence. :param scoring_function: A function that takes two residues and returns a score. :type scoring_function: callable(Residue, Residue) :param other: A sequence to compare against :type other: schrodinger.protein.sequence.Sequence :param consider_gaps: Whether we should count gaps when we're calculating the average score. Note that the scoring function is not applied to gaps, they are only used in finding the N to divide the total score by. :type consider_gaps: bool :param only_consider: Restrict the calculation to a subset of the specified residues :type: None or set :return: The average score after applying the scoring function to the aligned residues of this sequence and the other sequence. :rtype: float """ total_score = 0.0 total_length = 0 for res1, res2 in zip(self, other): if only_consider is not None: if res1 not in only_consider: continue if res1.is_gap and res2.is_gap: # Don't consider gap-only columns continue elif res1.is_res and res2.is_res: total_score += scoring_function(res1, res2) total_length += 1 elif consider_gaps: total_length += 1 if total_length > 0: return total_score / total_length else: return 0.0
[docs] def getIdentity(self, other, consider_gaps=True, only_consider=None): """ Return a float scoring the identity between the sequence and another sequence, assuming that they're already aligned :param other: A sequence to compare against :type other: schrodinger.protein.sequence.Sequence :param consider_gaps: Whether we should count gaps when we're calculating the average score. :type consider_gaps: bool :return: The sequence identity score (between 0.0 and 1.0) :rtype: float """ return self._getScore(self.ElementClass.getIdentity, other, consider_gaps, only_consider=only_consider)
[docs] def getSimilarity(self, other, consider_gaps=True, only_consider=None): """ Return a float score of the similarity count between the sequence and another sequence, assuming that they're already aligned. :param other: A sequence to compare against :type other: schrodinger.protein.sequence.Sequence :param consider_gaps: Whether we should count gaps when we're calculating the average score. :type consider_gaps: bool :return: The sequence similarity score (between 0.0 and 1.0) :rtype: float """ return self._getScore(self.ElementClass.getBinarySimilarity, other, consider_gaps, only_consider=only_consider)
[docs] def getConservation(self, other, consider_gaps=True, only_consider=None): """ Return a float scoring the homology conservation between the sequence and another sequence, assuming that they're already aligned. The homology criterion is based on "side chain chemistry" descriptor matching. :param other: A sequence to compare against :type other: schrodinger.protein.sequence.Sequence :param consider_gaps: Whether we should count gaps when we're calculating the average score. :type consider_gaps: bool :return: The sequence conservation score (between 0.0 and 1.0) :rtype: float """ return self._getScore(self.ElementClass.getConservation, other, consider_gaps, only_consider=only_consider)
[docs] def getSimilarityScore(self, other, consider_gaps=True, only_consider=None): """ Return the total score of similarity between the sequence and a other sequence, assuming that they're already aligned. :param other: A sequence to compare against :type other: schrodinger.protein.sequence.Sequence :param consider_gaps: Ignored because the similarity with a gap is 0.0 :type consider_gaps: bool :param only_consider: A set of residues to restrict attention to :type only_consider: set or None :return: The total sequence similarity score :rtype: int """ def include_in_calculation(pair): res1, res2 = pair if res1.is_gap or res2.is_gap: return False if only_consider is not None and res1 not in only_consider: return False return True residue_pairs = zip(self, other) residue_pairs = ( pair for pair in residue_pairs if include_in_calculation(pair)) similarities = ( self.ElementClass.getSimilarity(*pair) for pair in residue_pairs) return int(sum(similarities))
[docs] def getStructureResForRes(self, res): """ :param res: Residue to get structure residue for :type res: residue.Residue :return: Structure residue or None if no matching residue is found :rtype: schrodinger.structure._Residue or NoneType """ raise NotImplementedError
[docs] def structuredResidueCount(self): """ Get the number of residues in this sequence with an associated structured residue. :rtype: int """ if not self.hasStructure(): return 0 return sum( 1 for elem in self if (not elem.is_gap and not elem.seqres_only))
[docs] def hasStructuredResidues(self): """ Return whether this sequence has any structured residues. This method is equivalent to `bool(seq.structuredResidueCount())` but doesn't (typically) require iterating through the entire sequence. :rtype: bool """ return (self.hasStructure() and any(not (elem.is_gap or elem.seqres_only) for elem in self))
[docs] def getProperty(self, seq_prop): """ Get the sequence's value corresponding to the given SequenceProperty object :param seq_prop: The object describing the sequence property :type seq_prop: schrodinger.protein.properties.SequenceProperty :return: The value of the sequence property :rtype: float or None """ if seq_prop.property_type == PropertyType.StructureProperty: st = self.getStructure() if st is not None: return st.property.get(seq_prop.property_name) else: descriptors = self._descriptors[seq_prop.property_source] return descriptors.get(seq_prop.property_name)
[docs] def updateDescriptors(self, descriptors, property_source): """ Updates the descriptor dicts with new descriptor values :param descriptors: A dict mapping descriptor names to their values :type descriptors: dict[str, float] :param property_source: The source of the descriptors :type property_source: properties.PropertySource """ self._descriptors[property_source].update(descriptors)
[docs] def clearDescriptors(self): self._descriptors = { properties.PropertySource.Sequence: dict(), properties.PropertySource.Structure: dict(), properties.PropertySource.Residue: dict(), } self.descriptorsCleared.emit()
@property def descriptors(self): return self._descriptors
[docs] def hasDescriptors(self): for descriptors in self._descriptors.values(): if descriptors: return True return False
def _invalidateDescriptorsIfResiduesChanged(self, elements): if not self.hasDescriptors(): return if any(elem.is_res for elem in elements): self.clearDescriptors() def _invalidateDescriptors(self): if self.hasDescriptors(): self.clearDescriptors()
# To convince Hypothesis that Sequence is a sequence (MSV-1966) collections.abc.Sequence.register(AbstractSequence)
[docs]class AbstractSingleChainSequence(AbstractSequence): """ Base class for single-chain biological sequences Note: Protein-specific functionality should go in ProteinSequence. :cvar sequenceCopied: A signal emitted when this sequence is copied. Emitted with the sequence being copied and the newly created copy. This signal is used by the structure model to make sure that the newly created copy is kept in sync with the structure. :vartype sequenceCopied: QtCore.pyqtSignal """ sequenceCopied = QtCore.pyqtSignal(object, object) # ========================================================================== # Magic Methods # ==========================================================================
[docs] def __init__(self, elements="", name="", origin=None, entry_id=None, entry_name="", pdb_id="", chain="", structure_chain=None, long_name="", resnums=None): """ Make a sequence object from a list of strings and/or `self.ElementClass` Strings are converted to `self.ElementClass` using a mapping of strings to element types. :param elements: An iterable of elements making up the sequence. `None` are interpretted as gaps. :type elements: iterable(self.ElementClass or None) or iterable(str or None) :param name: The name of the sequence (possibly shortened), used for display purposes. For a FASTA sequence, this should be a short identifier such as the Uniprot ID. :type name: str :param origin: A piece of metadata indicating where the sequence came from :type origin: Sequence.ORIGIN or None :param entry_id: An entry associated with the sequence, if any :type: str :param entry_name: An entry name associated with the sequence, if any :type: str :param pdb_id: An id associated with the sequence, if any :type: str :param chain: The chain to which the sequence belongs :type chain: str :param structure_chain: The chain of the structure this sequence is associated with. This is usually the same as `chain` if the sequence has a structure but isn't necessarily. :type structure_chain: str :param long_name: The full name of the sequence. For a FASTA sequence, this should be the full FASTA header. :type long_name: str :param resnums: Residue numbers to assign to `elements`. If not given, residues will be numbered starting from one. Regardless of what's given here, no residue numbering will occur if `elements` is an iterable of `ElementClass` and any element already has a residue number set. If this iterable is shorter than `elements`, additional numbers will be generated by incrementing the last number present. :type resnums: Iterable(int) """ if elements is None: raise ValueError( "Cannot pass value of NoneType to protein sequence") if not self.ALPHABET: cls_name = self.__class__.__name__ raise NotImplementedError( f"{cls_name} cannot be instantiated directly.") super().__init__() self.name = name self._origin = None self.origin = origin self.entry_name = entry_name if entry_id is not None: assert str(entry_id) self.entry_id = entry_id self.long_name = long_name self._chain = chain self.structure_chain = structure_chain self.pdb_id = pdb_id self._visibility = None # The _get_structure and _set_structure values will be overwritten by the # structure model if this sequence is associated with a structure. self._get_structure = None self._set_structure = None self._residue_map = {} self._structure_res_map = None if not resnums: # Make sure that we set residue numbers if the input was strings resnums = (1,) self._sequence = self._makeSeqElements(elements, resnums) self._cached_str = None self._res_positions_cache = None self._gapless_length = None self.residuesChanged.connect(self._clearCachedStr) self.lengthChanged.connect(self._clearCachedStr)
def __str__(self): if self._cached_str is None: self._cached_str = "".join( self._gap_char if res.is_gap else res.type.short_code for res in self._sequence) return self._cached_str
[docs] def __len__(self): return len(self._sequence)
def __iter__(self): return iter(self._sequence) def __repr__(self): """ Returns a representation of the instance that can be pasted into a repl """ class_open = self.__class__.__name__ + "(" arguments = ", ".join(self._getReprArguments()) seq_rep = [class_open, arguments, ")"] return "".join(seq_rep)
[docs] def __contains__(self, item): return item in self._res_positions
def __bool__(self): return bool(self._sequence) def __getitem__(self, index): try: return self._sequence[index] except IndexError: msg = ("index (%d) out of range (sequence length is only %d)" % (index, len(self))) msg += "\n" + str(self) raise IndexError(msg) def __delitem__(self, index): elements = self[index] if isinstance(index, slice) else [self[index]] self.removeElements(elements) def __copy__(self): raise TypeError("Cannot make shallow copy. Must call deepcopy") def __deepcopy__(self, memo): copied_elements = copy.deepcopy(self._sequence, memo) copied_descriptors = copy.deepcopy(self._descriptors, memo) SequenceClass = self.__class__ new_seq = SequenceClass(copied_elements, name=self.name, origin=self.origin, entry_id=self.entry_id, chain=self.chain, structure_chain=self.structure_chain, long_name=self.long_name) new_seq._descriptors = copied_descriptors intra_ss_bonds = ( bond for bond in self.disulfide_bonds if bond.is_intra_sequence) for res1, res2 in intra_ss_bonds: residue.add_disulfide_bond(new_seq[res1.idx_in_seq], new_seq[res2.idx_in_seq]) intra_ss_bonds = (bond for bond in self.pred_disulfide_bonds if bond.is_intra_sequence) for res1, res2 in intra_ss_bonds: residue.add_disulfide_bond(new_seq[res1.idx_in_seq], new_seq[res2.idx_in_seq], known=False) # Note that this method does not copy _get_structure and _set_structure. # Instead we emit sequenceCopied. If this sequence is being monitored # by a structure model, then the structure model is responsible for # setting these attributes properly in new_seq. self.sequenceCopied.emit(self, new_seq) return new_seq @QtCore.pyqtSlot() def _clearCachedStr(self): self._cached_str = None def _getReprArguments(self): """ Generate the arguments to add to the repr string. :return: A list of argument strings :rtype: list(str) """ elements = "[%s]" % ",".join(self._resFormatCode(res) for res in self) name = "name='%s'" % re.escape(self.name) origin = "origin=%s" % self.origin entry_id = "entry_id='%s'" % self.entry_id chain = "chain='%s'" % self.chain return [elements, name, origin, entry_id, chain] def _resFormatCode(self, res): """ Returns a representation of a residue for __repr__ """ if res.is_gap: return "'%s'" % self.gap_char return "'%s'" % res # ========================================================================== # Construction # ==========================================================================
[docs] @classmethod def makeSeqElement(cls, element): """ :param element: A sequence element or string representation thereof :type element: str or cls.ElementClass :return: sequence element :rtype: cls.ElementClass """ # TODO Once MSV-1504 is implemented, type checking should be stricter # to avoid accidental mixing of amino acid and nucleotide residues if isinstance(element, residue.AbstractSequenceElement): return element elif isinstance(element, residue.CombinedChainResidueWrapper): return element.split_res elif element is None or element.strip() in cls._gap_chars: return residue.Gap() element = element.upper().strip() element_type = cls.ALPHABET.get(element) if element_type is None: element_type = cls._unknown_res_type if len(element) == 3: # Make custom unknown residue type for 3-character unknowns element_type = element_type.makeVariant(element) res = cls.ElementClass(element_type) return res
def _makeSeqElement(self, element): """ Helper function to call self.makeSeqElement and set Residue.sequence. If the sequence has a structure, then newly generated elements will have their `seqres_only` attribute default to True. """ res = self.makeSeqElement(element) res.sequence = self if isinstance(element, str) and self.hasStructure(): res.seqres_only = True return res def _makeSeqElements(self, elements, resnums=()): """ :param elements: iterable of sequence elements :type elements: iterable(self.ElementClass or None) or iterable(str or None) :param resnums: Residue numbers to assign to `elements`. These numbers will be ignored if `elements` is an iterable of `ElementClass` and any element already has a residue number set. If this iterable is shorter than `elements`, additional numbers will be generated by incrementing the last number present. :type resnums: Iterable(int) :return: list(self.ElementClass) :rtype: list .. note:: The performance of this method is critical for quickly loading sequences into the MSV. As such, it has been written to optimize performance over code reuse. (In other words, there's a good reason that this method doesn't just call `_makeSeqElement`.) """ if not elements: return [] if isinstance(elements, str): # remove all whitespace and other disallowed characters elements = DISALLOWED_SEQ_CHARS.sub("", elements) elements = elements.upper() return self._makeSeqElementsFromStr(elements, resnums) try: elements = list(elements) except TypeError: # TODO: Remove this. See MSV-2794. elements = [elements] # Figure out the type of the contents of elements. Since Nones are # allowed for gaps, we need to find the first non-None element elem = None for elem in elements: if elem is not None: break if elem is None: # elements is all Nones return [residue.Gap() for _ in elements] elif isinstance(elem, str): # convert elements to uppercase and remove spaces elements = [ None if e is None else e.upper().strip() for e in elements ] return self._makeSeqElementsFromStr(elements, resnums) elif isinstance(elem, residue.AbstractSequenceElement): return self._makeSeqElementsFromElements(elements, resnums) elif isinstance(elem, residue.CombinedChainResidueWrapper): # convert combined-chain elements to split-chain elements elements = [None if e is None else e.split_res for e in elements] return self._makeSeqElementsFromElements(elements, resnums) else: raise TypeError("Unrecognized type for element:", type(elem)) def _makeSeqElementsFromStr(self, elements, resnums=()): """ :param elements: iterable of sequence elements :type elements: iterable(str or None) :param resnums: Residue numbers to assign to `elements`. If this iterable is shorter than `elements`, additional numbers will be generated by incrementing the last number present. :type resnums: Iterable(int) :return: list(self.ElementClass) :rtype: list """ seq_elements = [None] * len(elements) has_struc = self.hasStructure() resnum = 1 if resnums else None for i, (elem, input_resnum) in enumerate( itertools.zip_longest(elements, resnums)): if input_resnum is not None: resnum = input_resnum if elem is None or elem in self._gap_chars: new_seq_elem = residue.Gap() else: elem_type = self.ALPHABET.get(elem, self._unknown_res_type) # Do not use keyword arguments here. Function calls with # only positional arguments are faster. new_seq_elem = self.ElementClass(elem_type, None, resnum, has_struc) if resnums: resnum += 1 new_seq_elem.sequence = self seq_elements[i] = new_seq_elem return seq_elements def _makeSeqElementsFromElements(self, elements, resnums): """ :param elements: iterable of sequence elements :type elements: iterable(self.ElementClass or None) :param resnums: Residue numbers to assign to `elements`. These numbers will be ignored if a residue number is set for any element in `elements. If this iterable is shorter than `elements`, additional numbers will be generated by incrementing the last number present. :type resnums: Iterable(int) :return: list(self.ElementClass) :rtype: list """ if resnums and any(elem.hasSetResNum() for elem in elements if elem is not None and not elem.is_gap): # ignore resnums if any of the input elements have residue numbers # set already resnums = () resnum = 1 if resnums else None for i, (elem, input_resnum) in enumerate( itertools.zip_longest(elements, resnums)): if elem is None: elem = residue.Gap() elements[i] = elem elem.sequence = self if resnums and not elem.is_gap: if input_resnum is not None: resnum = input_resnum elem.resnum = resnum resnum += 1 return elements
[docs] @classmethod def isValid(cls, elements): """ :param elements: An iterable of string representations of elements making up the sequence :type elements: iterable(str) or str :return: Tuple indicating whether valid and a set of invalid characters, if any :rtype: tuple(bool, set(str)) """ raise NotImplementedError
# ========================================================================== # Sequence-Level Properties # ========================================================================== @property def name(self): return self._name @name.setter def name(self, new_name): """ Set the name on the instance and emit a notification :param name: The new name for the sequence :type name: str :raises: ValueError if name is not str """ if not isinstance(new_name, str): raise ValueError(f"name must be str, not {type(new_name)}") # Strip out non-ASCII characters new_name = new_name.encode('ascii', 'ignore').decode('ascii') self._name = new_name self.nameChanged.emit() @property def chain(self): return self._chain @chain.setter def chain(self, name): self._chain = name self.nameChanged.emit() @property def fullname(self): """ :return: a formatted name + optional chain name for the sequence :rtype: str """ if self.chain: return "%s_%s" % (self._name, self.chain) return self._name @property def visibility(self): return self._visibility @visibility.setter def visibility(self, value): if self._visibility != value: self._visibility = value self.visibilityChanged.emit() # ========================================================================== # Residues # ==========================================================================
[docs] def index(self, res, ignore_gaps=False): """ Returns the index of the specified residue :param res: The residue to find :type res: residue.Residue :param ignore_gaps: Whether the index returned should ignore gaps in the sequence or not. :type ignore_gaps: bool :raises: A ValueError if the residue is not present :rtype: int :return: The index of the residue """ if res not in self._res_positions: err_msg = f"Specified residue {res} not found in sequence {self}" raise ValueError(err_msg) idx = 1 if ignore_gaps else 0 return self._res_positions[res][idx]
[docs] def indices(self, residues, ignore_gaps=False): """ Returns the indices of all specified residues. Note that there is no guarantee that the returned integers will be in the same order as the input residues. (For combined-chain sequences, it's highly likely that they *won't* be.) :param res: The residues to find indices of :type res: Iterable(residue.Residue) :param ignore_gaps: Whether the indices returned should ignore gaps in the sequence or not. :type ignore_gaps: bool :return: The indices of the residues :rtype: list[int] """ if self._res_positions_cache is None: self._updateResiduePositionCache() idx = 1 if ignore_gaps else 0 return [self._res_positions_cache[res][idx] for res in residues]
[docs] def insertElements(self, index, elements): # See parent class for method documentation try: elements = list(elements) except TypeError: # TODO: Remove this. See MSV-2794. elements = [elements] assign_numbers = all(isinstance(c, str) for c in elements) elements = self._makeSeqElements(elements) if assign_numbers: next_res = None prev_res = None true_index = min(index, max(0, len(self) - 1)) prev_indices = range(true_index - 1, -1, -1) for position in prev_indices: elem = self[position] if elem.is_res and elem.hasSetResNum(): prev_res = elem break next_indices = range(true_index, len(self)) for position in next_indices: elem = self[position] if elem.is_res and elem.hasSetResNum(): next_res = elem break assign_residue_numbers(elements, start_res=prev_res, end_res=next_res) old_length = len(self._sequence) new_length = old_length + len(elements) self.lengthAboutToChange.emit(old_length, new_length) self._sequence[index:index] = elements self._addNewResiduesToMap(elements) self.lengthChanged.emit(old_length, new_length) self.residuesAdded.emit(set(elements)) self.residuesChanged.emit(index, old_length)
[docs] def mutate(self, start, end, elements): # See parent class for method documentation elements = self._makeSeqElements(elements) old_length = len(self._sequence) length_change = len(elements) - (end - start) if length_change != 0: new_length = old_length + length_change self.lengthAboutToChange.emit(old_length, new_length) removed = self._sequence[start:end] for ele in removed: if ele.is_res: if ele.disulfide_bond is not None: residue.remove_disulfide_bond(ele.disulfide_bond) if ele.pred_disulfide_bond is not None: residue.remove_disulfide_bond(ele.pred_disulfide_bond) ele.sequence = None self._sequence[start:end] = elements self._removeResiduesFromMap(removed) self._addNewResiduesToMap(elements) if length_change != 0: self.lengthChanged.emit(old_length, new_length) last_changed_res_index = min(old_length, new_length) self.residuesChanged.emit(start, last_changed_res_index - 1) else: self.residuesChanged.emit(start, end - 1) self.residuesRemoved.emit(set(removed)) self.residuesAdded.emit(set(elements))
[docs] def append(self, element): # See parent class for method documentation element = self._makeSeqElement(element) self.extend([element])
[docs] def extend(self, elements): # See parent class for method documentation old_length = len(self._sequence) new_length = old_length + len(elements) self.lengthAboutToChange.emit(old_length, new_length) elements = self._makeSeqElements(elements) self._sequence.extend(elements) self.lengthChanged.emit(old_length, new_length) self.residuesChanged.emit(old_length, new_length) self.residuesAdded.emit(set(elements))
[docs] def getSubsequence(self, start, end): # See parent class for method documentation SequenceClass = self.__class__ return SequenceClass(copy.copy(res) for res in self[start:end])
[docs] def removeElements(self, eles): # See parent class for method documentation if len(eles) == 0: return elif any(ele not in self for ele in eles): err_msg = ('Elements must be contained by sequence in order to be ' 'removed') raise ValueError(err_msg) start_idx_removed = min(self.index(ele) for ele in eles) new_length = len(self) - len(eles) old_length = len(self) self.lengthAboutToChange.emit(old_length, new_length) self._removeResiduesFromMap(eles) self._sequence = [ele for ele in self._sequence if ele not in eles] for ele in eles: if ele.is_res: if ele.disulfide_bond is not None: residue.remove_disulfide_bond(ele.disulfide_bond) if ele.pred_disulfide_bond is not None: residue.remove_disulfide_bond(ele.pred_disulfide_bond) ele.sequence = None self.lengthChanged.emit(old_length, new_length) self.residuesChanged.emit(start_idx_removed, new_length - 1) self.residuesRemoved.emit(set(eles))
def _replaceAllElements(self, elements): """ Replace _sequence entirely with the supplied elements :param elements: Elements for the sequence :type elements: iterable(self.ElementClass) or iterable(str) """ new_length = len(elements) old_length = len(self) announce_length_change = new_length != old_length if announce_length_change: self.lengthAboutToChange.emit(old_length, new_length) elements = self._makeSeqElements(elements) self._sequence = elements if announce_length_change: self.lengthChanged.emit(old_length, new_length) self.residuesChanged.emit(0, old_length) def _removeFromSequence(self, keep_func, start=0, end=None): """ Remove any residues not matching keep_func from the sequence :param keep_func: A callable taking a residue and returning a bool indicating whether to keep it in the sequence :type keep_func: callable :param start: The index at which to start filtering :type start: int :param end: The index at which to end filtering :type end: int :raises ValueError: In the event that invalid indices are specified """ old_length = len(self) end = end if end is not None else old_length if end > old_length or start < 0: msg = ("Invalid indices specified (start=%d, end=%d) for a sequence" "of length %d" % (start, end, old_length)) raise ValueError(msg) new_sequence = self._sequence[0:start] for res in self._sequence[start:end]: if keep_func(res): new_sequence.append(res) else: res.sequence = None new_sequence.extend(self._sequence[end:]) self._replaceAllElements(new_sequence) def _updateResiduePositionCache(self): """ Update the cache of residues to their index within the sequence. The residue-index cache is stored as a mapping between residues and two indexes: 1) the index of the residue in the sequence including gaps and 2) the index of the residue in the sequence not including gaps. Gaps store `None` for their gapless indexes. Put more concisely, the residue-index cache maps: residue.AbstractSequenceElement -> tuple(gapped_index, gapless_index) """ self._res_positions_cache = {} gapless_idx = 0 for gapped_idx, res in enumerate(self): if res.is_res: self._res_positions_cache[res] = (gapped_idx, gapless_idx) gapless_idx += 1 else: self._res_positions_cache[res] = (gapped_idx, None) self._gapless_length = gapless_idx @QtCore.pyqtSlot() def _invalidateResiduePositionCache(self): """ Invalidate the cache of residues to their index within the sequence The connected signal emits no arguments and the slot decorator must match the signal, but the method does not need to have the params. Note that we do not actually calculate the indices until they are needed; this happens in _updateResiduePositionCache. """ self._res_positions_cache = None self._gapless_length = None # ========================================================================== # Gaps # ==========================================================================
[docs] def getGaplessLength(self): # See parent class for method documentation if self._gapless_length is None: self._updateResiduePositionCache() return self._gapless_length
[docs] def addGapsByIndices(self, gap_idxs): # See parent class for method documentation if not gap_idxs: return self.validateGapIndices(gap_idxs) gap_idxs = set(gap_idxs) old_length = len(self) new_length = len(self) + len(gap_idxs) self.lengthAboutToChange.emit(old_length, new_length) _sequence = iter(self._sequence) new_sequence = [] for i in range(new_length): next_element = None if i not in gap_idxs: next_element = next(_sequence, None) if next_element is None: next_element = residue.Gap() next_element.sequence = self new_sequence.append(next_element) assert len(new_sequence) == new_length, "Sanity check" self._sequence = new_sequence self.lengthChanged.emit(old_length, new_length) self.residuesChanged.emit(min(gap_idxs), old_length - 1)
# ========================================================================== # Predictions # ==========================================================================
[docs] def setSSA(self, new_ssa): residues = list(self.residues()) if len(new_ssa) != len(residues): raise ValueError('New ssa must be a list of equal length ' 'to the number of residues') for res, ssa in zip(residues, new_ssa): res.secondary_structure = ssa self._invalidateSecondaryStructure() self.secondaryStructureChanged.emit()
[docs] def setSSAPredictions(self, pred): if len(pred) != len(self): raise ValueError('Prediction must be a list of equal length ' 'to the sequence') for res, ssa in zip(self, pred): res.pred_secondary_structure = ssa self._invalidateSecondaryStructure() self._invalidatePredictionFlags() self.predictionsChanged.emit()
[docs] def setDisorderedRegionsPredictions(self, pred): if len(pred) != len(self): raise ValueError('Prediction must be a list of equal length ' 'to the sequence') for res, dis in zip(self, pred): res.pred_disordered = dis self._invalidatePredictionFlags() self.predictionsChanged.emit()
[docs] def setDomainArrangementPredictions(self, pred): if len(pred) != len(self): raise ValueError('Prediction must be a list of equal length ' 'to the sequence') for res, dom in zip(self, pred): res.pred_domain_arr = dom self._invalidatePredictionFlags() self.predictionsChanged.emit()
[docs] def setSolventAccessibilityPredictions(self, pred): if len(pred) != len(self): raise ValueError('Prediction must be a list of equal length ' 'to the sequence') for res, acc in zip(self, pred): res.pred_accessibility = acc self._invalidatePredictionFlags() self.predictionsChanged.emit()
[docs] def deletePrediction(self, prediction_type): if prediction_type is SEQ_ANNO_TYPES.pred_disulfide_bonds: for res in self: bond = res.pred_disulfide_bond if bond is not None: residue.remove_disulfide_bond(bond) self.predictionsChanged.emit() return empty_pred = [None] * len(self) if prediction_type is SEQ_ANNO_TYPES.pred_secondary_structure: self.setSSAPredictions(empty_pred) elif prediction_type is SEQ_ANNO_TYPES.pred_accessibility: self.setSolventAccessibilityPredictions(empty_pred) elif prediction_type is SEQ_ANNO_TYPES.pred_disordered: self.setDisorderedRegionsPredictions(empty_pred) elif prediction_type is SEQ_ANNO_TYPES.pred_domain_arr: self.setDomainArrangementPredictions(empty_pred) else: raise ValueError(f"{prediction_type} is not a prediction")
[docs] def deleteAllPredictions(self): all_predictions = [ SEQ_ANNO_TYPES.pred_disulfide_bonds, SEQ_ANNO_TYPES.pred_secondary_structure, SEQ_ANNO_TYPES.pred_accessibility, SEQ_ANNO_TYPES.pred_disordered, SEQ_ANNO_TYPES.pred_domain_arr ] for prediction in all_predictions: self.deletePrediction(prediction)
# ========================================================================== # Structure # ========================================================================== @property def origin(self): """ :return: A piece of metadata indicating where the sequence came from :rtype origin: `Sequence.ORIGIN` or None """ return self._origin @origin.setter def origin(self, new_origin): """ :param new_origin: A piece of metadata indicating where the sequence came from :type new_origin: `Sequence.ORIGIN` or None """ if new_origin is None or new_origin in self.ORIGIN: self._origin = new_origin else: origins = "".join(self.ORIGIN.names()) msg = "Sequence origin be one of the following values: %s" % origins raise ValueError(msg)
[docs] def hasStructure(self): # See AbstractSequence for documentation return self._get_structure is not None
[docs] def getStructure(self): # See AbstractSequence for documentation if self._get_structure is not None: return self._get_structure() else: return None
[docs] def setStructure(self, struc): # See AbstractSequence for documentation if self._set_structure is None: raise RuntimeError("No structure associated with this sequence.") else: self._set_structure(struc)
[docs] def onStructureChanged(self): self.annotations.onStructureChanged() self.structureChanged.emit() # This map is lazily regenerated when calling `getStructureResForRes` self._structure_res_map = None
[docs] def setResidueMap(self, residue_map): """ Set a new mapping between ResidueKey and structured residues. Note: the only intended user of this method is `schrodinger.application.msv.seqio.StructureConverter`, where the ResidueKey is computed from the `structure._Residue` used to create the `residue.Residue`. If the sequence has a structure, the map can be generated using `generateResidueMap`. :param residue_map: Mapping between residue key and Residue :type residue_map: dict(residue.ResidueKey, residue.Residue) """ self._residue_map = residue_map
[docs] def generateResidueMap(self): """ Create residue map based on current structured residues. Note: this method requires `self.hasStructure()` to be True and `self.entry_id` to be set. If this sequence was produced by `schrodinger.application.msv.seqio.StructureConverter`, there should already be a residue map and this method does not need to be called. :raise RuntimeError: If sequence has no structure or entry id """ if not self.hasStructure() or self.entry_id is None: raise RuntimeError("No structure associated with this sequence.") residue_map = {res.getKey(): res for res in self if res.hasStructure()} self.setResidueMap(residue_map)
def _addNewResiduesToMap(self, elements): """ Private method to add new residues to `self._residue_map`. Residues must already belong to the sequence and the sequence must have `entry_id` and structure set. :param elements: Sequence elements being added (gaps will be ignored) :type elements: list[residue.Residue] """ for ele in elements: if ele.hasStructure(): res_key = ele.getKey() self._residue_map[res_key] = ele def _removeResiduesFromMap(self, elements): """ Private method to remove residues from `self._residue_map`. Residues must still belong to the sequence and the sequence must have `entry_id` and structure set. :param elements: Sequence elements being removed (gaps will be ignored) :type elements: list[residue.Residue] """ for ele in elements: if ele.hasStructure(): res_key = ele.getKey() self._residue_map.pop(res_key, None) def _getStructureResMap(self): """ :return: Mapping between residue key and structure residue :rtype: dict(residue.ResidueKey, schrodinger.structure._Residue) :raise RuntimeError: If sequence has no structure """ if not self.hasStructure(): raise RuntimeError("No structure associated with this sequence.") if self._structure_res_map is None: structure_res_map = {} get_sres_key = partial(residue.get_structure_residue_key, entry_id=self.entry_id) st = self.getStructure() try: chain = st.chain[self.structure_chain] except KeyError: pass else: for sres in chain.residue: sres_key = get_sres_key(sres) structure_res_map[sres_key] = sres self._structure_res_map = structure_res_map return self._structure_res_map def _getStructureResByKey(self, res_key): """ :param res_key: Residue key: (entry_id, chain, resnum, inscode) :type res_key: residue.ResidueKey :return: Structure residue or None if no matching residue is found :rtype: schrodinger.structure._Residue or NoneType :raise RuntimeError: If sequence has no structure """ sres_map = self._getStructureResMap() return sres_map.get(res_key)
[docs] def getResByKey(self, res_key): """ :param res_key: Residue key: (entry_id, chain, resnum, inscode) :type res_key: residue.ResidueKey :return: Residue matching key or None if no matching residue is found :rtype: residue.Residue or NoneType :raise RuntimeError: If sequence has no structure """ if not self.hasStructure(): raise RuntimeError("No structure associated with this sequence.") return self._residue_map.get(res_key)
[docs] def getStructureResForRes(self, res): # See parent class for method documentation if not self.hasStructure(): return None res_key = res.getKey() sres = self._getStructureResByKey(res_key) return sres
[docs]class ProteinSequenceMeta(type(QtCore.QObject)): """ Metaclass for split-chain and combined-chain protein sequences """ @property def ALPHABET(self): # This metaclass property is needed because 1. ALPHABET must be # available from both the class and the instance 2. this call might # need to read the nonstandard residues database from disk so we don't # want to load it at import time. return residue.get_protein_alphabet()
[docs]class AbstractProteinSequenceMixin: """ A mixin for code shared between split-chain and combined-chain protein sequences. """ disulfideBondsCacheCleared = QtCore.pyqtSignal() predictionsChanged = QtCore.pyqtSignal() secondaryStructureChanged = QtCore.pyqtSignal() kinaseFeaturesChanged = QtCore.pyqtSignal() kinaseConservationChanged = QtCore.pyqtSignal() @property def ALPHABET(self): # This instance property is needed because metaclasses are excluded # from attribute lookup return type(self).ALPHABET # ========================================================================== # Magic Methods # ==========================================================================
[docs] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._secondary_structures = None self._pred_secondary_structures = None self._disulfide_bonds = None self._pred_disulfide_bonds = None self.pfam_name = None self._invalidatePredictionFlags()
# ========================================================================== # Annotations # ========================================================================== @property def disulfide_bonds(self): """ :return: A sorted tuple of the valid disulfide bonds. :rtype: tuple(residue.DisulfideBond) """ raise NotImplementedError @property def secondary_structures(self): """ A list of _SecondaryStructure namedtuples containing the type of secondary structure and where the secondary structures begin and end. :return: A list of namedtuples containing an SS_TYPE from schrodinger.structure and the residue indexes marking the limits of the secondary structure. :rtype: list(_SecondaryStructure) """ if self._secondary_structures is None: self._updateSecondaryStructure() return self._secondary_structures def _updateSecondaryStructure(self): """ Calculates and caches tuples describing the sequence secondary structures. The tuples contain the SSA type and their limits (ie where the structure begins and ends) """ raise NotImplementedError
[docs] def clearDisulfideBondsCache(self): self._disulfide_bonds = None self._pred_disulfide_bonds = None self.disulfideBondsCacheCleared.emit()
[docs] def isKinaseChain(self) -> bool: raise NotImplementedError
# ========================================================================== # Predictions # ========================================================================== @QtCore.pyqtSlot() def _invalidatePredictionFlags(self): self._has_ssa_prediction = None self._has_dis_predictions = None self._has_dom_prediction = None self._has_acc_predictions = None
[docs] def hasDisorderedRegionsPredictions(self): if self._has_dis_predictions is None: self._has_dis_predictions = any( r.pred_disordered is not None for r in self.residues()) return self._has_dis_predictions
[docs] def hasDisulfideBondPredictions(self): return bool(self.pred_disulfide_bonds)
[docs] def hasDomainArrangementPredictions(self): if self._has_dom_prediction is None: self._has_dom_prediction = any( r.pred_domain_arr is not None for r in self.residues()) return self._has_dom_prediction
[docs] def hasSolventAccessibility(self): if self._has_acc_predictions is None: self._has_acc_predictions = any( r.pred_accessibility is not None for r in self.residues()) return self._has_acc_predictions
[docs] def hasSSAPredictions(self): if self._has_ssa_prediction is None: secondary_strucs = self.pred_secondary_structures if (len(secondary_strucs) == 1 and secondary_strucs[0].ssa_type is None): self._has_ssa_prediction = False else: self._has_ssa_prediction = True return self._has_ssa_prediction
@property def pred_secondary_structures(self): if self._pred_secondary_structures is None: self._updatePredSecondaryStructure() return self._pred_secondary_structures
[docs]class ProteinSequence(json.JsonableClassMixin, AbstractProteinSequenceMixin, AbstractSingleChainSequence, metaclass=ProteinSequenceMeta): """ A single-chain protein sequence. :cvar secondaryStructureCacheCleared: A signal emitted when the secondary structure cache has been cleared. Used to keep the `CombinedChainProteinSequence` cache in sync. If listening for changes in the secondary structure values, use `secondaryStructureChanged` instead. """ AnnotationClass = annotation.ProteinSequenceAnnotations ElementClass = residue.Residue _gap_chars = constants.GAP_CHARS secondaryStructureCacheCleared = QtCore.pyqtSignal() # ========================================================================== # Magic Methods # ==========================================================================
[docs] def __init__(self, elements="", name="", origin=None, entry_id=None, entry_name="", pdb_id="", chain="", structure_chain=None, long_name="", resnums=(), disulfide_bonds=None, pred_disulfide_bonds=None): """ See `AbstractSingleChainSequence` for additional documentation. :param disulfide_bonds: A list of pairs of residue indices to link via disulfide bonds. :type disulfide_bonds: Iterable(tuple(int, int)) :param pred_disulfide_bonds: A list of pairs of residue indices to link via predicted disulfide bonds. :type pred_disulfide_bonds: Iterable(tuple(int, int)) """ super().__init__(elements, name=name, origin=origin, entry_id=entry_id, entry_name=entry_name, pdb_id=pdb_id, chain=chain, structure_chain=structure_chain, long_name=long_name, resnums=resnums) self._is_kinase_cons_annotated = False self._is_kinase_annotated = False self.residuesChanged.connect(self.kinaseFeaturesChanged) self.residuesChanged.connect(self._invalidateResiduePositionCache) self.lengthChanged.connect(self._invalidateResiduePositionCache) # Set up protein secondary structure slots self.residuesChanged.connect(self._invalidateSecondaryStructure) self.lengthChanged.connect(self._invalidateSecondaryStructure) self.residuesChanged.connect(self._invalidatePredictionFlags) self.lengthChanged.connect(self._invalidatePredictionFlags) self._pattern_seq_dict = None self.residuesChanged.connect(self._invalidatePatternSeqDict) if disulfide_bonds is not None: for i1, i2 in disulfide_bonds: residue.add_disulfide_bond(self[i1], self[i2]) if pred_disulfide_bonds is not None: for i1, i2 in pred_disulfide_bonds: residue.add_disulfide_bond(self[i1], self[i2], known=False)
def _getReprArguments(self): # See AbstractSingleChainSequence for method documentation arguments = super()._getReprArguments() if self.disulfide_bonds: arg = ("disulfide_bonds=" + self._bondIndicesForRepr(self.disulfide_bonds)) arguments.append(arg) if self.pred_disulfide_bonds: arg = ("pred_disulfide_bonds=" + self._bondIndicesForRepr(self.pred_disulfide_bonds)) arguments.append(arg) return arguments def _bondIndicesForRepr(self, disulfide_bonds): """ Generate a string describing the residue indices involved in the given disulfide bonds. :param disulfide_bonds: The bond to describe. :type disulfide_bonds: list(residue.DisulfideBond) :return: A string describing the disulfide bonds. :rtype: str """ intra_seq_bonds = [ bond for bond in disulfide_bonds if bond.isValid() and bond.is_intra_sequence ] indices = [(res1.idx_in_seq, res2.idx_in_seq) for (res1, res2) in intra_seq_bonds] indices.sort() return "[%s]" % ", ".join(f"({i1}, {i2})" for (i1, i2) in indices) def _resFormatCode(self, res): if res.is_gap: return super()._resFormatCode(res) return "'%s'" % res.long_code # ========================================================================== # Serialization # ==========================================================================
[docs] def toJsonImplementation(self): intraseq_bonds = ( bond for bond in self.disulfide_bonds if bond.is_intra_sequence) disulfide_bonds = [ [res1.idx_in_seq, res2.idx_in_seq] for res1, res2 in intraseq_bonds ] intraseq_pred_bonds = (bond for bond in self.pred_disulfide_bonds if bond.is_intra_sequence) pred_disulfide_bonds = [[res1.idx_in_seq, res2.idx_in_seq] for res1, res2 in intraseq_pred_bonds] return { 'residues': self._sequence, 'name': self.name, 'chain': self.chain, 'structure_chain': self.structure_chain, 'disulfide_bonds': disulfide_bonds, 'pred_disulfide_bonds': pred_disulfide_bonds, 'long_name': self.long_name, 'entry_id': self.entry_id, 'descriptors': { source.name: descriptor for source, descriptor in self.descriptors.items() }, }
[docs] @classmethod def fromJsonImplementation(cls, json_obj): residues = [] residues_str = [] is_kinase_cons_annotated = False is_kinase_annotated = False for ele in json_obj['residues']: try: elem = residue.Gap.fromJson(ele) except ValueError: elem = residue.Residue.fromJson(ele) if elem._kinase_conservation: is_kinase_cons_annotated = True if elem.kinase_features is not None: is_kinase_annotated = True residues.append(elem) residues_str.append(str(elem)) SeqType = guess_seq_type(residues_str) rehydrated_seq = SeqType(residues, entry_id=json_obj['entry_id'], structure_chain=json_obj['structure_chain']) rehydrated_seq._is_kinase_cons_annotated = is_kinase_cons_annotated rehydrated_seq._is_kinase_annotated = is_kinase_annotated for res1_idx, res2_idx in json_obj['disulfide_bonds']: residue.add_disulfide_bond(rehydrated_seq[res1_idx], rehydrated_seq[res2_idx]) for res1_idx, res2_idx in json_obj['pred_disulfide_bonds']: residue.add_disulfide_bond(rehydrated_seq[res1_idx], rehydrated_seq[res2_idx], known=False) rehydrated_seq.name = json_obj['name'] rehydrated_seq.long_name = json_obj['long_name'] rehydrated_seq.chain = json_obj['chain'] descriptors_from_json = json_obj.get('descriptors') if descriptors_from_json is not None: for source, descriptors in descriptors_from_json.items(): src = properties.PropertySource[source] rehydrated_seq.updateDescriptors(descriptors, src) return rehydrated_seq
[docs] @json.adapter(version=47007) def adapter47007(cls, json_dict): plain_seq = cls() json_dict['entry_id'] = plain_seq.entry_id json_dict['structure_chain'] = plain_seq.structure_chain return json_dict
[docs] @json.adapter(version=48003) def adapter48002(cls, json_dict): json_dict['long_name'] = json_dict.pop('title') return json_dict
[docs] @json.adapter(version=52065) def adapter52065(cls, json_dict): json_dict['kinase_features'] = None return json_dict
# ========================================================================== # Construction # ==========================================================================
[docs] @classmethod def isValid(cls, elements): """ :param elements: An iterable of string representations of elements making up the sequence :type elements: iterable(str) or str :return: Tuple indicating whether valid and a set of invalid characters, if any :rtype: tuple(bool, set(str)) """ valid_chars = set(cls.ALPHABET) | set(cls._gap_chars) invalid_chars = set(elements) - valid_chars return not invalid_chars, invalid_chars
# ========================================================================== # Annotations # ==========================================================================
[docs] def renumberResidues(self, new_rescode_map): """ Renumber residues in this sequence given a dictionary mapping old rescodes to the new desired rescodes. :type new_rescode_map: dict[tuple(int, str), tuple(int, str)] """ min_res_idx = None max_res_idx = None for idx, res in enumerate(self): if res.is_res and res.getChainKey() in new_rescode_map: res.resnum, res.inscode = new_rescode_map[res.getChainKey()] if min_res_idx is None: min_res_idx = idx max_res_idx = idx if min_res_idx is not None: self.residuesChanged.emit(min_res_idx, max_res_idx)
[docs] def isKinaseChain(self): return any( res.is_res and (res.kinase_features or res.kinase_conservation) for res in self)
@property def is_kinase_annotated(self): return self._is_kinase_annotated
[docs] def setKinaseFeatures(self, feature_map: Dict[residue.Residue, annotation.KinaseFeatureLabel]): """ :param feature_map: Map of residue to kinase feature label """ for res, feature in feature_map.items(): if res not in self: raise ValueError( 'All residues in feature_map must belong to this sequence') res.kinase_features = feature self.kinaseFeaturesChanged.emit() self._is_kinase_annotated = True
@property def is_kinase_cons_annotated(self): return self._is_kinase_cons_annotated
[docs] def setKinaseConservation(self, cons_map, lig_asl): """ :param cons_map: Map of residue to conservation :type cons_map: dict[residue.Residue, annotation.KinaseConservation] :param lig_asl: ASL of the associated ligand :type lig_asl: str """ try: lig_index = self.annotations.ligand_asls.index(lig_asl) except ValueError: return for res, cons in cons_map.items(): res.kinase_conservation[lig_index] = cons self.kinaseConservationChanged.emit() self._is_kinase_cons_annotated = True
@property def disulfide_bonds(self): # See AbstractProteinSequenceMixin for documentation if self._disulfide_bonds is None: bonds = set(res.disulfide_bond for res in self if res.is_res and res.disulfide_bond and res.disulfide_bond.isValid()) key = lambda bond: min( self.index(res) for res in bond if res in self) self._disulfide_bonds = tuple(sorted(bonds, key=key)) return self._disulfide_bonds @QtCore.pyqtSlot() def _invalidateSecondaryStructure(self): """ Invalidates the stored indices of start and end positions of secondary structure assignment blocks in the sequence. Note that we do not actually calculate the indices until they are needed; this happens in _updateSecondaryStructure. """ self._secondary_structures = None self._pred_secondary_structures = None self.secondaryStructureCacheCleared.emit() def _updateSecondaryStructure(self): # See AbstractProteinSequenceMixin for documentation self._secondary_structures = [] if len(self) == 0: return ssa = self._calculateSecondaryStructuresList( ssa_attr='secondary_structure') self._secondary_structures = ssa def _calculateSecondaryStructuresList(self, ssa_attr): """ Calculate and return a list of `_SecondaryStructure` tuples describing the types and ranges of secondary structures in the sequence. :param ssa_attr: The attribute name to access the secondary structure on a residue. :type ssa_attr: str :rtype : list[_SecondaryStructure] """ start_idxs = set() end_idxs = set() for prev_res, curr_res, next_res in self.iterNeighbors(): # Note: iterNeighbors uses `None` to indicate that neighbor does # not exist curr_ssa = getattr(curr_res, ssa_attr) if (prev_res is None or getattr(prev_res, ssa_attr) != curr_ssa): start_idxs.add(self.index(curr_res)) if (next_res is None or getattr(next_res, ssa_attr) != curr_ssa): end_idxs.add(self.index(curr_res)) start_idxs = sorted(start_idxs) end_idxs = sorted(end_idxs) all_ss = [] for start_idx, end_idx in zip(start_idxs, end_idxs): ss = _SecondaryStructure((start_idx, end_idx), getattr(self[start_idx], ssa_attr)) all_ss.append(ss) return all_ss # ========================================================================== # Predictions # ========================================================================== @property def pred_disulfide_bonds(self): # See AbstractProteinSequenceMixin for documentation if self._pred_disulfide_bonds is None: bonds = list( set(res.pred_disulfide_bond for res in self if res.is_res and res.pred_disulfide_bond and res.pred_disulfide_bond.isValid())) key = lambda bond: min( self.index(res) for res in bond if res in self) self._pred_disulfide_bonds = tuple(sorted(bonds, key=key)) return self._pred_disulfide_bonds def _updatePredSecondaryStructure(self): # See AbstractProteinSequenceMixin for documentation self._pred_secondary_structures = [] if len(self) == 0: return ssa = self._calculateSecondaryStructuresList( ssa_attr='pred_secondary_structure') self._pred_secondary_structures = ssa # ========================================================================== # Structure # ==========================================================================
[docs] def removeStructurelessResidues(self, start=0, end=None): """ Remove any structureless residues :param start: The index at which to start filtering structureless residues. :type start: int :param end: The index at which to end filtering :type end: int """ def keep_func(res): return (res.hasStructure() or res.is_gap or not res.sequence.hasStructure()) self._removeFromSequence(keep_func, start, end)
# ========================================================================== # Patterns # ==========================================================================
[docs] def encodeForPatternSearch(self, with_ss=False, with_flex=False, with_asa=False): """ Convert to sequence dict expected by `find_generalized_pattern`. :param with_ss: Whether to include secondary structure information. :type with_ss: bool :param with_flex: Whether to include flexibility information. :type with_flex: bool :param with_asa: Whether to include accessible surface area information. :type with_asa: bool :rtype: dict :return: dictionary of sequence data """ # TODO make cache aware of structure flags if self._pattern_seq_dict is not None: return self._pattern_seq_dict _aa_list = [] if with_ss: _ss_list = [] if with_asa: std_sasa_dict = calculate_sasa_dict() _asa_list = [] seq_dict = { "res_ids": [], } def is_positive(number): return number is not None and number > 0 if with_flex: _flex_list = [] pos_b_factors = [ res.b_factor for res in self if res.is_res and is_positive(res.b_factor) ] avg_b_factor = sum(pos_b_factors) / len(pos_b_factors) \ if pos_b_factors else None for i, res in enumerate(self): if res.is_gap: continue # amino acid sequence short_code = (res.short_code if res.short_code in AMINO_ACIDS_3_TO_1.values() else "X") _aa_list.append(short_code) # residue numbering seq_dict["res_ids"].append(res.rescode.strip()) # secondary structure if with_ss: ss_code = ' ' if res.secondary_structure is structure.SS_HELIX: ss_code = 'h' elif res.secondary_structure is structure.SS_STRAND: ss_code = 'e' _ss_list.append(ss_code) # flexibility if with_flex: flex_code = ' ' # Consider flexible if b-factor is above average if (is_positive(res.b_factor) and is_positive(avg_b_factor) and res.b_factor > avg_b_factor): flex_code = 'f' _flex_list.append(flex_code) # solvent_accessibility if with_asa: asa_code = ' ' # Consider exposed if ASA is greater than standard ASA std_sasa = std_sasa_dict.get(res.long_code) res_sasa = self.annotations.sasa[i] if (std_sasa is not None and res_sasa is not None and res_sasa > std_sasa): asa_code = 'e' _asa_list.append(asa_code) seq_dict["amino_acids"] = "".join(_aa_list) empty_seq_str = " " * len(seq_dict["res_ids"]) seq_dict["secondary_structure"] = ("".join(_ss_list) if with_ss else empty_seq_str) seq_dict["flexibility"] = ("".join(_flex_list) if with_flex else empty_seq_str) seq_dict["solvent_accessibility"] = ("".join(_asa_list) if with_asa else empty_seq_str) self._pattern_seq_dict = seq_dict return self._pattern_seq_dict
@QtCore.pyqtSlot() def _invalidatePatternSeqDict(self): """ Invalidates the stored sequence dictionary for pattern search. """ self._pattern_seq_dict = None # ========================================================================== # Caching # ==========================================================================
[docs] def clearAllCaching(self): self.annotations.clearAllCaching() self.clearDisulfideBondsCache() self._invalidateSecondaryStructure() self._invalidateResiduePositionCache() self._invalidatePatternSeqDict() self._invalidatePredictionFlags()
[docs] def setGPCRAnnotations(self, gpcr_annotation_map): for res, gpcr_anno in gpcr_annotation_map.items(): res.gpcr_segment = gpcr_anno['gpcr_segment'] res.gpcr_generic_number = gpcr_anno['gpcr_generic_number']
[docs]class NucleicSequenceMeta(ProteinSequenceMeta): ALPHABET = None
[docs]class NucleicAcidSequence(ProteinSequence, metaclass=NucleicSequenceMeta): # We only have "limited support" of nucleic acids, so this class is just a # stub. # # TODO (MSV-1504): Create proper nucleic acid domain objects AnnotationClass = annotation.NucleicAcidSequenceAnnotations ElementClass = residue.Nucleotide _unknown_res_type = residue.UNKNOWN_NA ALPHABET = None # Cannot directly instantiate a NucleicAcidSequence @property def is_kinase_cons_annotated(self): return True @property def is_kinase_annotated(self): return True
[docs] def isKinaseChain(self): return False
[docs] def getTranslation(self): """ Get a translated sequence. This method uses BioPython's translate method to convert a nucleic acid sequence into an amino acid sequence :return: A translated protein sequence. The name and chain from the nucleic acid sequence are copied over :rtype: ProteinSequence """ seq_str = str(self).replace(self.gap_char, '') extra = len(seq_str) % 3 if extra: # Pad to a multiple of 3 seq_str += "N" * (3 - extra) bp_amino_str = translate(seq_str, stop_symbol=self.gap_char) protein_seq = ProteinSequence(bp_amino_str, name=self.name, chain=self.chain) return protein_seq
[docs]class DNASequence(NucleicAcidSequence): ALPHABET = types.MappingProxyType(residue.DNA_ALPHABET)
[docs]class RNASequence(NucleicAcidSequence): ALPHABET = types.MappingProxyType(residue.RNA_ALPHABET)
[docs]class CombinedChainSequenceMeta(msv_utils.DocstringWrapperMetaClass, ProteinSequenceMeta): """ The metaclass for `CombinedChainProteinSequence`. This metaclass wraps the specified class attributes. """ def __new__(metacls, cls, bases, classdict, *, wraps=None, wrapped_constants=(), wrapped_properties=(), wrapped_getters=(), wrapped_setters=()): """ See Python documentation for positional argument documentation. :param wraps: The `AbstractSingleChainSequence` subclass to wrap. If None, no wrapping will be done. (This allows for subclassing of the wrapper class.) :type wraps: type or None :param wrapped_constants: Class constants to be wrapped. :type wrapped_constants: Iterable :param wrapped_properties: Properties to be wrapped. :type wrapped_properties: Iterable :param wrapped_getters: Getters to be wrapped. :type wrapped_getters: Iterable :param wrapped_setters: Setters to be wrapped. :type wrapped_setters: Iterable """ if wraps is not None: for name in wrapped_constants: classdict[name] = getattr(wraps, name) for name in wrapped_properties: classdict[name] = metacls._wrapProperty(name) for name in wrapped_getters: classdict[name] = metacls._wrapGetterMethod(name) for name in wrapped_setters: classdict[name] = metacls._wrapSetterMethod(name) return super().__new__(metacls, cls, bases, classdict, wraps=wraps) @staticmethod def _wrapProperty(name): """ Wrap the specified property. :param name: The name of the property to wrap. :type name: str :return: The wrapped property :rtype: property """ def getter(self): return getattr(self._seqs[0], name) def setter(self, value): for chain in self._seqs: setattr(chain, name, value) def deleter(self): for chain in self._seqs: delattr(chain, name) return property(getter, setter, deleter) @staticmethod def _wrapGetterMethod(name): """ Wrap the specified getter method. The wrapper method will retrieved values from the first wrapped sequence, so this should only be used for methods where the return value will be identical across all wrapped sequences. :param name: The name of the method to wrap. :type name: str :return: The wrapped method :rtype: function """ def getter(self, *args, **kwargs): return getattr(self._seqs[0], name)(*args, **kwargs) return getter @staticmethod def _wrapSetterMethod(name): """ Wrap the specified setter method. The wrapper method will set values on all wrapped sequences. :param name: The name of the method to wrap. :type name: str :return: The wrapped method :rtype: function """ def setter(self, *args, **kwargs): for chain in self._seqs: getattr(chain, name)(*args, **kwargs) return setter
[docs]@dataclasses.dataclass class GapRegion: """ Container for information about gaps to add to or remove from the start and end of a chain """ from_start: int from_end: int
GapRegionsForChains = typing.List[GapRegion] GapRegionsForSeqs = typing.List[GapRegionsForChains]
[docs]class CombinedChainProteinSequence( AbstractProteinSequenceMixin, AbstractSequence, metaclass=CombinedChainSequenceMeta, wraps=ProteinSequence, wrapped_constants=("ElementClass", "_gap_chars"), wrapped_properties=("origin", "entry_name", "entry_id", "long_name", "pdb_id", "visibility", "name"), wrapped_getters=("getStructure", "hasStructure"), wrapped_setters=("setStructure",)): """ A sequence that contains multiple chains from the same protein. Instances of this class do not directly contain any residues themselves and instead wrap one or several `ProteinSequence` objects. :note: `CombinedChainProteinSequence.visibility` properly reports entry inclusion state, but it may not correctly report entry visibility (e.g. partially visible vs. fully visible). The MSV structure icons only report inclusion state and the visibility of included entries isn't reported anywhere in the panel, though, so this limitation doesn't have any impact on functionality. """ AnnotationClass = annotation.CombinedChainProteinSequenceAnnotations # ========================================================================== # Magic Methods # ==========================================================================
[docs] def __init__(self, seqs): """ :param seqs: A list of the split-chain sequences to wrap. :type seqs: list(ProteinSequence) """ if not seqs: raise ValueError("ExtendedProteinSequence requires at least one " "sequence.") self._seqs = list(seqs) self._updateSeqOffsets() self._updateSeqCaches() super().__init__() for seq in seqs: self._connectSignals(seq)
def __deepcopy__(self, memo): copied_chains = copy.deepcopy(self._seqs, memo) return self.__class__(copied_chains) def __getitem__(self, index): if isinstance(index, slice): if index.step not in (None, 1): raise ValueError(f"{self.__class__.__name__} does not support " "slices with steps.") elif index.start == index.stop: return [] if index.start is None: start_seq_num = 0 start_index = 0 else: start_seq_num, start_index = self._indexToSeqIndexAndIndex( index.start) if index.stop is None: end_seq_num = len(self._seqs) - 1 end_index = len(self._seqs[-1]) - 1 else: end_seq_num, end_index = self._indexToSeqIndexAndIndex( index.stop - 1) if start_seq_num == end_seq_num: residues = self._seqs[start_seq_num][start_index:end_index + 1] else: residues = self._seqs[start_seq_num][start_index:] for seq_num in range(start_seq_num + 1, end_seq_num): residues.extend(self._seqs[seq_num][:]) residues.extend(self._seqs[end_seq_num][:end_index + 1]) return [ residue.CombinedChainResidueWrapper(res, self) for res in residues ] else: seq, index_in_seq, _ = self.indexToSeqAndIndex(index) res = seq[index_in_seq] return residue.CombinedChainResidueWrapper(res, self) def __iter__(self): for res in itertools.chain.from_iterable(self._seqs): yield residue.CombinedChainResidueWrapper(res, self)
[docs] def __len__(self): return self._seq_offsets[-1] + len(self._seqs[-1])
def __str__(self): return "".join(str(seq) for seq in self._seqs) def __repr__(self): chain_reprs = ", ".join(map(repr, self._seqs)) return f"{self.__class__.__name__}([{chain_reprs}])" def __hash__(self): return hash(self.chains) def __eq__(self, other): if isinstance(other, CombinedChainProteinSequence): return self.chains == other.chains else: return False def _getSignalsAndSlots(self, seq): """ Return signals and slots for signals emitted by the given split-chain sequences. :param seq: The split-chain sequence :type seq: ProteinSequence :return: A list of tuples of (signal, slot) :rtype: list(tuple(QtCore.pyqtBoundSignal, method) """ return [ (seq.residuesChanged, self._onResiduesChanged), (seq.lengthAboutToChange, self._onLengthAboutToChange), (seq.lengthChanged, self._onLengthChanged), (seq.nameChanged, self.nameChanged), (seq.visibilityChanged, self.visibilityChanged), (seq.structureChanged, self.structureChanged), (seq.annotationTitleChanged, self.annotationTitleChanged), (seq.disulfideBondsCacheCleared, self.clearDisulfideBondsCache), (seq.secondaryStructureCacheCleared, self._clearSecondaryStructureCache) ] # yapf: disable def _connectSignals(self, seq): for signal, slot in self._getSignalsAndSlots(seq): signal.connect(slot) def _disconnectSignals(self, seq): for signal, slot in self._getSignalsAndSlots(seq): signal.disconnect(slot) # ========================================================================== # Sequence-Level Properties # ========================================================================== @property def fullname(self): return f"{self.name}_All" # ========================================================================== # Residues # ==========================================================================
[docs] def index(self, res): chain = res.split_sequence try: chain_index = self._chain_indices[chain] except KeyError: err_msg = f"Specified residue {res} not found in sequence {self}" raise ValueError(err_msg) offset = self._seq_offsets[chain_index] return offset + chain.index(res.split_res)
[docs] def indices(self, residues): """ Returns the indices of all specified residues. Note that the returned integers will likely *not* be in the same order as the input residues. :param res: The residues to find indices of :type res: Iterable(residue.CombinedChainResidueWrapper) :return: The indices of the residues :rtype: list[int] """ by_chain = {} for cur_res in residues: split_res = cur_res.split_res by_chain.setdefault(split_res.sequence, []).append(split_res) indices = [] for chain, cur_residues in by_chain.items(): offset = self.offsetForChain(chain) cur_indices = chain.indices(cur_residues) indices.extend(i + offset for i in cur_indices) return indices
[docs] def insertElements(self, index, elements): # See parent class for method documentation chain, chain_index, _ = self.indexToSeqAndIndex(index) chain.insertElements(chain_index, elements)
[docs] def mutate(self, start, end, elements): """ Mutate sequence elements. See parent class for additional method documentation. :raise MultipleChainsError: If the specified residue range spans multiple chains. """ chain, start_chain_index, end_chain_index = \ self._getMutationChainIndices(start, end) chain.mutate(start_chain_index, end_chain_index, elements)
[docs] def assertCanMutateResidues(self, start, end): """ Make sure that we can mutate the specified residues. If not, raise an exception. :param start: The index at which to start mutating :type start: int :param end: The index of the last mutated element (exclusive) :type end: int :raise MultipleChainsError: If the specified residue range spans multiple chains. """ self._getMutationChainIndices(start, end)
[docs] def append(self, element): # See parent class for method documentation self._seqs[-1].append(element)
[docs] def extend(self, elements): # See parent class for method documentation self._seqs[-1].extend(elements)
[docs] def getSubsequence(self, start, end): """ Return a sequence containing a subset of the elements in this one. Note that the new sequence will be a split-chain sequence and will ignore any chain breaks present in the requested subset of elements. :param start: The index at which the subsequence should start :type start: int :param end: The index at which the subsequence should end (exclusive) :type end: int :return: The requested subsequence :rtype: ProteinSequence """ return ProteinSequence( copy.copy(res.split_res) for res in self[start:end])
[docs] def removeElements(self, eles): # See parent class for method documentation by_chain = {} for cur_ele in eles: by_chain.setdefault(cur_ele.split_sequence, []).append(cur_ele.split_res) if not set(by_chain.keys()).issubset(self._seqs): raise ValueError("Elements must be contained by sequence in order " "to be removed") for chain, cur_eles in by_chain.items(): chain.removeElements(cur_eles)
# note that the sequence offsets will be updated in response to the # lengthChanged signal, so we don't need to explicitly call # _updateSeqOffsets here def _onResiduesChanged(self, start, end): chain = self.sender() chain_i = self._seqs.index(chain) offset = self._seq_offsets[chain_i] self.residuesChanged.emit(start + offset, end + offset) def _onLengthAboutToChange(self, old_chain_len, new_chain_len): old_combined_len = len(self) new_combined_len = old_combined_len + new_chain_len - old_chain_len self.lengthAboutToChange.emit(old_combined_len, new_combined_len) def _onLengthChanged(self, old_chain_len, new_chain_len): self._updateSeqOffsets() new_combined_len = len(self) old_combined_len = new_combined_len - new_chain_len + old_chain_len self.lengthChanged.emit(old_combined_len, new_combined_len) def _removeFromSequence(self, keep_func, start=0, end=None): # See parent class for method documentation if end is None: end = len(self) for chain, offset in zip(self._seqs, self._seq_offsets): if end < offset or start >= offset + len(chain): continue if start <= offset: cur_start = 0 else: cur_start = start - offset if end >= offset + len(chain): cur_end = None else: cur_end = end - offset chain._removeFromSequence(keep_func, cur_start, cur_end)
[docs] def getStructureResForRes(self, res): # See parent class for method documentation chain = res.split_sequence if chain not in self._seqs: raise ValueError("Residue not part of this sequence") if not chain.hasStructure(): return None return chain.getStructureResForRes(res.split_res)
# ========================================================================== # Gaps # ==========================================================================
[docs] def getGaplessLength(self): # See parent class for method documentation return sum(chain.getGaplessLength() for chain in self._seqs)
[docs] def addGapsByIndices(self, gap_idxs): # See parent class for method documentation # Adding a gap is ambiguous at chain boundaries. We always add to # the beginning of the later sequence. chain_i = 0 gaps_by_chain = [[] for _ in range(len(self._seqs))] num_chains = len(self._seqs) for gap_i, cur_index in enumerate(sorted(gap_idxs)): while chain_i < num_chains: adjusted = cur_index - gap_i - self._seq_offsets[chain_i] # figure out the index of last allowable gap for this chain last_allowable_gap = len(self._seqs[chain_i]) if chain_i < num_chains - 1: last_allowable_gap -= 1 if adjusted > last_allowable_gap: chain_i += 1 continue else: gaps_by_chain[chain_i].append(adjusted) break else: raise ValueError("Invalid gap index") for split_seq, gaps in zip(self._seqs, gaps_by_chain): if gaps: split_seq.addGapsBeforeIndices(gaps)
# note that the sequence offsets will be updated in response to the # lengthChanged signal, so we don't need to explicitly call # _updateSeqOffsets here
[docs] def addGapsToChainStartsAndEnds(self, gaps: GapRegionsForChains): """ Add the specified numbers of gaps to the starts and ends of each chain. :param gaps: The numbers of gaps to add """ assert len(self._seqs) == len(gaps) for chain, gap_region in zip(self._seqs, gaps): start_gaps = [residue.Gap() for _ in range(gap_region.from_start)] chain.insertElements(0, start_gaps) end_gaps = [residue.Gap() for _ in range(gap_region.from_end)] chain.extend(end_gaps)
[docs] def removeGapsFromChainStartsAndEnds(self, gaps: GapRegionsForChains): """ Remove the specified numbers of gaps from the starts and ends of each chain. :param gaps: The numbers of gaps to remove """ assert len(self._seqs) == len(gaps) elems = self._getGapsFromChainStartsAndEnds(gaps) elems = [ residue.CombinedChainResidueWrapper(gap, self) for gap in elems ] self.removeElements(elems)
[docs] def validateGapsToRemoveFromChainStartAndEnds(self, gaps: GapRegionsForChains): """ Make sure that we can remove the specified numbers of gaps from the starts and ends of each chain. :param gaps: The numbers of gaps to remove :raise AssertionError: If some of the sequence elements to be removed aren't actually gaps. """ self._getGapsFromChainStartsAndEnds(gaps)
def _getGapsFromChainStartsAndEnds(self, gaps: GapRegionsForChains): """ Fetch the sequence elements representing the specified numbers of gaps from the starts and ends of each chain. :param gaps: The numbers of gaps to remove :raise AssertionError: If some of the sequence elements to be removed aren't actually gaps. """ elems = [] for chain, gap_region in zip(self._seqs, gaps): elems.extend(chain[0:gap_region.from_start]) if gap_region.from_end: elems.extend(chain[-gap_region.from_end:]) assert all(e.is_gap for e in elems) return elems # ========================================================================== # Annotations # ========================================================================== @property def disulfide_bonds(self): """ :return: A sorted tuple of the valid disulfide bonds. :rtype: tuple(residue.CombinedChainDisulfideBond) """ if self._disulfide_bonds is None: bonds = { residue.CombinedChainDisulfideBond(cur_bond, self) for chain in self._seqs for cur_bond in chain.disulfide_bonds } self._disulfide_bonds = tuple( sorted((bond for bond in bonds if bond.isValid()), key=lambda bond: min(self.index(res) for res in bond))) return self._disulfide_bonds def _updateSecondaryStructure(self): # See AbstractProteinSequenceMixin for documentation self._secondary_structures = [] for chain, offset in zip(self._seqs, self._seq_offsets): for split_ss in chain.secondary_structures: split_start, split_end = split_ss.limits combined_start = split_start + offset combined_end = split_end + offset combined_ss = _SecondaryStructure( (combined_start, combined_end), split_ss.ssa_type) self._secondary_structures.append(combined_ss) def _clearSecondaryStructureCache(self): self._secondary_structures = None
[docs] def isKinaseChain(self): return any(chain.isKinaseChain() for chain in self.chains)
# ========================================================================== # Predictions # ========================================================================== @property def pred_disulfide_bonds(self): # See AbstractProteinSequenceMixin for documentation if self._pred_disulfide_bonds is None: bonds = { residue.CombinedChainDisulfideBond(cur_bond, self) for chain in self._seqs for cur_bond in chain.pred_disulfide_bonds } key = lambda bond: min( self.index(res) for res in bond if res in self) self._pred_disulfide_bonds = tuple(sorted(bonds, key=key)) return self._pred_disulfide_bonds def _updatePredSecondaryStructure(self): # See AbstractProteinSequenceMixin for documentation self._pred_secondary_structures = [] for chain, offset in zip(self._seqs, self._seq_offsets): for split_ss in chain.pred_secondary_structures: split_start, split_end = split_ss.limits combined_start = split_start + offset combined_end = split_end + offset combined_ss = _SecondaryStructure( (combined_start, combined_end), split_ss.ssa_type) self._pred_secondary_structures.append(combined_ss) # ========================================================================== # Chains and Sequences # ========================================================================== def _updateSeqOffsets(self): """ Update `self._seq_offsets`, which gives the residue index for the start of each chain. Should be called when `self._seqs` changes. """ self._seq_offsets = [0] + list( itertools.accumulate(len(seq) for seq in self._seqs[:-1])) self._ro_seq_offsets = tuple(self._seq_offsets) def _updateSeqCaches(self): """ Update the read-only copy of the sequence and sequence offsets. These copies are used to ensure that clients of this class can't modify internal data. """ self._ro_seqs = tuple(self._seqs) self._chain_indices = types.MappingProxyType( {seq: i for i, seq in enumerate(self._seqs)}) def _indexToSeqIndexAndIndex(self, index): """ Convert a combined-chain residue index to a sequence index and a residue index within the specified sequence. :param index: A valid combined-chain residue index :type index: int :return: The split-chain sequence index and residue index :rtype: tuple(int, int) """ if index < 0: index = len(self) + index seq_index = bisect.bisect_right(self._seq_offsets, index) - 1 index_in_seq = index - self._seq_offsets[seq_index] return seq_index, index_in_seq
[docs] def indexToSeqAndIndex(self, index): """ Convert a combined-chain residue index to a split-chain sequence and a residue index within the specified sequence. :param index: A valid combined-chain residue index :type index: int :return: A tuple of - the split-chain sequence - residue index - the starting index of the split-chain sequence :rtype: tuple(ProteinSequence, int, int) """ seq_index, index_in_seq = self._indexToSeqIndexAndIndex(index) return self._seqs[seq_index], index_in_seq, self._seq_offsets[seq_index]
@property def chain(self): return "All" @property def chains(self): return self._ro_seqs @property def chain_offsets(self): return self._ro_seq_offsets
[docs] def hasChain(self, chain_name): """ Does this sequence contain a chain with the specified name? :param chain_name: The chain name to check :type chain_name: str :rtype: bool """ return any(chain_name == seq.chain for seq in self._seqs)
[docs] def addChain(self, seq): """ Add a new chain to this sequence. :param seq: The chain to add :type seq: ProteinSequence """ old_len = len(self) new_len = old_len + len(seq) self.lengthAboutToChange.emit(old_len, new_len) self._seqs.append(seq) self._updateSeqOffsets() self._updateSeqCaches() self.clearDisulfideBondsCache() self._connectSignals(seq) self._clearSecondaryStructureCache() self.annotations.chainAdded(seq) self.lengthChanged.emit(old_len, new_len)
[docs] def removeChain(self, seq): """ Remove a chain from this sequence. Note that you should not remove the last chain; instead, remove this sequence from the alignment. :param seq: The chain to remove :type seq: ProteinSequence """ if len(self._seqs) == 1: raise ValueError(f"Cannot remove last chain from a " f"{self.__class__.__name__}") index = self._seqs.index(seq) old_len = len(self) new_len = old_len - len(seq) self.lengthAboutToChange.emit(old_len, new_len) del self._seqs[index] self._updateSeqOffsets() self._updateSeqCaches() self._disconnectSignals(seq) self._clearSecondaryStructureCache() self.clearDisulfideBondsCache() self.annotations.chainRemoved(seq) self.lengthChanged.emit(old_len, new_len)
[docs] def removeChains(self, seqs): """ Remove multiple chains from this sequence. Note that you should not all chain from a combine-chain sequence; instead, remove the sequence from the alignment. :param seqs: The chains to remove :type seqs: list[ProteinSequence] """ for seq in seqs: self.removeChain(seq)
[docs] def insertElementByChain(self, index, chain, element): """ Add the given element to the specified chain of a sequence. :param index: The index to insert the element at. Note that this is the index in the sequence, not the chain. :type index: int :param chain: The chain to insert into :type chain: sequence.ProteinSequence :param element: The element to insert :type element: str or residue.AbstractSequenceElement """ chain_index = self._chain_indices[chain] index_in_chain = index - self._seq_offsets[chain_index] assert 0 <= index_in_chain <= len(chain) chain.insertElements(index_in_chain, [element])
[docs] def offsetForChain(self, chain): """ Get the combined-chain residue index for the first residue of the specified chain. :param chain: The chain :type chain: ProteinSequence :return: The offset :rtype: int """ chain_index = self._chain_indices[chain] return self._seq_offsets[chain_index]
def _getMutationChainIndices(self, start, end): """ Get the chain and chain indices referred to by the specified combined- chain indices. :param start: The index at which to start mutating :type start: int :param end: The index of the last mutated element (exclusive) :type end: int :return: A tuple of - the chain that the indices refer to - The start index within the chain - The end index within the chain :rtype: tuple(sequence.ProteinSequence, int, int) :raise MultipleChainsError: If the specified residue range spans multiple chains. """ start_chain, start_chain_index, _ = self.indexToSeqAndIndex(start) # since end is exclusive, we want the chain of the previous residue if end > start: end -= 1 offset = 1 elif end < start: raise RuntimeError("End must be greater than or equal to start") else: offset = 0 end_chain, end_chain_index, _ = self.indexToSeqAndIndex(end) end_chain_index += offset if start_chain != end_chain: raise MultipleChainsError("Cannot mutate multiple chains.") return start_chain, start_chain_index, end_chain_index # ========================================================================== # Caching # ==========================================================================
[docs] def clearAllCaching(self): # we intentionally don't clear the read-only sequence and sequence- # offset caches, since those aren't regenerated as needed. self.annotations.clearAllCaching() self._clearSecondaryStructureCache() self.clearDisulfideBondsCache() for chain in self._seqs: chain.clearAllCaching()
[docs]class MultipleChainsError(ValueError): """ An exception raised when the specified indices span multiple chains but the operation can only be carried out on a single chain. """
# This class intentionally left blank
[docs]class StructureSequence(structure._AtomCollection): """ Class representing a sequence of protein residues. """ def _getResidueIterator(self): return structure._structure._get_residues_by(self) residue = property( _getResidueIterator, doc="Returns residue iterator for all residues in the sequence")
[docs]def get_structure_sequences(st): """ Iterates over all sequences in the given structure. """ mm.mmpdb_initialize(mm.error_handler) new_indices = mm.mmpdb_get_sequence_order(st.handle, mm.MMPDB_SEQUENCE_ORDER) mm.mmpdb_terminate() curr_sequence_atoms = [] prev_res_atom = None # An atom of the previous residue prev_residue_id = (None, None, None) # Skip atom 0, as it is simply a place holder, because atoms are 1-indexed. for ai in new_indices[1:]: atom = st.atom[ai] residue_id = (atom.resnum, atom.chain, atom.inscode) # Check to see whether we reached the end of this sequence if residue_id != prev_residue_id and prev_res_atom is not None and not mm.mmct_res_connected( st.handle, prev_res_atom, ai): # This is the start of a new sequence yield StructureSequence(st, curr_sequence_atoms) curr_sequence_atoms = [] prev_res_atom = ai prev_residue_id = residue_id # Whether new sequence or not, we need to update these: curr_sequence_atoms.append(ai) if curr_sequence_atoms: yield StructureSequence(st, curr_sequence_atoms)
[docs]def find_generalized_pattern(sequence_list, pattern, validate_pattern=False): """ Finds a generalized sequence pattern within specified sequences. NOTE: The search is performed in the forward direction only. :param sequence_list: list of sequence dictionaries to search. :type sequence_list: list(dict) Each sequence is a dictionary including amino acid strings and associated data. The amino acid string should be ungapped. The dictionary should include 'amino_acids' key and optionally 'secondary_structure', 'solvent_accessibility' and 'flexibility' keys: - sequence is expressed as uppercase 1-letter code: sequence['amino_acids'] = 'ACDEFGHIKLMNPQRSTVWY' (sequence in 1-letter code, mandatory) - secondary structure usese 'h', 'e' and ' ' symbols: sequence['secondary_structure'] = 'hhhhh eeeee ' - solvent accessibility uses 'e' (exposed) and ' ' (buried) symbols: sequence['solvent_accessibiliy' = 'eeeee eeeee ' - flexibility uses 'f' (flexible) and ' ' (not flexible) symbols: sequence['flexibility'] = ' fffff fffff' :type pattern: str :param pattern: Pattern defined using extended PROSITE syntax. - standard IUPAC one-letter codes are used for all amino acids - each element in a pattern is separated using '-' symbol - symbol 'x' is used for position where any amino acid is accepted - ambiguities are listed using the acceptable amino acids between square brackets, e.g. [ACT] means Ala, Cys or Thr - amino acids not accepted for a given position are indicated by listing them between curly brackets, e.g. {GP} means 'not Gly and not Pro' - repetition is indicated using parentheses, e.g. A(3) means Ala-Ala-Ala, x(2,4) means between 2 to 4 any residues - the following lowercase characters can be used as additional flags: - 'x' means any amino acid - 'a' means acidic residue: [DE] - 'b' means basic residue: [KR] - 'o' means hydrophobic residue: [ACFILPWVY] - 'p' means aromatic residue: [WYF] - 's' means solvent exposed residue - 'h' means helical residue - 'e' means extended residue - 'f' means flexible residue - Each position can optionally by followed by @<res_num> expression that will match the position with a given residue number. - Entire pattern can be followed by :<index> expression that defines a 'hotspot' in the pattern. When the hotspot is defined, only a single residue corresponding to (pattern_match_start+index-1) will be returned as a match. The index is 1-based and can be used to place the hotspot outside of the pattern (can also be a negative number). Pattern examples: - N-{P}-[ST] : Asn-X-Ser or Thr (X != Pro) - N[sf]-{P}[sf]-[ST][sf] : as above, but all residues flexible OR solvent exposed - Nsf-{P}sf-[ST]sf : as above, but all residues flexible AND solvent exposed - Ns{f} : Asn solvent exposed AND not in flexible region - N[s{f}] : Asn solvent exposed OR not in flexible region - [ab]{K}{s}f : acidic OR basic, with exception of Lys, flexible AND not solvent exposed - Ahe : Ala helical AND extended - no match possible - A[he] : Ala helical OR extended - A{he} : Ala coiled chain conformation (not helical nor extended) - [ST] : Ser OR Thr - ST : Ser AND Thr - no match possible :type validate_pattern: boolean :param validate_pattern: If True, the function will validate the pattern without performing the search (the sequences parameter will be ignored) and return True if the pattern is valid, or False otherwise. The default is False. :rtype: list of lists of integer tuples or False if the pattern is invalid :return: False if the specified input pattern was incorrect. Otherwise, it returns a list of lists of matches for each input sequence. Each match is a (start, end) tuple where start and end are matching sequence positions. """ ''' FIXME This will modify the user's input. Not good. Intead we can expect the user to do this themselves. if invert and sequence_list: for k in sequences.values(): sequences[k] = sequences[k][::-1] ''' # Cast unicode pattern to str to avoid b''.translate() exception pattern = str(pattern) # Remove color designator if present if '#' in pattern: pattern = pattern[:pattern.find('#')] hotspot = None # Extract a hotspot marker if ':' in pattern: pos = pattern.find(':') try: hotspot = int(pattern[pos + 1:]) except ValueError: hotspot = None else: pattern = pattern[:pos] # Split the pattern by dash symbols plist = [p for p in pattern.split('-') if p] pattern = "" idlist = [] # Extract residue number matches for index, p in enumerate(plist): _id = "" match = re.match(r""" (.*) # everything before the @ @(\d+[a-zA-Z]?) # the @ and residue number with optional insertion code (.*)$ # everything after the residue number """, p, flags=re.VERBOSE) if match: # Fail if a residue number match is requested but # the sequence list does not have residue number information if sequence_list and not sequence_list[0].get("res_ids"): return False p = match.group(1) + match.group(3) _id = match.group(2) else: _id = "" if p == "": p = "x" plist[index] = p idlist.append(_id) replace_wildcards = lambda x: x.replace('a', 'ED'). \ replace('b', 'KR').replace('o', 'WILYFVPCA').replace('p', 'WYF') # Convert the pattern to a regular expression (a bit clunky code) for p in plist: s = p.split('(') p = s[0] pattern += '(' # Extract included, excluded and mandatory amino acids incl_aa = delete_from_str(''.join(re.findall(r"\[[abopA-Z]+\]", p)), '[]') excl_aa = delete_from_str(''.join(re.findall(r"\{[A-Zabop]+\}", p)), '{}') mand_aa = delete_from_str(p, incl_aa + excl_aa + "[]{}efhs") incl_aa = replace_wildcards(incl_aa) excl_aa = replace_wildcards(excl_aa) if 'x' in incl_aa + mand_aa: pattern += '.' else: if 'x' in excl_aa: # Exclude everything return False if len(mand_aa) > 1: # Wrong pattern, no match possible return False if mand_aa: if mand_aa in "abop": incl_aa = replace_wildcards(mand_aa) else: pattern += mand_aa if incl_aa or excl_aa: pattern += '[' if incl_aa: pattern += delete_from_str(incl_aa, excl_aa) else: pattern += '^' + excl_aa pattern += ']' # Extract included, excluded and mandatory flags incl_flag = delete_from_str(''.join(re.findall(r"\[[efhs\{\}]+\]", p)), '[]{}') excl_flag = delete_from_str(''.join(re.findall(r"\{[efhs\[\]]+\}", p)), '{}[]') mand_flag = [ c for c in delete_from_str(p, incl_flag + excl_flag + "[]{}") if c in "efhs" ] incl_flag = delete_from_str(incl_flag, excl_flag) if 'e' in mand_flag and 'h' in mand_flag: # Can't be simultaneously helix and extended return False p = "" for f in ['eh', 's', 'f']: p = "" for c in f: if c in mand_flag: p = c elif c in incl_flag: if '[' in p: p = '[' + c + p[1:] else: p = '[' + c + ']' elif c in excl_flag: if '[' in p: p = '[^' + c + p[2:] else: p = '[^' + c + ']' if not p: p = '.' pattern += p pattern += '-)' # Add number of pattern repetitions if len(s) > 1: pattern += '{' + s[1].replace(')', '}') # Compile the pattern try: pattern = re.compile(pattern) except Exception: # Syntax error: quit return False if validate_pattern: return True results = [] # Use the pattern to search the sequences for sequence in sequence_list: match_list = [] if 'amino_acids' in sequence: length = len(sequence['amino_acids']) if 'secondary_structure' not in sequence: sequence['secondary_structure'] = ' ' * length if 'solvent_accessibility' not in sequence: sequence['solvent_accessibility'] = ' ' * length else: # replace "e" for "exposed" with "s" for "solvent-exposed" sequence['solvent_accessibility'] = sequence[ 'solvent_accessibility'].replace('e', 's') if 'flexibility' not in sequence: sequence['flexibility'] = ' ' * length # Convert the sequence to flat format: Ahsf- # (5 characters per residue) seq_string = ''.join([ ''.join(s) for s in zip(sequence['amino_acids'], sequence['secondary_structure'], sequence['solvent_accessibility'], sequence['flexibility'], '-' * length) ]) # Build list of matches match_list = [] match = pattern.search(seq_string) while match: start = match.start() end = match.end() # Now test residue IDs has_match = True if 'res_ids' in sequence and any(idlist): res_ids = sequence['res_ids'] for index, id in enumerate(idlist): if id and id != res_ids[old_div(start, 5) + index]: has_match = False break if has_match: if hotspot is None: match_list.append((old_div(start, 5), old_div(end, 5))) else: match_list.append(((old_div(start, 5)) + hotspot - 1, (old_div(start, 5)) + hotspot - 1)) # Why is the second item needed? # It is only needed because MSV expects a range # of residues returned for each match, so it knows # start and end position. If the hotspot is defined, # start == end. Thus MSV doesn't have to know # if the pattern included hotspot definition or not. # If the caller knows the hotspot was defined, # it can simply ignore the second value. match = pattern.search(seq_string, start + 1) # Append the result even if it's [] results.append(match_list) return results
[docs]def find_pattern(seq, pattern): """ Find pattern matches in a specified StructureSequence object. Returns a list of matching positions. :type seq: `StructureSequence` :param seq: StructureSequence object :type pattern: string :param pattern: Sequence pattern. The syntax is described in find_generalized_pattern. :rtype: list of lists of integer tuples or None :return: None if the specified input pattern was incorrect. Otherwise, it returns a list of lists of matches for each residue position in the input structure. Each match is a (start, end) tuple where start and end are matching sequence positions. If 'hotspot' is specified then start = end. """ seq_dict = convert_structure_sequence_for_pattern_search(seq) return find_generalized_pattern([seq_dict], pattern)
[docs]def assign_residue_numbers(residues, start_res=None, end_res=None): """ Assign residue numbers to the given residues based on the residues before and after :param residues: Residues that need numbering. Will be modified in-place. :type residues: list[residue.Residue] :param start_res: Previous residue. Pass None if the residues are N-terminal :type start_res: residue.Residue or NoneType :param end_res: Next residue. Pass None if the residues are C-terminal :type end_res: residue.Residue or NoneType """ # Make shallow copy without gaps residues = [elem for elem in residues if elem.is_res] start_num = None resnum_and_inscode = None num_to_renumber = len(residues) if start_res is None: if end_res is None: # Both are None - number starting from 1 start_num = 1 else: # Number N-terminal residues start_num = end_res.resnum - num_to_renumber elif end_res is None or (end_res.resnum - start_res.resnum - 1 >= num_to_renumber): # Residues are C-terminal or there are enough missing residue numbers # that each residue can have its own number without the need for # insertion codes. start_num = start_res.resnum + 1 else: # There are more residues than residue numbers, so we need # to use insertion codes. resnum_and_inscode = gen_resnums_and_inscodes(start_res.resnum, start_res.inscode, end_res.resnum, end_res.inscode) if len(resnum_and_inscode) < num_to_renumber: # There are more residues than we can possibly fit even # using every number + insertion code combination, so # pad with Nones nones = [None] * (num_to_renumber - len(resnum_and_inscode)) resnum_and_inscode.extend(nones) if resnum_and_inscode is not None: for resnum, res in zip(resnum_and_inscode, residues): if resnum is None: inscode = None else: resnum, inscode = resnum res.resnum = resnum res.inscode = inscode else: for idx, res in enumerate(residues, start=start_num): res.resnum = idx res.inscode = None
[docs]def gen_resnums_and_inscodes(start_resnum, start_inscode, end_resnum, end_inscode): """ Create a list of all residue numbers/insertion code combinations possible between the given endpoints. If the ending residue number and insertion code are less than or equal to the starting residue number and insertion code, then an empty list will be returned. :param start_resnum: The starting residue number. :type start_resnum: int :param start_inscode: The starting insertion code. :type start_inscode: str :param end_resnum: The ending residue number. :type end_resnum: int :param end_inscode: The ending insertion code. :type end_inscode: str :return: A list of residue numbers and insertion codes :rtype: list[tuple[int, str]] """ # Make sure that the insertion codes are ASCII letters or blanks start_inscode = start_inscode.upper() or " " end_inscode = end_inscode.upper() or " " for resnum, inscode in ((start_resnum, start_inscode), (end_resnum, end_inscode)): if inscode not in string.ascii_uppercase + " ": raise IOError("Invalid insertion code for residue %s%s." % (resnum, inscode)) if end_resnum < start_resnum or (end_resnum == start_resnum and (end_inscode <= start_inscode or end_inscode == " ")): # the end point is before the start point return [] cur_resnum = start_resnum cur_inscode = start_inscode resnum_and_inscode = [] while True: if cur_inscode == " ": cur_inscode = "A" elif cur_inscode == "Z": cur_inscode = " " cur_resnum += 1 else: cur_inscode = chr(ord(cur_inscode) + 1) if cur_resnum == end_resnum and cur_inscode == end_inscode: return resnum_and_inscode resnum_and_inscode.append((cur_resnum, cur_inscode))
def _get_protein_sequence(chain): """ Converts a protein chain to a ProteinSequence object. :param chain: Protein chain to get the sequence of. :type chain: `structure._Chain` :return: A protein sequence. :rtype: `ProteinSequence` """ sequence = ProteinSequence([res.getCode() for res in chain.residue]) sequence.chain = chain.name return sequence
[docs]def get_pairwise_sequence_similarity(chain1, chain2, consider_gap=True, method='muscle'): """ Given two single chain sequences, align them, and return sequence similarity among them. :param chain1: The first sequence chain. :type chain1: `structure._Chain` :param chain2: The second sequence chain. :type chain2: `structure._Chain` :param consider_gap: Whether or not to consider gaps in the alignment, default to true. :type consider_gap: bool :param method: Which alignment method to use ('muscle' or 'clustalw') :type method: string :return: Sequence similarity of the alignment of the two. :rtype: float, between 0.0 and 1.0 """ with ioredirect.IOSilence(): aln = align_from_chains([chain1, chain2], method=method) return aln[0].getSimilarity(aln[1], consider_gap)
[docs]def create_alignment_from_chains(chains): """ Return `ProteinAlignment` object comprised of two chains :param chains: Chains to be aligned :type chains: iterable(structure._Chain) """ # Dev note: Consider moving this to a ProteinSequence class method # To avoid a circular import from schrodinger.protein.alignment import ProteinAlignment aln = ProteinAlignment() for chain in chains: seq = _get_protein_sequence(chain) aln.addSeq(seq) return aln
[docs]def align_alignment(aln, second_aln=None, method='muscle'): """ Perform alignment from an ProteinAlignment object :param aln: Alignment data :type aln: `ProteinAlignment` :param method: Which method/program to use :type method: string :return: Aligned sequences :rtype: `ProteinAlignment` """ if method == 'muscle': from schrodinger.protein.tasks import muscle job = muscle.MuscleJob(aln, second_aln=second_aln) new_aln = job.run() elif method == 'clustalw': from schrodinger.protein.tasks import clustal if second_aln: raise NotImplementedError(""" ClustalW2 alignment of alignments (profiling) is not supported directly via this API. Consider using MUSCLE instead (method='muscle') """) job = clustal.ClustalJob(aln) new_aln = job.run() else: raise ValueError("Unsupported alignment method: {}".format(method)) return new_aln
[docs]def align_from_chains(chains, method='muscle'): """ Perform alignment on a series of chains :param chains: Chains to be aligned :type chains: iterable(structure._Chain) :param method: Which method/program to use (choices 'muscle', 'clustalw') :type method: string :return: Aligned sequences :rtype: `ProteinAlignment` """ aln = create_alignment_from_chains(chains) return align_alignment(aln, method=method)
[docs]def get_aligned_residues(st1, st2, method='muscle'): """ This generator will yield 2 structure._Residue objects - one from each structure - for each position in aligned sequences. :param st1: First structure. :type st1: `structure.Structure` :param st2: Second structure :type st2: `structure.Structure` :return: Generates tuples of 2 residues that align at each position. :rtype: generator(structure._Residue or None, structure._Residue or None) :raises ValueError: if structures don't have equivalent chains. """ if st1.chain.chainNames() != st2.chain.chainNames(): raise ValueError("Input structures must have same chains") for chain1, chain2 in zip(st1.chain, st2.chain): alignment = align_from_chains([chain1, chain2], method=method) residues1 = list(chain1.residue) residues2 = list(chain2.residue) for seq_res1, seq_res2 in zip(*alignment): st_res1 = residues1[ seq_res1.gapless_idx_in_seq] if seq_res1.is_res else None st_res2 = residues2[ seq_res2.gapless_idx_in_seq] if seq_res2.is_res else None yield st_res1, st_res2
[docs]def get_aligned_structure_residues(sts, method='muscle'): """ This generator will yield 2 structure._Residue objects - one from each structure for each position in aligned sequences. :param sts: Structures to align :type sts: list(structure.Structure) :return: Generates lists of residues that align at each position. :rtype: generator(list[structure._Residue or None]) """ # To avoid a circular import from schrodinger.protein.alignment import ProteinAlignment aln = ProteinAlignment() for st in sts: seq = ProteinSequence([res.getCode() for res in st.residue]) # sequence.chain = chain.name aln.addSeq(seq) alignment = align_alignment(aln, method=method) residues_by_st = [list(st.residue) for st in sts] for aligned_residues in zip(*alignment): residues = [ residues_by_st[i][seq_res.gapless_idx_in_seq] if seq_res.is_res else None for i, seq_res in enumerate(aligned_residues) ] yield residues
[docs]def offset_indices(indices): """ Offset insertion indices based on numbering before insertion to reflect numbering after insertion. For example, [1, 1, 2, 3, 5, 8] would be changed to [1, 2, 4, 6, 9, 13] :type indices: list[int] """ sorted_indices = sorted(indices) for i, cur_index in enumerate(sorted_indices): sorted_indices[i] = cur_index + i return sorted_indices