Source code for schrodinger.protein.sequence

"""
Implementation of ProteinSequence, Sequence, and StructureSequence class.

StructureSequence allows iteration over all sequences in a given protein CT,
and iteration over residues of each (in sequence order).
"""

import bisect
import collections
import copy
import dataclasses
import enum
import itertools
import re
import string
import types
import typing
from collections import namedtuple
from functools import partial
from typing import Dict

from past.utils import old_div

from Bio.Seq import translate

from schrodinger import structure
from schrodinger.application.msv import utils as msv_utils
from schrodinger.infra import mm
from schrodinger.models import json
from schrodinger.protein import annotation
from schrodinger.protein import constants
from schrodinger.protein import properties
from schrodinger.protein import residue
from schrodinger.protein.constants import AMINO_ACIDS_3_TO_1
from schrodinger.protein.properties import PropertyType
from schrodinger.Qt import QtCore
from schrodinger.structutils import analyze
from schrodinger.test import ioredirect
from schrodinger.ui.sequencealignment.sequence import delete_from_str
from schrodinger.ui.sequencealignment.structure_utils import calculate_sasa_dict

SEQ_ANNO_TYPES = annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES
_SecondaryStructure = namedtuple('_SecondaryStructure', ['limits', 'ssa_type'])
DISALLOWED_SEQ_CHARS = re.compile(r"[^a-zA-Z0-9~\.\-]")


[docs]def guess_seq_type(res_names):
    """
    Takes an iterable of residue names and returns the appropriate sequence
    class

    :param res_names: An iterable of residue names.  Note that all residue names
        must be uppercase.
    :type res_names: Iterable(str)

    :return: The appropriate class for the input sequence
    :rtype: Type[AbstractSingleChainSequence]

    Note that we use a pretty simple heuristic here.
    """
    res_names = set(res_names)
    seq_classes = [RNASequence, DNASequence, ProteinSequence]
    element_names = []
    for seq_class in seq_classes:
        eles = set(seq_class.ALPHABET.keys())
        unk = seq_class._unknown_res_type
        eles.add(unk.short_code)
        eles.add(unk.long_code)
        element_names.append(eles)

    # Determine type by the highest number of matching pdbres names
    match_counts = [
        len(res_names & seq_elem_names) for seq_elem_names in element_names
    ]
    if not any(match_counts):
        return ProteinSequence
    idxmax = match_counts.index(max(match_counts))
    return seq_classes[idxmax]


[docs]def make_sequence(elements, *args, **kwargs):
    """
    Guesses the appropriates Sequence type from the names of residues and
    returns an instance with those residues.

    :param elements: A list of strings to examine
    :type elements: list(str)

    :rtype: protein.Sequence
    :return: An instance of the appropriate type
    """
    SeqClass = guess_seq_type(elements)
    return SeqClass(elements, *args, **kwargs)


[docs]class Sequence:
    # We use this as a base class for both actual sequence objects and
    # read-only sequence proxies. This is necessary for PyQt signal typing.
    pass


[docs]class AbstractSequence(Sequence, QtCore.QObject):
    """
    A base class for single-chain and combined-chain biological sequences.

    :cvar ORIGIN: Possible sequence origins
    :vartype ORIGIN: enum.Enum

    :cvar AnnotationClass: Class to use for annotations
    :vartype AnnotationClass: annotation.SequenceAnnotations

    :cvar ElementClass: Class to use for elements
    :vartype ElementClass: Type[residue.Residue]

    :cvar ALPHABET: A mapping of string representations of elements to
        element types. Concrete subclasses must define this.
    :vartype ALPHABET: dict(str, residue.ElementType)

    :cvar _gap_chars: A tuple of permissible gap characters in the element list;
        the first item will be used for serialization.
    :vartype _gap_chars: tuple(str)

    :cvar _unknown_res_type: The type for an unknown residue
    :vartype _unknown_res_type: residue.ElementType

    :cvar residuesChanged: A signal emitted when sequence residues are changed.
        Emitted with the indices of the first and last changed residues.
    :vartype residuesChanged: QtCore.pyqtSignal

    :cvar lengthAboutToChange: A signal emitted when the sequence length is
        about to change. Emitted with the old and new lengths.
    :vartype lengthAboutToChange: QtCore.pyqtSignal

    :cvar lengthChanged: A signal emitted when the sequence length is changed.
        Emitted with the old and new lengths.
    :vartype lengthChanged: QtCore.pyqtSignal

    :cvar nameChanged: A signal emitted when the sequence name is changed.
    :vartype nameChanged: QtCore.pyqtSignal

    :cvar visibilityChanged: A signal emitted when the visibility is changed.
    :vartype visibilityChanged: QtCore.pyqtSignal

    :cvar structureChanged: A signal emitted when the structure changes.
    :vartype structureChanged: QtCore.pyqtSignal

    :cvar annotationTitleChanged: A signal emitted when an annotation title
        is changed.
    :vartype annotationTitleChanged: QtCore.pyqtSignal
    """

    ORIGIN = enum.Enum("ORIGIN", ["Maestro", "PyMOL"])
    AnnotationClass = None
    ElementClass = residue.Residue
    ALPHABET = {}
    _gap_char = "~"
    _gap_chars = frozenset((_gap_char))
    _unknown_res_type = residue.UNKNOWN

    residuesRemoved = QtCore.pyqtSignal(set)
    residuesAdded = QtCore.pyqtSignal(set)
    residuesChanged = QtCore.pyqtSignal(int, int)
    lengthAboutToChange = QtCore.pyqtSignal(int, int)
    lengthChanged = QtCore.pyqtSignal(int, int)
    nameChanged = QtCore.pyqtSignal()
    visibilityChanged = QtCore.pyqtSignal()
    structureChanged = QtCore.pyqtSignal()
    annotationTitleChanged = QtCore.pyqtSignal()
    pfamChanged = QtCore.pyqtSignal()
    descriptorsCleared = QtCore.pyqtSignal()

    # ==========================================================================
    # Magic Methods
    # ==========================================================================

[docs]    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if self.AnnotationClass:
            self._annotations = self.AnnotationClass(self)
            self._annotations.titleChanged.connect(self.annotationTitleChanged)
        else:
            self._annotations = None

        self.residuesRemoved.connect(
            self._invalidateDescriptorsIfResiduesChanged)
        self.residuesAdded.connect(self._invalidateDescriptorsIfResiduesChanged)
        self.structureChanged.connect(self._invalidateDescriptors)

        self.clearDescriptors()

    # ==========================================================================
    # Residues
    # ==========================================================================

[docs]    def residues(self):
        """
        Return an iterable of residues, ignoring gaps.

        :return: Iterable of residues
        :rtype: iter(Residue)
        """
        for res in self:
            if res.is_res:
                yield res

[docs]    def getNextResidue(self, res):
        """
        Return the next residue in the sequence (ignoring gaps) or None if this
        is the last residue.

        :param res: A given residue in the sequence
        :type res: schrodinger.protein.residue.Residue

        :return: The previous residue in the sequence
        :rtype: schrodinger.protein.residue.Residue
        """

        indices = range(res.idx_in_seq + 1, len(self))

        for position in indices:
            if not self[position].is_gap:
                return self[position]

[docs]    def getPreviousResidue(self, res):
        """
        Return the previous residue in the sequence (ignoring gaps) or None if
        this is the first residue.

        :param res: A given residue in the sequence
        :type res: schrodinger.protein.residue.Residue

        :return: The previous residue in the sequence
        :rtype: `schrodinger.protein.residue.Residue`
        """

        indices = range(res.idx_in_seq - 1, -1, -1)

        for position in indices:
            if not self[position].is_gap:
                return self[position]

[docs]    def iterNeighbors(self):
        """
        Return an iterable of three element tuples consisting of
        (prev_res, curr_res, next_res), ignoring gaps.

        None is used for neighbors of first and last residues in the sequence,
        and does not indicate gaps here.

        :return: Iterable of 3-tuples, each element of the each tuple being
            either a `schrodinger.protein.residue.Residue` or None
        :rtype: iter(tuple(Residue or NoneType, Residue, Residue or NoneType))
        """
        res_iter = self.residues()
        try:
            next_res = next(res_iter)
        except StopIteration:
            return
        prev_res, cur_res = None, None
        # Add None to the end to represent the neighbor of the last residue
        for res in itertools.chain(res_iter, [None]):
            prev_res, cur_res, next_res = cur_res, next_res, res
            yield (prev_res, cur_res, next_res)

[docs]    def index(self, res):
        """
        Returns the index of the specified residue.

        :param res: The residue to find
        :type res: residue.Residue

        :return: The index of the residue
        :rtype: int
        """
        raise NotImplementedError

[docs]    def indices(self, residues):
        """
        Returns the indices of all specified residues.  Note that there is no
        guarantee that the returned integers will be in the same order as the
        input residues.  (For combined-chain sequences, it's highly likely that
        they *won't* be.)

        :param res: The residues to find indices of
        :type res: Iterable(residue.Residue)

        :return: The indices of the residues
        :rtype: list[int]
        """
        raise NotImplementedError

[docs]    def getRun(self, res):
        """
        For a given residue or gap, return a set of all adjacent
        element indices in the sequence that are also residues or gaps.

        :param res: Residue to get the run of
        :type res: residue.AbstractSequenceElement

        :reuturn: Set of residue indices in the run
        :rtype: set(int)
        """
        res_idx = self.index(res)
        run_idx = set([res_idx])
        curr_idx = res_idx - 1
        while (curr_idx >= 0 and self[curr_idx] and
               self[curr_idx].is_gap == res.is_gap):
            run_idx.add(curr_idx)
            curr_idx -= 1
        curr_idx = res_idx + 1
        while (curr_idx < len(self) and self[curr_idx] and
               self[curr_idx].is_gap == res.is_gap):
            run_idx.add(curr_idx)
            curr_idx += 1
        return run_idx

[docs]    def insertElements(self, index, elements):
        """
        Insert a list of elements or sequence element into this sequence.

        If `elements` is a string or iterable of strings, residue numbers will
        be automatically assigned.

        :param index: The index at which to insert elements
        :type index: int

        :param elements: A list of elements to insert
        :type elements: iterable(self.ElementClass) or iterable(str)
        """
        raise NotImplementedError

[docs]    def mutate(self, start, end, elements):
        """
        Mutate sequence elements starting at the given index to the provided
        elements.

        :param start: The index at which to start mutating
        :type start: int

        :param end: The index of the last mutated element (exclusive)
        :type end: int

        :param elements: The elements to which to mutate the sequence
        :type elements: iterable(self.ElementClass) or iterable(str)
        """
        raise NotImplementedError

[docs]    def append(self, element):
        """
        Appends an element to the sequence

        :param element: The element to append to this sequence
        :type: element: self.ElementClass or basestring
        """
        raise NotImplementedError

[docs]    def extend(self, elements):
        """
        Extends the sequence with elements from an iterable

        :param elements: The iterable containing elements with which to extend
            this sequence
        :type elements: iterable(self.ElementClass) or iterable(str)
        """
        raise NotImplementedError

[docs]    def getSubsequence(self, start, end):
        """
        Return a sequence containing a subset of the elements in this one

        :param start: The index at which the subsequence should start
        :type start: int

        :param end: The index at which the subsequence should end (exclusive)
        :type end: int

        :return: The requested subsequence
        :rtype: AbstractSequence
        """
        raise NotImplementedError

[docs]    def removeElements(self, eles):
        """
        Remove elements from the sequence.

        :param eles: A list of elements to remove from the sequence.
        :type  eles: list(residue.AbstractSequenceElement)

        :raise ValueError: If any of the given elements are not in the sequence.
        """
        raise NotImplementedError

    @property
    def _res_positions(self):
        """
        Return the residue positions in the sequence,
        updating the cache if needed.

        :return: The residue positions
        :rtype: dict(`schrodinger.protein.residue.AbstractSequenceElement`,
                     tuple(int, int))
        """
        if self._res_positions_cache is None:
            self._updateResiduePositionCache()
        return self._res_positions_cache

    def _removeFromSequence(self, keep_func, start=0, end=None):
        """
        Remove any residues not matching keep_func from the sequence

        :param keep_func: A callable taking a residue and returning a bool
            indicating whether to keep it in the sequence
        :type keep_func: callable

        :param start: The index at which to start filtering
        :type start: int

        :param end: The index at which to end filtering
        :type end: int

        :raises ValueError: In the event that invalid indices are specified
        """
        raise NotImplementedError

    # ==========================================================================
    # Gaps
    # ==========================================================================

    @property
    def gap_char(self):
        # Make gap_char a read-only property to avoid string comparison errors
        return self._gap_char

[docs]    def getGaplessLength(self):
        """
        :return: Length of this sequence ignoring gaps
        :rtype: int
        """
        raise NotImplementedError

[docs]    def getGaps(self):
        """
        :rtype: list(residue.Gap)
        :return: The gaps in the sequence.
        """
        return [res for res in self if res.is_gap]

[docs]    def addGapsByIndices(self, gap_idxs):
        """
        Add gaps to the sequence from a list of gap indices.  Note that these
        indices are based on numbering *after* the insertion.  To insert gaps
        using indices based on numbering before the insertion, see
        `addGapsBeforeIndices`.

        :param gap_idxs: A list of gap indices
        :type gap_idxs: list(int)
        """
        raise NotImplementedError

[docs]    def validateGapIndices(self, gap_idxs):
        """
        Make sure that the specified gap indices are valid input for
        `addGapsByIndices`.  If the indices are invalid, a `ValueError` will be
        raised.  Indices are considered invalid if:

          - they refer to a position that's more than one residue past the end
            of the sequence
          - they are negative

        :param gap_idxs: The gap indices to validate
        :type gap_idxs: list[int]
        """
        if any(idx < 0 for idx in gap_idxs):
            raise ValueError("Negative gap index")
        for i, cur_gap_idx in enumerate(sorted(gap_idxs)):
            if cur_gap_idx > len(self) + i:
                raise ValueError("Invalid gap index")

[docs]    def addGapsBeforeIndices(self, indices):
        """
        Add one gap to the alignment before each of the specified residue
        positions.  Note that these indices are based on numbering *before* the
        insertion.  To insert gaps using indices based on numbering after the
        insertion, see `addGapsByIndices`.

        :param indices: A list of indices to insert gaps before.
        :type indices: list(int)
        """
        if not indices:
            return
        indices = offset_indices(indices)
        self.addGapsByIndices(indices)

[docs]    def removeTerminalGaps(self):
        """
        Remove gaps from the end of the sequence
        """
        terminal_gaps = self.getTerminalGaps()
        if terminal_gaps:
            self.removeElements(terminal_gaps)

[docs]    def getTerminalGaps(self):
        """
        Return terminal gaps.

        :return: A list of terminal gaps (in ascending index order)
        :rtype: list(residue.Gap)
        """

        last_res_idx = len(self) - 1
        while last_res_idx >= 0 and self[last_res_idx].is_gap:
            last_res_idx -= 1
        return self[last_res_idx + 1:]

[docs]    def getGapCount(self):
        """
        :return: the number of gaps in the sequence
        :rtype: int
        """
        return len(self) - self.getGaplessLength()

[docs]    def removeAllGaps(self):
        """
        Remove gaps from the sequence
        """
        self._removeFromSequence(lambda res: res.is_res, 0)

    # ==========================================================================
    # Annotations
    # ==========================================================================

    @property
    def annotations(self):
        return self._annotations

[docs]    def getAnnotation(self, index, annotation):
        """
        Returns the annotation at the specified index or None for a gap.

        :raises ValueError: if the annotation is not available
        """
        anno_name = annotation.name
        if index < len(self):
            element = self[index]
            if element.is_res:
                try:
                    return getattr(element, anno_name)
                except AttributeError:
                    if (self.annotations is None or not isinstance(
                            annotation, self.annotations.ANNOTATION_TYPES)):
                        msg = "%s is not in annotation types" % anno_name
                        raise ValueError(msg)
                    # get sequence-level annotation
                    array = getattr(self.annotations, anno_name)
                    return array[index]
        return None

[docs]    def getAnnotationValueForComparison(self,
                                        col,
                                        anno,
                                        ann_index=0,
                                        cdr_scheme=None):
        """
        Get an annotation value appropriate for determining whether two
        residues have the same annotation value. e.g. for binding site, only
        same-distance contacts of the same ligand should compare equal.

        :param anno: Protein sequence annotation enum member
        :type anno: annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES

        :param ann_index: Annotation index for multi-value annotations
        :type ann_index: int

        :param cdr_scheme: CDR scheme for antibody annotation. (Will be ignored
            if `anno` is not SEQ_ANNO_TYPES.antibody_cdr.)
        :type cdr_scheme: annotation.AntibodyCDRScheme

        :return: Annotation value for comparison or None if the corresponding
            residue is a gap or has no value for this annotation
        :rtype: object or None
        """
        elem = self[col]
        if not anno.can_expand or elem.is_gap:
            return None
        if anno.multi_value:
            if anno is SEQ_ANNO_TYPES.domains:
                if self.annotations.domains is None or elem.domains is None:
                    return None
                domain = self.annotations.domains[ann_index]
                if domain in elem.domains:
                    # Only return domain string if the current residue is part
                    # of this domain
                    return domain
                return None
            elif anno is SEQ_ANNO_TYPES.binding_sites:
                # Return a tuple of (ligand name, contact_enum) so that only
                # contacts of the same ligand match
                if not self.annotations.ligands:
                    return None
                lig = self.annotations.ligands[ann_index]
                lig_contacts = self.getAnnotation(col, anno)
                binding_site = lig_contacts[ann_index]
                if binding_site is annotation.BINDING_SITE.NoContact:
                    return None
                return (lig, binding_site)
            else:
                # TODO MSV-3355 test whether this works for kinase conservation
                anno_val = self.getAnnotation(col, anno)
                if not anno_val:
                    return None
                return anno_val[ann_index]
        elif anno is SEQ_ANNO_TYPES.antibody_cdr:
            cdr_feature = self.annotations.getAntibodyCDR(col,
                                                          scheme=cdr_scheme)
            cdr_label = cdr_feature.label
            if cdr_label is annotation.AntibodyCDRLabel.NotCDR:
                return None
            return cdr_label
        elif anno is SEQ_ANNO_TYPES.kinase_features:
            kinase_feature = elem.kinase_features
            if kinase_feature is annotation.KinaseFeatureLabel.NO_FEATURE:
                return None
            return kinase_feature
        else:
            return self.getAnnotation(col, anno)

[docs]    def getNumAnnValues(self, ann):
        return self.annotations.getNumAnnValues(ann)

[docs]    def clearAllCaching(self):
        """
        This method should be implemented in subclasses that cache any data.
        """

[docs]    def setPfam(self, new_pfam, pfam_name):
        residues = list(self.residues())
        if len(new_pfam) != len(residues):
            raise ValueError("Length of `new_pfam` must be equal to the number "
                             "of residues in the sequence."
                             f"Got: {len(new_pfam)} Expected: {len(residues)}")
        for res, p, in zip(residues, new_pfam):
            res.pfam = p
        self.pfam_name = pfam_name
        self.pfamChanged.emit()

[docs]    def clearPfam(self):
        for res in self.residues():
            res.pfam = None
        self.pfamChanged.emit()

[docs]    def hasPfam(self):
        return not all(p is None for p in self.annotations.pfam)

    # ==========================================================================
    # Structure
    # ==========================================================================

[docs]    def hasStructure(self):
        """
        :return: Whether this sequence has an associated structure.
        :rtype: bool
        """
        raise NotImplementedError

[docs]    def getStructure(self):
        """
        :return: The associated structure.  Will return None if there is no
            associated structure.
        :rtype: schrodinger.structure.Structure or NoneType
        """
        raise NotImplementedError

[docs]    def setStructure(self, struc):
        """
        Set the associated structure.  Can only be used on sequences with an
        associated structure.

        :param struc: The new structure for this sequence
        :type struc: schrodinger.structure.Structure

        :raise RuntimeError: If there's no structure associated with this
            sequence object.
        """
        raise NotImplementedError

    # ==========================================================================
    # Metrics
    # ==========================================================================

    def _getScore(self,
                  scoring_function,
                  other,
                  consider_gaps=True,
                  only_consider=None):
        """
        Calculate and return the average score after applying a scoring function
        to the aligned residues of this sequence and the other sequence.

        :param scoring_function: A function that takes two residues and returns
            a score.
        :type scoring_function: callable(Residue, Residue)

        :param other: A sequence to compare against
        :type other: schrodinger.protein.sequence.Sequence

        :param consider_gaps: Whether we should count gaps when we're
            calculating the average score. Note that the scoring function is not
            applied to gaps, they are only used in finding the N to divide the
            total score by.
        :type consider_gaps: bool

        :param only_consider: Restrict the calculation to a subset of the
                              specified residues
        :type: None or set

        :return: The average score after applying the scoring function to
            the aligned residues of this sequence and the other sequence.
        :rtype: float

        """
        total_score = 0.0
        total_length = 0

        for res1, res2 in zip(self, other):
            if only_consider is not None:
                if res1 not in only_consider:
                    continue
            if res1.is_gap and res2.is_gap:
                # Don't consider gap-only columns
                continue
            elif res1.is_res and res2.is_res:
                total_score += scoring_function(res1, res2)
                total_length += 1
            elif consider_gaps:
                total_length += 1

        if total_length > 0:
            return total_score / total_length
        else:
            return 0.0

[docs]    def getIdentity(self, other, consider_gaps=True, only_consider=None):
        """
        Return a float scoring the identity between the sequence and another
        sequence, assuming that they're already aligned

        :param other: A sequence to compare against
        :type other: schrodinger.protein.sequence.Sequence

        :param consider_gaps: Whether we should count gaps when we're
            calculating the average score.
        :type consider_gaps: bool

        :return: The sequence identity score (between 0.0 and 1.0)
        :rtype: float
        """
        return self._getScore(self.ElementClass.getIdentity,
                              other,
                              consider_gaps,
                              only_consider=only_consider)

[docs]    def getSimilarity(self, other, consider_gaps=True, only_consider=None):
        """
        Return a float score of the similarity count between the sequence and
        another sequence, assuming that they're already aligned.

        :param other: A sequence to compare against
        :type other: schrodinger.protein.sequence.Sequence

        :param consider_gaps: Whether we should count gaps when we're
            calculating the average score.
        :type consider_gaps: bool

        :return: The sequence similarity score (between 0.0 and 1.0)
        :rtype: float
        """
        return self._getScore(self.ElementClass.getBinarySimilarity,
                              other,
                              consider_gaps,
                              only_consider=only_consider)

[docs]    def getConservation(self, other, consider_gaps=True, only_consider=None):
        """
        Return a float scoring the homology conservation between the sequence
        and another sequence, assuming that they're already aligned.

        The homology criterion is based on "side chain chemistry" descriptor
        matching.

        :param other: A sequence to compare against
        :type other: schrodinger.protein.sequence.Sequence

        :param consider_gaps: Whether we should count gaps when we're
            calculating the average score.
        :type consider_gaps: bool

        :return: The sequence conservation score (between 0.0 and 1.0)
        :rtype: float
        """
        return self._getScore(self.ElementClass.getConservation,
                              other,
                              consider_gaps,
                              only_consider=only_consider)

[docs]    def getSimilarityScore(self, other, consider_gaps=True, only_consider=None):
        """
        Return the total score of similarity between the sequence and a
        other sequence, assuming that they're already aligned.

        :param other: A sequence to compare against
        :type other: schrodinger.protein.sequence.Sequence

        :param consider_gaps: Ignored because the similarity with a gap is 0.0
        :type consider_gaps: bool

        :param only_consider: A set of residues to restrict attention to
        :type only_consider: set or None

        :return: The total sequence similarity score
        :rtype: int
        """

        def include_in_calculation(pair):
            res1, res2 = pair
            if res1.is_gap or res2.is_gap:
                return False
            if only_consider is not None and res1 not in only_consider:
                return False
            return True

        residue_pairs = zip(self, other)
        residue_pairs = (
            pair for pair in residue_pairs if include_in_calculation(pair))
        similarities = (
            self.ElementClass.getSimilarity(*pair) for pair in residue_pairs)
        return int(sum(similarities))

[docs]    def getStructureResForRes(self, res):
        """
        :param res: Residue to get structure residue for
        :type res: residue.Residue

        :return: Structure residue or None if no matching residue is found
        :rtype: schrodinger.structure._Residue or NoneType
        """
        raise NotImplementedError

[docs]    def structuredResidueCount(self):
        """
        Get the number of residues in this sequence with an associated
        structured residue.
        :rtype: int
        """
        if not self.hasStructure():
            return 0
        return sum(
            1 for elem in self if (not elem.is_gap and not elem.seqres_only))

[docs]    def hasStructuredResidues(self):
        """
        Return whether this sequence has any structured residues.  This method
        is equivalent to `bool(seq.structuredResidueCount())` but doesn't
        (typically) require iterating through the entire sequence.

        :rtype: bool
        """
        return (self.hasStructure() and
                any(not (elem.is_gap or elem.seqres_only) for elem in self))

[docs]    def getProperty(self, seq_prop):
        """
        Get the sequence's value corresponding to the given SequenceProperty
        object

        :param seq_prop: The object describing the sequence property
        :type seq_prop: schrodinger.protein.properties.SequenceProperty

        :return: The value of the sequence property
        :rtype: float or None
        """
        if seq_prop.property_type == PropertyType.StructureProperty:
            st = self.getStructure()
            if st is not None:
                return st.property.get(seq_prop.property_name)
        else:
            descriptors = self._descriptors[seq_prop.property_source]
            return descriptors.get(seq_prop.property_name)

[docs]    def updateDescriptors(self, descriptors, property_source):
        """
        Updates the descriptor dicts with new descriptor values

        :param descriptors: A dict mapping descriptor names to their values
        :type descriptors: dict[str, float]

        :param property_source: The source of the descriptors
        :type property_source: properties.PropertySource
        """
        self._descriptors[property_source].update(descriptors)

[docs]    def clearDescriptors(self):
        self._descriptors = {
            properties.PropertySource.Sequence: dict(),
            properties.PropertySource.Structure: dict(),
            properties.PropertySource.Residue: dict(),
        }
        self.descriptorsCleared.emit()

    @property
    def descriptors(self):
        return self._descriptors

[docs]    def hasDescriptors(self):
        for descriptors in self._descriptors.values():
            if descriptors:
                return True
        return False

    def _invalidateDescriptorsIfResiduesChanged(self, elements):
        if not self.hasDescriptors():
            return
        if any(elem.is_res for elem in elements):
            self.clearDescriptors()

    def _invalidateDescriptors(self):
        if self.hasDescriptors():
            self.clearDescriptors()


# To convince Hypothesis that Sequence is a sequence (MSV-1966)
collections.abc.Sequence.register(AbstractSequence)


[docs]class AbstractSingleChainSequence(AbstractSequence):
    """
    Base class for single-chain biological sequences

    Note: Protein-specific functionality should go in ProteinSequence.

    :cvar sequenceCopied: A signal emitted when this sequence is copied.
        Emitted with the sequence being copied and the newly created copy.
        This signal is used by the structure model to make sure that the newly
        created copy is kept in sync with the structure.
    :vartype sequenceCopied: QtCore.pyqtSignal
    """

    sequenceCopied = QtCore.pyqtSignal(object, object)

    # ==========================================================================
    # Magic Methods
    # ==========================================================================

[docs]    def __init__(self,
                 elements="",
                 name="",
                 origin=None,
                 entry_id=None,
                 entry_name="",
                 pdb_id="",
                 chain="",
                 structure_chain=None,
                 long_name="",
                 resnums=None):
        """
        Make a sequence object from a list of strings and/or `self.ElementClass`

        Strings are converted to `self.ElementClass` using a mapping of strings
        to element types.

        :param elements: An iterable of elements making up the sequence.
            `None` are interpretted as gaps.
        :type elements: iterable(self.ElementClass or None) or
            iterable(str or None)

        :param name: The name of the sequence (possibly shortened), used for
            display purposes. For a FASTA sequence, this should be a short
            identifier such as the Uniprot ID.
        :type name: str

        :param origin: A piece of metadata indicating where the sequence came
                       from
        :type origin: Sequence.ORIGIN or None

        :param entry_id: An entry associated with the sequence, if any
        :type: str

        :param entry_name: An entry name associated with the sequence, if any
        :type: str

        :param pdb_id: An id associated with the sequence, if any
        :type: str

        :param chain: The chain to which the sequence belongs
        :type chain: str

        :param structure_chain: The chain of the structure this sequence
            is associated with. This is usually the same as `chain` if the
            sequence has a structure but isn't necessarily.
        :type structure_chain: str

        :param long_name: The full name of the sequence. For a FASTA
            sequence, this should be the full FASTA header.
        :type long_name: str

        :param resnums: Residue numbers to assign to `elements`.  If not given,
            residues will be numbered starting from one.  Regardless of what's
            given here, no residue numbering will occur if `elements` is an
            iterable of `ElementClass` and any element already has a residue
            number set. If this iterable is shorter than `elements`, additional
            numbers will be generated by incrementing the last number present.
        :type resnums: Iterable(int)
        """

        if elements is None:
            raise ValueError(
                "Cannot pass value of NoneType to protein sequence")

        if not self.ALPHABET:
            cls_name = self.__class__.__name__
            raise NotImplementedError(
                f"{cls_name} cannot be instantiated directly.")

        super().__init__()
        self.name = name
        self._origin = None
        self.origin = origin
        self.entry_name = entry_name
        if entry_id is not None:
            assert str(entry_id)
        self.entry_id = entry_id
        self.long_name = long_name
        self._chain = chain
        self.structure_chain = structure_chain
        self.pdb_id = pdb_id
        self._visibility = None
        # The _get_structure and _set_structure values will be overwritten by the
        # structure model if this sequence is associated with a structure.
        self._get_structure = None
        self._set_structure = None
        self._residue_map = {}
        self._structure_res_map = None
        if not resnums:
            # Make sure that we set residue numbers if the input was strings
            resnums = (1,)
        self._sequence = self._makeSeqElements(elements, resnums)
        self._cached_str = None
        self._res_positions_cache = None
        self._gapless_length = None

        self.residuesChanged.connect(self._clearCachedStr)
        self.lengthChanged.connect(self._clearCachedStr)

    def __str__(self):
        if self._cached_str is None:
            self._cached_str = "".join(
                self._gap_char if res.is_gap else res.type.short_code
                for res in self._sequence)
        return self._cached_str

[docs]    def __len__(self):
        return len(self._sequence)

    def __iter__(self):
        return iter(self._sequence)

    def __repr__(self):
        """
        Returns a representation of the instance that can be pasted into a repl
        """
        class_open = self.__class__.__name__ + "("
        arguments = ", ".join(self._getReprArguments())
        seq_rep = [class_open, arguments, ")"]
        return "".join(seq_rep)

[docs]    def __contains__(self, item):
        return item in self._res_positions

    def __bool__(self):
        return bool(self._sequence)

    def __getitem__(self, index):
        try:
            return self._sequence[index]
        except IndexError:
            msg = ("index (%d) out of range (sequence length is only %d)" %
                   (index, len(self)))
            msg += "\n" + str(self)
            raise IndexError(msg)

    def __delitem__(self, index):
        elements = self[index] if isinstance(index, slice) else [self[index]]
        self.removeElements(elements)

    def __copy__(self):
        raise TypeError("Cannot make shallow copy. Must call deepcopy")

    def __deepcopy__(self, memo):
        copied_elements = copy.deepcopy(self._sequence, memo)
        copied_descriptors = copy.deepcopy(self._descriptors, memo)
        SequenceClass = self.__class__
        new_seq = SequenceClass(copied_elements,
                                name=self.name,
                                origin=self.origin,
                                entry_id=self.entry_id,
                                chain=self.chain,
                                structure_chain=self.structure_chain,
                                long_name=self.long_name)
        new_seq._descriptors = copied_descriptors
        intra_ss_bonds = (
            bond for bond in self.disulfide_bonds if bond.is_intra_sequence)
        for res1, res2 in intra_ss_bonds:
            residue.add_disulfide_bond(new_seq[res1.idx_in_seq],
                                       new_seq[res2.idx_in_seq])
        intra_ss_bonds = (bond for bond in self.pred_disulfide_bonds
                          if bond.is_intra_sequence)
        for res1, res2 in intra_ss_bonds:
            residue.add_disulfide_bond(new_seq[res1.idx_in_seq],
                                       new_seq[res2.idx_in_seq],
                                       known=False)
        # Note that this method does not copy _get_structure and _set_structure.
        # Instead we emit sequenceCopied.  If this sequence is being monitored
        # by a structure model, then the structure model is responsible for
        # setting these attributes properly in new_seq.
        self.sequenceCopied.emit(self, new_seq)
        return new_seq

    @QtCore.pyqtSlot()
    def _clearCachedStr(self):
        self._cached_str = None

    def _getReprArguments(self):
        """
        Generate the arguments to add to the repr string.

        :return: A list of argument strings
        :rtype: list(str)
        """
        elements = "[%s]" % ",".join(self._resFormatCode(res) for res in self)
        name = "name='%s'" % re.escape(self.name)
        origin = "origin=%s" % self.origin
        entry_id = "entry_id='%s'" % self.entry_id
        chain = "chain='%s'" % self.chain
        return [elements, name, origin, entry_id, chain]

    def _resFormatCode(self, res):
        """
        Returns a representation of a residue for __repr__
        """
        if res.is_gap:
            return "'%s'" % self.gap_char
        return "'%s'" % res

    # ==========================================================================
    # Construction
    # ==========================================================================

[docs]    @classmethod
    def makeSeqElement(cls, element):
        """
        :param element: A sequence element or string representation thereof
        :type element: str or cls.ElementClass

        :return: sequence element
        :rtype: cls.ElementClass
        """

        # TODO Once MSV-1504 is implemented, type checking should be stricter
        # to avoid accidental mixing of amino acid and nucleotide residues
        if isinstance(element, residue.AbstractSequenceElement):
            return element
        elif isinstance(element, residue.CombinedChainResidueWrapper):
            return element.split_res
        elif element is None or element.strip() in cls._gap_chars:
            return residue.Gap()
        element = element.upper().strip()
        element_type = cls.ALPHABET.get(element)
        if element_type is None:
            element_type = cls._unknown_res_type
            if len(element) == 3:
                # Make custom unknown residue type for 3-character unknowns
                element_type = element_type.makeVariant(element)
        res = cls.ElementClass(element_type)
        return res

    def _makeSeqElement(self, element):
        """
        Helper function to call self.makeSeqElement and set Residue.sequence.
        If the sequence has a structure, then newly generated elements will have
        their `seqres_only` attribute default to True.
        """
        res = self.makeSeqElement(element)
        res.sequence = self
        if isinstance(element, str) and self.hasStructure():
            res.seqres_only = True
        return res

    def _makeSeqElements(self, elements, resnums=()):
        """
        :param elements: iterable of sequence elements
        :type elements: iterable(self.ElementClass or None) or
            iterable(str or None)

        :param resnums: Residue numbers to assign to `elements`.  These numbers
            will be ignored if `elements` is an iterable of `ElementClass` and
            any element already has a residue number set. If this iterable is
            shorter than `elements`, additional numbers will be generated by
            incrementing the last number present.
        :type resnums: Iterable(int)

        :return: list(self.ElementClass)
        :rtype: list

        .. note::

            The performance of this method is critical for quickly loading
            sequences into the MSV.  As such, it has been written to optimize
            performance over code reuse.  (In other words, there's a good reason
            that this method doesn't just call `_makeSeqElement`.)
        """
        if not elements:
            return []
        if isinstance(elements, str):
            # remove all whitespace and other disallowed characters
            elements = DISALLOWED_SEQ_CHARS.sub("", elements)
            elements = elements.upper()
            return self._makeSeqElementsFromStr(elements, resnums)

        try:
            elements = list(elements)
        except TypeError:
            # TODO: Remove this.  See MSV-2794.
            elements = [elements]
        # Figure out the type of the contents of elements.  Since Nones are
        # allowed for gaps, we need to find the first non-None element
        elem = None
        for elem in elements:
            if elem is not None:
                break
        if elem is None:
            # elements is all Nones
            return [residue.Gap() for _ in elements]
        elif isinstance(elem, str):
            # convert elements to uppercase and remove spaces
            elements = [
                None if e is None else e.upper().strip() for e in elements
            ]
            return self._makeSeqElementsFromStr(elements, resnums)
        elif isinstance(elem, residue.AbstractSequenceElement):
            return self._makeSeqElementsFromElements(elements, resnums)
        elif isinstance(elem, residue.CombinedChainResidueWrapper):
            # convert combined-chain elements to split-chain elements
            elements = [None if e is None else e.split_res for e in elements]
            return self._makeSeqElementsFromElements(elements, resnums)
        else:
            raise TypeError("Unrecognized type for element:", type(elem))

    def _makeSeqElementsFromStr(self, elements, resnums=()):
        """
        :param elements: iterable of sequence elements
        :type elements: iterable(str or None)

        :param resnums: Residue numbers to assign to `elements`.  If this
            iterable is shorter than `elements`, additional numbers will be
            generated by incrementing the last number present.
        :type resnums: Iterable(int)

        :return: list(self.ElementClass)
        :rtype: list
        """
        seq_elements = [None] * len(elements)
        has_struc = self.hasStructure()
        resnum = 1 if resnums else None
        for i, (elem, input_resnum) in enumerate(
                itertools.zip_longest(elements, resnums)):
            if input_resnum is not None:
                resnum = input_resnum
            if elem is None or elem in self._gap_chars:
                new_seq_elem = residue.Gap()
            else:
                elem_type = self.ALPHABET.get(elem, self._unknown_res_type)
                # Do not use keyword arguments here.  Function calls with
                # only positional arguments are faster.
                new_seq_elem = self.ElementClass(elem_type, None, resnum,
                                                 has_struc)
                if resnums:
                    resnum += 1

            new_seq_elem.sequence = self
            seq_elements[i] = new_seq_elem
        return seq_elements

    def _makeSeqElementsFromElements(self, elements, resnums):
        """
        :param elements: iterable of sequence elements
        :type elements: iterable(self.ElementClass or None)

        :param resnums: Residue numbers to assign to `elements`.  These numbers
            will be ignored if a residue number is set for any element in
            `elements.  If this iterable is shorter than `elements`, additional
            numbers will be generated by incrementing the last number present.
        :type resnums: Iterable(int)

        :return: list(self.ElementClass)
        :rtype: list
        """
        if resnums and any(elem.hasSetResNum()
                           for elem in elements
                           if elem is not None and not elem.is_gap):
            # ignore resnums if any of the input elements have residue numbers
            # set already
            resnums = ()
        resnum = 1 if resnums else None
        for i, (elem, input_resnum) in enumerate(
                itertools.zip_longest(elements, resnums)):
            if elem is None:
                elem = residue.Gap()
                elements[i] = elem
            elem.sequence = self
            if resnums and not elem.is_gap:
                if input_resnum is not None:
                    resnum = input_resnum
                elem.resnum = resnum
                resnum += 1
        return elements

[docs]    @classmethod
    def isValid(cls, elements):
        """
        :param elements: An iterable of string representations of elements
            making up the sequence
        :type elements: iterable(str) or str

        :return: Tuple indicating whether valid and a set of invalid characters,
            if any
        :rtype: tuple(bool, set(str))
        """
        raise NotImplementedError

    # ==========================================================================
    # Sequence-Level Properties
    # ==========================================================================

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, new_name):
        """
        Set the name on the instance and emit a notification

        :param name: The new name for the sequence
        :type name: str

        :raises: ValueError if name is not str
        """
        if not isinstance(new_name, str):
            raise ValueError(f"name must be str, not {type(new_name)}")
        # Strip out non-ASCII characters
        new_name = new_name.encode('ascii', 'ignore').decode('ascii')
        self._name = new_name
        self.nameChanged.emit()

    @property
    def chain(self):
        return self._chain

    @chain.setter
    def chain(self, name):
        self._chain = name
        self.nameChanged.emit()

    @property
    def fullname(self):
        """
        :return: a formatted name + optional chain name for the sequence
        :rtype: str
        """
        if self.chain:
            return "%s_%s" % (self._name, self.chain)
        return self._name

    @property
    def visibility(self):
        return self._visibility

    @visibility.setter
    def visibility(self, value):
        if self._visibility != value:
            self._visibility = value
            self.visibilityChanged.emit()

    # ==========================================================================
    # Residues
    # ==========================================================================

[docs]    def index(self, res, ignore_gaps=False):
        """
        Returns the index of the specified residue

        :param res: The residue to find
        :type res: residue.Residue

        :param ignore_gaps: Whether the index returned should ignore gaps in
                            the sequence or not.
        :type ignore_gaps: bool

        :raises: A ValueError if the residue is not present

        :rtype: int
        :return: The index of the residue
        """
        if res not in self._res_positions:
            err_msg = f"Specified residue {res} not found in sequence {self}"
            raise ValueError(err_msg)
        idx = 1 if ignore_gaps else 0
        return self._res_positions[res][idx]

[docs]    def indices(self, residues, ignore_gaps=False):
        """
        Returns the indices of all specified residues.  Note that there is no
        guarantee that the returned integers will be in the same order as the
        input residues.  (For combined-chain sequences, it's highly likely that
        they *won't* be.)

        :param res: The residues to find indices of
        :type res: Iterable(residue.Residue)

        :param ignore_gaps: Whether the indices returned should ignore gaps in
            the sequence or not.
        :type ignore_gaps: bool

        :return: The indices of the residues
        :rtype: list[int]
        """
        if self._res_positions_cache is None:
            self._updateResiduePositionCache()
        idx = 1 if ignore_gaps else 0
        return [self._res_positions_cache[res][idx] for res in residues]

[docs]    def insertElements(self, index, elements):
        # See parent class for method documentation
        try:
            elements = list(elements)
        except TypeError:
            # TODO: Remove this.  See MSV-2794.
            elements = [elements]
        assign_numbers = all(isinstance(c, str) for c in elements)
        elements = self._makeSeqElements(elements)
        if assign_numbers:
            next_res = None
            prev_res = None
            true_index = min(index, max(0, len(self) - 1))

            prev_indices = range(true_index - 1, -1, -1)
            for position in prev_indices:
                elem = self[position]
                if elem.is_res and elem.hasSetResNum():
                    prev_res = elem
                    break

            next_indices = range(true_index, len(self))
            for position in next_indices:
                elem = self[position]
                if elem.is_res and elem.hasSetResNum():
                    next_res = elem
                    break

            assign_residue_numbers(elements,
                                   start_res=prev_res,
                                   end_res=next_res)

        old_length = len(self._sequence)
        new_length = old_length + len(elements)
        self.lengthAboutToChange.emit(old_length, new_length)
        self._sequence[index:index] = elements
        self._addNewResiduesToMap(elements)
        self.lengthChanged.emit(old_length, new_length)
        self.residuesAdded.emit(set(elements))
        self.residuesChanged.emit(index, old_length)

[docs]    def mutate(self, start, end, elements):
        # See parent class for method documentation
        elements = self._makeSeqElements(elements)

        old_length = len(self._sequence)

        length_change = len(elements) - (end - start)

        if length_change != 0:
            new_length = old_length + length_change
            self.lengthAboutToChange.emit(old_length, new_length)

        removed = self._sequence[start:end]
        for ele in removed:
            if ele.is_res:
                if ele.disulfide_bond is not None:
                    residue.remove_disulfide_bond(ele.disulfide_bond)
                if ele.pred_disulfide_bond is not None:
                    residue.remove_disulfide_bond(ele.pred_disulfide_bond)
            ele.sequence = None
        self._sequence[start:end] = elements
        self._removeResiduesFromMap(removed)
        self._addNewResiduesToMap(elements)

        if length_change != 0:
            self.lengthChanged.emit(old_length, new_length)
            last_changed_res_index = min(old_length, new_length)
            self.residuesChanged.emit(start, last_changed_res_index - 1)
        else:
            self.residuesChanged.emit(start, end - 1)
        self.residuesRemoved.emit(set(removed))
        self.residuesAdded.emit(set(elements))

[docs]    def append(self, element):
        # See parent class for method documentation
        element = self._makeSeqElement(element)
        self.extend([element])

[docs]    def extend(self, elements):
        # See parent class for method documentation
        old_length = len(self._sequence)
        new_length = old_length + len(elements)
        self.lengthAboutToChange.emit(old_length, new_length)
        elements = self._makeSeqElements(elements)
        self._sequence.extend(elements)
        self.lengthChanged.emit(old_length, new_length)
        self.residuesChanged.emit(old_length, new_length)
        self.residuesAdded.emit(set(elements))

[docs]    def getSubsequence(self, start, end):
        # See parent class for method documentation
        SequenceClass = self.__class__
        return SequenceClass(copy.copy(res) for res in self[start:end])

[docs]    def removeElements(self, eles):
        # See parent class for method documentation
        if len(eles) == 0:
            return
        elif any(ele not in self for ele in eles):
            err_msg = ('Elements must be contained by sequence in order to be '
                       'removed')
            raise ValueError(err_msg)
        start_idx_removed = min(self.index(ele) for ele in eles)

        new_length = len(self) - len(eles)
        old_length = len(self)
        self.lengthAboutToChange.emit(old_length, new_length)
        self._removeResiduesFromMap(eles)
        self._sequence = [ele for ele in self._sequence if ele not in eles]
        for ele in eles:
            if ele.is_res:
                if ele.disulfide_bond is not None:
                    residue.remove_disulfide_bond(ele.disulfide_bond)
                if ele.pred_disulfide_bond is not None:
                    residue.remove_disulfide_bond(ele.pred_disulfide_bond)
            ele.sequence = None
        self.lengthChanged.emit(old_length, new_length)
        self.residuesChanged.emit(start_idx_removed, new_length - 1)
        self.residuesRemoved.emit(set(eles))

    def _replaceAllElements(self, elements):
        """
        Replace _sequence entirely with the supplied elements

        :param elements: Elements for the sequence
        :type elements: iterable(self.ElementClass) or iterable(str)
        """
        new_length = len(elements)
        old_length = len(self)
        announce_length_change = new_length != old_length
        if announce_length_change:
            self.lengthAboutToChange.emit(old_length, new_length)
        elements = self._makeSeqElements(elements)
        self._sequence = elements
        if announce_length_change:
            self.lengthChanged.emit(old_length, new_length)
        self.residuesChanged.emit(0, old_length)

    def _removeFromSequence(self, keep_func, start=0, end=None):
        """
        Remove any residues not matching keep_func from the sequence

        :param keep_func: A callable taking a residue and returning a bool
            indicating whether to keep it in the sequence
        :type keep_func: callable

        :param start: The index at which to start filtering
        :type start: int

        :param end: The index at which to end filtering
        :type end: int

        :raises ValueError: In the event that invalid indices are specified
        """
        old_length = len(self)
        end = end if end is not None else old_length
        if end > old_length or start < 0:
            msg = ("Invalid indices specified (start=%d, end=%d) for a sequence"
                   "of length %d" % (start, end, old_length))
            raise ValueError(msg)
        new_sequence = self._sequence[0:start]
        for res in self._sequence[start:end]:
            if keep_func(res):
                new_sequence.append(res)
            else:
                res.sequence = None
        new_sequence.extend(self._sequence[end:])
        self._replaceAllElements(new_sequence)

    def _updateResiduePositionCache(self):
        """
        Update the cache of residues to their index within the sequence.

        The residue-index cache is stored as a mapping between residues
        and two indexes: 1) the index of the residue in the sequence including
        gaps and 2) the index of the residue in the sequence not including
        gaps. Gaps store `None` for their gapless indexes. Put more concisely,
        the residue-index cache maps:

        residue.AbstractSequenceElement -> tuple(gapped_index, gapless_index)

        """
        self._res_positions_cache = {}
        gapless_idx = 0
        for gapped_idx, res in enumerate(self):
            if res.is_res:
                self._res_positions_cache[res] = (gapped_idx, gapless_idx)
                gapless_idx += 1
            else:
                self._res_positions_cache[res] = (gapped_idx, None)

        self._gapless_length = gapless_idx

    @QtCore.pyqtSlot()
    def _invalidateResiduePositionCache(self):
        """
        Invalidate the cache of residues to their index within the sequence

        The connected signal emits no arguments and the slot decorator must
        match the signal, but the method does not need to have the params.

        Note that we do not actually calculate the indices until they are
        needed; this happens in _updateResiduePositionCache.
        """
        self._res_positions_cache = None
        self._gapless_length = None

    # ==========================================================================
    # Gaps
    # ==========================================================================

[docs]    def getGaplessLength(self):
        # See parent class for method documentation
        if self._gapless_length is None:
            self._updateResiduePositionCache()
        return self._gapless_length

[docs]    def addGapsByIndices(self, gap_idxs):
        # See parent class for method documentation
        if not gap_idxs:
            return
        self.validateGapIndices(gap_idxs)
        gap_idxs = set(gap_idxs)
        old_length = len(self)
        new_length = len(self) + len(gap_idxs)
        self.lengthAboutToChange.emit(old_length, new_length)
        _sequence = iter(self._sequence)

        new_sequence = []
        for i in range(new_length):
            next_element = None
            if i not in gap_idxs:
                next_element = next(_sequence, None)
            if next_element is None:
                next_element = residue.Gap()
                next_element.sequence = self
            new_sequence.append(next_element)

        assert len(new_sequence) == new_length, "Sanity check"

        self._sequence = new_sequence

        self.lengthChanged.emit(old_length, new_length)
        self.residuesChanged.emit(min(gap_idxs), old_length - 1)

    # ==========================================================================
    # Predictions
    # ==========================================================================
[docs]    def setSSA(self, new_ssa):
        residues = list(self.residues())
        if len(new_ssa) != len(residues):
            raise ValueError('New ssa must be a list of equal length '
                             'to the number of residues')
        for res, ssa in zip(residues, new_ssa):
            res.secondary_structure = ssa
        self._invalidateSecondaryStructure()
        self.secondaryStructureChanged.emit()

[docs]    def setSSAPredictions(self, pred):
        if len(pred) != len(self):
            raise ValueError('Prediction must be a list of equal length '
                             'to the sequence')
        for res, ssa in zip(self, pred):
            res.pred_secondary_structure = ssa
        self._invalidateSecondaryStructure()
        self._invalidatePredictionFlags()
        self.predictionsChanged.emit()

[docs]    def setDisorderedRegionsPredictions(self, pred):
        if len(pred) != len(self):
            raise ValueError('Prediction must be a list of equal length '
                             'to the sequence')
        for res, dis in zip(self, pred):
            res.pred_disordered = dis
        self._invalidatePredictionFlags()
        self.predictionsChanged.emit()

[docs]    def setDomainArrangementPredictions(self, pred):
        if len(pred) != len(self):
            raise ValueError('Prediction must be a list of equal length '
                             'to the sequence')
        for res, dom in zip(self, pred):
            res.pred_domain_arr = dom
        self._invalidatePredictionFlags()
        self.predictionsChanged.emit()

[docs]    def setSolventAccessibilityPredictions(self, pred):
        if len(pred) != len(self):
            raise ValueError('Prediction must be a list of equal length '
                             'to the sequence')
        for res, acc in zip(self, pred):
            res.pred_accessibility = acc
        self._invalidatePredictionFlags()
        self.predictionsChanged.emit()

[docs]    def deletePrediction(self, prediction_type):
        if prediction_type is SEQ_ANNO_TYPES.pred_disulfide_bonds:
            for res in self:
                bond = res.pred_disulfide_bond
                if bond is not None:
                    residue.remove_disulfide_bond(bond)
            self.predictionsChanged.emit()
            return
        empty_pred = [None] * len(self)
        if prediction_type is SEQ_ANNO_TYPES.pred_secondary_structure:
            self.setSSAPredictions(empty_pred)
        elif prediction_type is SEQ_ANNO_TYPES.pred_accessibility:
            self.setSolventAccessibilityPredictions(empty_pred)
        elif prediction_type is SEQ_ANNO_TYPES.pred_disordered:
            self.setDisorderedRegionsPredictions(empty_pred)
        elif prediction_type is SEQ_ANNO_TYPES.pred_domain_arr:
            self.setDomainArrangementPredictions(empty_pred)
        else:
            raise ValueError(f"{prediction_type} is not a prediction")

[docs]    def deleteAllPredictions(self):
        all_predictions = [
            SEQ_ANNO_TYPES.pred_disulfide_bonds,
            SEQ_ANNO_TYPES.pred_secondary_structure,
            SEQ_ANNO_TYPES.pred_accessibility, SEQ_ANNO_TYPES.pred_disordered,
            SEQ_ANNO_TYPES.pred_domain_arr
        ]
        for prediction in all_predictions:
            self.deletePrediction(prediction)

    # ==========================================================================
    # Structure
    # ==========================================================================

    @property
    def origin(self):
        """
        :return: A piece of metadata indicating where the sequence came from
        :rtype origin: `Sequence.ORIGIN` or None
        """
        return self._origin

    @origin.setter
    def origin(self, new_origin):
        """
        :param new_origin: A piece of metadata indicating where the sequence came
                       from
        :type new_origin: `Sequence.ORIGIN` or None
        """
        if new_origin is None or new_origin in self.ORIGIN:
            self._origin = new_origin
        else:
            origins = "".join(self.ORIGIN.names())
            msg = "Sequence origin be one of the following values: %s" % origins
            raise ValueError(msg)

[docs]    def hasStructure(self):
        # See AbstractSequence for documentation
        return self._get_structure is not None

[docs]    def getStructure(self):
        # See AbstractSequence for documentation
        if self._get_structure is not None:
            return self._get_structure()
        else:
            return None

[docs]    def setStructure(self, struc):
        # See AbstractSequence for documentation
        if self._set_structure is None:
            raise RuntimeError("No structure associated with this sequence.")
        else:
            self._set_structure(struc)

[docs]    def onStructureChanged(self):
        self.annotations.onStructureChanged()
        self.structureChanged.emit()
        # This map is lazily regenerated when calling `getStructureResForRes`
        self._structure_res_map = None

[docs]    def setResidueMap(self, residue_map):
        """
        Set a new mapping between ResidueKey and structured residues.

        Note: the only intended user of this method is
        `schrodinger.application.msv.seqio.StructureConverter`, where the
        ResidueKey is computed from the `structure._Residue` used to create the
        `residue.Residue`. If the sequence has a structure, the map can be
        generated using `generateResidueMap`.

        :param residue_map: Mapping between residue key and Residue
        :type residue_map: dict(residue.ResidueKey, residue.Residue)
        """
        self._residue_map = residue_map

[docs]    def generateResidueMap(self):
        """
        Create residue map based on current structured residues.

        Note: this method requires `self.hasStructure()` to be True and
        `self.entry_id` to be set. If this sequence was produced by
        `schrodinger.application.msv.seqio.StructureConverter`, there should
        already be a residue map and this method does not need to be called.

        :raise RuntimeError: If sequence has no structure or entry id
        """
        if not self.hasStructure() or self.entry_id is None:
            raise RuntimeError("No structure associated with this sequence.")
        residue_map = {res.getKey(): res for res in self if res.hasStructure()}
        self.setResidueMap(residue_map)

    def _addNewResiduesToMap(self, elements):
        """
        Private method to add new residues to `self._residue_map`.

        Residues must already belong to the sequence and the sequence must have
        `entry_id` and structure set.

        :param elements: Sequence elements being added (gaps will be ignored)
        :type elements: list[residue.Residue]
        """
        for ele in elements:
            if ele.hasStructure():
                res_key = ele.getKey()
                self._residue_map[res_key] = ele

    def _removeResiduesFromMap(self, elements):
        """
        Private method to remove residues from `self._residue_map`.

        Residues must still belong to the sequence and the sequence must have
        `entry_id` and structure set.

        :param elements: Sequence elements being removed (gaps will be ignored)
        :type elements: list[residue.Residue]
        """
        for ele in elements:
            if ele.hasStructure():
                res_key = ele.getKey()
                self._residue_map.pop(res_key, None)

    def _getStructureResMap(self):
        """
        :return: Mapping between residue key and structure residue
        :rtype: dict(residue.ResidueKey, schrodinger.structure._Residue)

        :raise RuntimeError: If sequence has no structure
        """
        if not self.hasStructure():
            raise RuntimeError("No structure associated with this sequence.")
        if self._structure_res_map is None:
            structure_res_map = {}
            get_sres_key = partial(residue.get_structure_residue_key,
                                   entry_id=self.entry_id)
            st = self.getStructure()
            try:
                chain = st.chain[self.structure_chain]
            except KeyError:
                pass
            else:
                for sres in chain.residue:
                    sres_key = get_sres_key(sres)
                    structure_res_map[sres_key] = sres
            self._structure_res_map = structure_res_map
        return self._structure_res_map

    def _getStructureResByKey(self, res_key):
        """
        :param res_key: Residue key: (entry_id, chain, resnum, inscode)
        :type res_key: residue.ResidueKey

        :return: Structure residue or None if no matching residue is found
        :rtype: schrodinger.structure._Residue or NoneType

        :raise RuntimeError: If sequence has no structure
        """
        sres_map = self._getStructureResMap()
        return sres_map.get(res_key)

[docs]    def getResByKey(self, res_key):
        """
        :param res_key: Residue key: (entry_id, chain, resnum, inscode)
        :type res_key: residue.ResidueKey

        :return: Residue matching key or None if no matching residue is found
        :rtype: residue.Residue or NoneType

        :raise RuntimeError: If sequence has no structure
        """
        if not self.hasStructure():
            raise RuntimeError("No structure associated with this sequence.")
        return self._residue_map.get(res_key)

[docs]    def getStructureResForRes(self, res):
        # See parent class for method documentation
        if not self.hasStructure():
            return None
        res_key = res.getKey()
        sres = self._getStructureResByKey(res_key)
        return sres


[docs]class ProteinSequenceMeta(type(QtCore.QObject)):
    """
    Metaclass for split-chain and combined-chain protein sequences
    """

    @property
    def ALPHABET(self):
        # This metaclass property is needed because 1. ALPHABET must be
        # available from both the class and the instance 2. this call might
        # need to read the nonstandard residues database from disk so we don't
        # want to load it at import time.
        return residue.get_protein_alphabet()


[docs]class AbstractProteinSequenceMixin:
    """
    A mixin for code shared between split-chain and combined-chain protein
    sequences.
    """

    disulfideBondsCacheCleared = QtCore.pyqtSignal()
    predictionsChanged = QtCore.pyqtSignal()
    secondaryStructureChanged = QtCore.pyqtSignal()
    kinaseFeaturesChanged = QtCore.pyqtSignal()
    kinaseConservationChanged = QtCore.pyqtSignal()

    @property
    def ALPHABET(self):
        # This instance property is needed because metaclasses are excluded
        # from attribute lookup
        return type(self).ALPHABET

    # ==========================================================================
    # Magic Methods
    # ==========================================================================

[docs]    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._secondary_structures = None
        self._pred_secondary_structures = None
        self._disulfide_bonds = None
        self._pred_disulfide_bonds = None
        self.pfam_name = None
        self._invalidatePredictionFlags()

    # ==========================================================================
    # Annotations
    # ==========================================================================

    @property
    def disulfide_bonds(self):
        """
        :return: A sorted tuple of the valid disulfide bonds.
        :rtype: tuple(residue.DisulfideBond)
        """
        raise NotImplementedError

    @property
    def secondary_structures(self):
        """
        A list of _SecondaryStructure namedtuples containing the type of
        secondary structure and where the secondary structures begin and end.

        :return: A list of namedtuples containing an SS_TYPE from
            schrodinger.structure and the residue indexes marking the limits
            of the secondary structure.
        :rtype: list(_SecondaryStructure)
        """
        if self._secondary_structures is None:
            self._updateSecondaryStructure()
        return self._secondary_structures

    def _updateSecondaryStructure(self):
        """
        Calculates and caches tuples describing the sequence secondary
        structures. The tuples contain the SSA type and their limits (ie
        where the structure begins and ends)
        """
        raise NotImplementedError

[docs]    def clearDisulfideBondsCache(self):
        self._disulfide_bonds = None
        self._pred_disulfide_bonds = None
        self.disulfideBondsCacheCleared.emit()

[docs]    def isKinaseChain(self) -> bool:
        raise NotImplementedError

    # ==========================================================================
    # Predictions
    # ==========================================================================
    @QtCore.pyqtSlot()
    def _invalidatePredictionFlags(self):
        self._has_ssa_prediction = None
        self._has_dis_predictions = None
        self._has_dom_prediction = None
        self._has_acc_predictions = None

[docs]    def hasDisorderedRegionsPredictions(self):
        if self._has_dis_predictions is None:
            self._has_dis_predictions = any(
                r.pred_disordered is not None for r in self.residues())
        return self._has_dis_predictions

[docs]    def hasDisulfideBondPredictions(self):
        return bool(self.pred_disulfide_bonds)

[docs]    def hasDomainArrangementPredictions(self):
        if self._has_dom_prediction is None:
            self._has_dom_prediction = any(
                r.pred_domain_arr is not None for r in self.residues())
        return self._has_dom_prediction

[docs]    def hasSolventAccessibility(self):
        if self._has_acc_predictions is None:
            self._has_acc_predictions = any(
                r.pred_accessibility is not None for r in self.residues())
        return self._has_acc_predictions

[docs]    def hasSSAPredictions(self):
        if self._has_ssa_prediction is None:
            secondary_strucs = self.pred_secondary_structures
            if (len(secondary_strucs) == 1 and
                    secondary_strucs[0].ssa_type is None):
                self._has_ssa_prediction = False
            else:
                self._has_ssa_prediction = True
        return self._has_ssa_prediction

    @property
    def pred_secondary_structures(self):
        if self._pred_secondary_structures is None:
            self._updatePredSecondaryStructure()
        return self._pred_secondary_structures


[docs]class ProteinSequence(json.JsonableClassMixin,
                      AbstractProteinSequenceMixin,
                      AbstractSingleChainSequence,
                      metaclass=ProteinSequenceMeta):
    """
    A single-chain protein sequence.

    :cvar secondaryStructureCacheCleared: A signal emitted when the secondary
        structure cache has been cleared.  Used to keep the
        `CombinedChainProteinSequence` cache in sync.  If listening for changes
        in the secondary structure values, use `secondaryStructureChanged`
        instead.
    """

    AnnotationClass = annotation.ProteinSequenceAnnotations
    ElementClass = residue.Residue
    _gap_chars = constants.GAP_CHARS
    secondaryStructureCacheCleared = QtCore.pyqtSignal()

    # ==========================================================================
    # Magic Methods
    # ==========================================================================

[docs]    def __init__(self,
                 elements="",
                 name="",
                 origin=None,
                 entry_id=None,
                 entry_name="",
                 pdb_id="",
                 chain="",
                 structure_chain=None,
                 long_name="",
                 resnums=(),
                 disulfide_bonds=None,
                 pred_disulfide_bonds=None):
        """
        See `AbstractSingleChainSequence` for additional documentation.

        :param disulfide_bonds: A list of pairs of residue indices to link via
            disulfide bonds.
        :type disulfide_bonds: Iterable(tuple(int, int))

        :param pred_disulfide_bonds: A list of pairs of residue indices to link
            via predicted disulfide bonds.
        :type pred_disulfide_bonds: Iterable(tuple(int, int))
        """
        super().__init__(elements,
                         name=name,
                         origin=origin,
                         entry_id=entry_id,
                         entry_name=entry_name,
                         pdb_id=pdb_id,
                         chain=chain,
                         structure_chain=structure_chain,
                         long_name=long_name,
                         resnums=resnums)

        self._is_kinase_cons_annotated = False
        self._is_kinase_annotated = False

        self.residuesChanged.connect(self.kinaseFeaturesChanged)

        self.residuesChanged.connect(self._invalidateResiduePositionCache)
        self.lengthChanged.connect(self._invalidateResiduePositionCache)

        # Set up protein secondary structure slots
        self.residuesChanged.connect(self._invalidateSecondaryStructure)
        self.lengthChanged.connect(self._invalidateSecondaryStructure)

        self.residuesChanged.connect(self._invalidatePredictionFlags)
        self.lengthChanged.connect(self._invalidatePredictionFlags)

        self._pattern_seq_dict = None
        self.residuesChanged.connect(self._invalidatePatternSeqDict)

        if disulfide_bonds is not None:
            for i1, i2 in disulfide_bonds:
                residue.add_disulfide_bond(self[i1], self[i2])
        if pred_disulfide_bonds is not None:
            for i1, i2 in pred_disulfide_bonds:
                residue.add_disulfide_bond(self[i1], self[i2], known=False)

    def _getReprArguments(self):
        # See AbstractSingleChainSequence for method documentation
        arguments = super()._getReprArguments()
        if self.disulfide_bonds:
            arg = ("disulfide_bonds=" +
                   self._bondIndicesForRepr(self.disulfide_bonds))
            arguments.append(arg)
        if self.pred_disulfide_bonds:
            arg = ("pred_disulfide_bonds=" +
                   self._bondIndicesForRepr(self.pred_disulfide_bonds))
            arguments.append(arg)
        return arguments

    def _bondIndicesForRepr(self, disulfide_bonds):
        """
        Generate a string describing the residue indices involved in the given
        disulfide bonds.

        :param disulfide_bonds: The bond to describe.
        :type disulfide_bonds: list(residue.DisulfideBond)

        :return: A string describing the disulfide bonds.
        :rtype: str
        """
        intra_seq_bonds = [
            bond for bond in disulfide_bonds
            if bond.isValid() and bond.is_intra_sequence
        ]
        indices = [(res1.idx_in_seq, res2.idx_in_seq)
                   for (res1, res2) in intra_seq_bonds]
        indices.sort()
        return "[%s]" % ", ".join(f"({i1}, {i2})" for (i1, i2) in indices)

    def _resFormatCode(self, res):
        if res.is_gap:
            return super()._resFormatCode(res)
        return "'%s'" % res.long_code

    # ==========================================================================
    # Serialization
    # ==========================================================================

[docs]    def toJsonImplementation(self):
        intraseq_bonds = (
            bond for bond in self.disulfide_bonds if bond.is_intra_sequence)
        disulfide_bonds = [
            [res1.idx_in_seq, res2.idx_in_seq] for res1, res2 in intraseq_bonds
        ]
        intraseq_pred_bonds = (bond for bond in self.pred_disulfide_bonds
                               if bond.is_intra_sequence)
        pred_disulfide_bonds = [[res1.idx_in_seq, res2.idx_in_seq]
                                for res1, res2 in intraseq_pred_bonds]

        return {
            'residues': self._sequence,
            'name': self.name,
            'chain': self.chain,
            'structure_chain': self.structure_chain,
            'disulfide_bonds': disulfide_bonds,
            'pred_disulfide_bonds': pred_disulfide_bonds,
            'long_name': self.long_name,
            'entry_id': self.entry_id,
            'descriptors': {
                source.name: descriptor
                for source, descriptor in self.descriptors.items()
            },
        }

[docs]    @classmethod
    def fromJsonImplementation(cls, json_obj):
        residues = []
        residues_str = []
        is_kinase_cons_annotated = False
        is_kinase_annotated = False
        for ele in json_obj['residues']:
            try:
                elem = residue.Gap.fromJson(ele)
            except ValueError:
                elem = residue.Residue.fromJson(ele)
                if elem._kinase_conservation:
                    is_kinase_cons_annotated = True
                if elem.kinase_features is not None:
                    is_kinase_annotated = True
            residues.append(elem)
            residues_str.append(str(elem))
        SeqType = guess_seq_type(residues_str)
        rehydrated_seq = SeqType(residues,
                                 entry_id=json_obj['entry_id'],
                                 structure_chain=json_obj['structure_chain'])
        rehydrated_seq._is_kinase_cons_annotated = is_kinase_cons_annotated
        rehydrated_seq._is_kinase_annotated = is_kinase_annotated
        for res1_idx, res2_idx in json_obj['disulfide_bonds']:
            residue.add_disulfide_bond(rehydrated_seq[res1_idx],
                                       rehydrated_seq[res2_idx])
        for res1_idx, res2_idx in json_obj['pred_disulfide_bonds']:
            residue.add_disulfide_bond(rehydrated_seq[res1_idx],
                                       rehydrated_seq[res2_idx],
                                       known=False)
        rehydrated_seq.name = json_obj['name']
        rehydrated_seq.long_name = json_obj['long_name']
        rehydrated_seq.chain = json_obj['chain']
        descriptors_from_json = json_obj.get('descriptors')

        if descriptors_from_json is not None:
            for source, descriptors in descriptors_from_json.items():
                src = properties.PropertySource[source]
                rehydrated_seq.updateDescriptors(descriptors, src)
        return rehydrated_seq

[docs]    @json.adapter(version=47007)
    def adapter47007(cls, json_dict):
        plain_seq = cls()
        json_dict['entry_id'] = plain_seq.entry_id
        json_dict['structure_chain'] = plain_seq.structure_chain
        return json_dict

[docs]    @json.adapter(version=48003)
    def adapter48002(cls, json_dict):
        json_dict['long_name'] = json_dict.pop('title')
        return json_dict

[docs]    @json.adapter(version=52065)
    def adapter52065(cls, json_dict):
        json_dict['kinase_features'] = None
        return json_dict

    # ==========================================================================
    # Construction
    # ==========================================================================

[docs]    @classmethod
    def isValid(cls, elements):
        """
        :param elements: An iterable of string representations of elements
            making up the sequence
        :type elements: iterable(str) or str

        :return: Tuple indicating whether valid and a set of invalid characters,
            if any
        :rtype: tuple(bool, set(str))
        """
        valid_chars = set(cls.ALPHABET) | set(cls._gap_chars)
        invalid_chars = set(elements) - valid_chars
        return not invalid_chars, invalid_chars

    # ==========================================================================
    # Annotations
    # ==========================================================================

[docs]    def renumberResidues(self, new_rescode_map):
        """
        Renumber residues in this sequence given a dictionary mapping old
        rescodes to the new desired rescodes.

        :type new_rescode_map: dict[tuple(int, str), tuple(int, str)]
        """
        min_res_idx = None
        max_res_idx = None
        for idx, res in enumerate(self):
            if res.is_res and res.getChainKey() in new_rescode_map:
                res.resnum, res.inscode = new_rescode_map[res.getChainKey()]
                if min_res_idx is None:
                    min_res_idx = idx
                max_res_idx = idx
        if min_res_idx is not None:
            self.residuesChanged.emit(min_res_idx, max_res_idx)

[docs]    def isKinaseChain(self):
        return any(
            res.is_res and (res.kinase_features or res.kinase_conservation)
            for res in self)

    @property
    def is_kinase_annotated(self):
        return self._is_kinase_annotated

[docs]    def setKinaseFeatures(self,
                          feature_map: Dict[residue.Residue,
                                            annotation.KinaseFeatureLabel]):
        """
        :param feature_map: Map of residue to kinase feature label
        """
        for res, feature in feature_map.items():
            if res not in self:
                raise ValueError(
                    'All residues in feature_map must belong to this sequence')
            res.kinase_features = feature
        self.kinaseFeaturesChanged.emit()
        self._is_kinase_annotated = True

    @property
    def is_kinase_cons_annotated(self):
        return self._is_kinase_cons_annotated

[docs]    def setKinaseConservation(self, cons_map, lig_asl):
        """
        :param cons_map: Map of residue to conservation
        :type cons_map: dict[residue.Residue, annotation.KinaseConservation]

        :param lig_asl: ASL of the associated ligand
        :type lig_asl: str
        """
        try:
            lig_index = self.annotations.ligand_asls.index(lig_asl)
        except ValueError:
            return
        for res, cons in cons_map.items():
            res.kinase_conservation[lig_index] = cons
        self.kinaseConservationChanged.emit()
        self._is_kinase_cons_annotated = True

    @property
    def disulfide_bonds(self):
        # See AbstractProteinSequenceMixin for documentation
        if self._disulfide_bonds is None:
            bonds = set(res.disulfide_bond for res in self if res.is_res and
                        res.disulfide_bond and res.disulfide_bond.isValid())
            key = lambda bond: min(
                self.index(res) for res in bond if res in self)
            self._disulfide_bonds = tuple(sorted(bonds, key=key))
        return self._disulfide_bonds

    @QtCore.pyqtSlot()
    def _invalidateSecondaryStructure(self):
        """
        Invalidates the stored indices of start and end positions of secondary
        structure assignment blocks in the sequence.

        Note that we do not actually calculate the indices until they are
        needed; this happens in _updateSecondaryStructure.
        """
        self._secondary_structures = None
        self._pred_secondary_structures = None
        self.secondaryStructureCacheCleared.emit()

    def _updateSecondaryStructure(self):
        # See AbstractProteinSequenceMixin for documentation
        self._secondary_structures = []
        if len(self) == 0:
            return
        ssa = self._calculateSecondaryStructuresList(
            ssa_attr='secondary_structure')
        self._secondary_structures = ssa

    def _calculateSecondaryStructuresList(self, ssa_attr):
        """
        Calculate and return a list of `_SecondaryStructure` tuples describing
        the types and ranges of secondary structures in the sequence.

        :param ssa_attr: The attribute name to access the secondary structure
            on a residue.
        :type  ssa_attr: str

        :rtype : list[_SecondaryStructure]
        """
        start_idxs = set()
        end_idxs = set()
        for prev_res, curr_res, next_res in self.iterNeighbors():
            # Note: iterNeighbors uses `None` to indicate that neighbor does
            # not exist
            curr_ssa = getattr(curr_res, ssa_attr)
            if (prev_res is None or getattr(prev_res, ssa_attr) != curr_ssa):
                start_idxs.add(self.index(curr_res))
            if (next_res is None or getattr(next_res, ssa_attr) != curr_ssa):
                end_idxs.add(self.index(curr_res))

        start_idxs = sorted(start_idxs)
        end_idxs = sorted(end_idxs)

        all_ss = []
        for start_idx, end_idx in zip(start_idxs, end_idxs):
            ss = _SecondaryStructure((start_idx, end_idx),
                                     getattr(self[start_idx], ssa_attr))
            all_ss.append(ss)
        return all_ss

    # ==========================================================================
    # Predictions
    # ==========================================================================

    @property
    def pred_disulfide_bonds(self):
        # See AbstractProteinSequenceMixin for documentation
        if self._pred_disulfide_bonds is None:
            bonds = list(
                set(res.pred_disulfide_bond
                    for res in self
                    if res.is_res and res.pred_disulfide_bond and
                    res.pred_disulfide_bond.isValid()))
            key = lambda bond: min(
                self.index(res) for res in bond if res in self)
            self._pred_disulfide_bonds = tuple(sorted(bonds, key=key))
        return self._pred_disulfide_bonds

    def _updatePredSecondaryStructure(self):
        # See AbstractProteinSequenceMixin for documentation
        self._pred_secondary_structures = []
        if len(self) == 0:
            return
        ssa = self._calculateSecondaryStructuresList(
            ssa_attr='pred_secondary_structure')
        self._pred_secondary_structures = ssa

    # ==========================================================================
    # Structure
    # ==========================================================================

[docs]    def removeStructurelessResidues(self, start=0, end=None):
        """
        Remove any structureless residues

        :param start: The index at which to start filtering structureless
            residues.
        :type start: int

        :param end: The index at which to end filtering
        :type end: int
        """

        def keep_func(res):
            return (res.hasStructure() or res.is_gap or
                    not res.sequence.hasStructure())

        self._removeFromSequence(keep_func, start, end)

    # ==========================================================================
    # Patterns
    # ==========================================================================

[docs]    def encodeForPatternSearch(self,
                               with_ss=False,
                               with_flex=False,
                               with_asa=False):
        """
        Convert to sequence dict expected by `find_generalized_pattern`.

        :param with_ss: Whether to include secondary structure information.
        :type with_ss: bool

        :param with_flex: Whether to include flexibility information.
        :type with_flex: bool

        :param with_asa: Whether to include accessible surface area
            information.
        :type with_asa: bool

        :rtype: dict
        :return: dictionary of sequence data
        """

        # TODO make cache aware of structure flags
        if self._pattern_seq_dict is not None:
            return self._pattern_seq_dict

        _aa_list = []
        if with_ss:
            _ss_list = []
        if with_asa:
            std_sasa_dict = calculate_sasa_dict()
            _asa_list = []

        seq_dict = {
            "res_ids": [],
        }

        def is_positive(number):
            return number is not None and number > 0

        if with_flex:
            _flex_list = []
            pos_b_factors = [
                res.b_factor
                for res in self
                if res.is_res and is_positive(res.b_factor)
            ]
            avg_b_factor = sum(pos_b_factors) / len(pos_b_factors) \
                if pos_b_factors else None

        for i, res in enumerate(self):
            if res.is_gap:
                continue

            # amino acid sequence
            short_code = (res.short_code if res.short_code
                          in AMINO_ACIDS_3_TO_1.values() else "X")
            _aa_list.append(short_code)

            # residue numbering
            seq_dict["res_ids"].append(res.rescode.strip())

            # secondary structure
            if with_ss:
                ss_code = ' '
                if res.secondary_structure is structure.SS_HELIX:
                    ss_code = 'h'
                elif res.secondary_structure is structure.SS_STRAND:
                    ss_code = 'e'
                _ss_list.append(ss_code)

            # flexibility
            if with_flex:
                flex_code = ' '
                # Consider flexible if b-factor is above average
                if (is_positive(res.b_factor) and is_positive(avg_b_factor) and
                        res.b_factor > avg_b_factor):
                    flex_code = 'f'
                _flex_list.append(flex_code)

            # solvent_accessibility
            if with_asa:
                asa_code = ' '
                # Consider exposed if ASA is greater than standard ASA
                std_sasa = std_sasa_dict.get(res.long_code)
                res_sasa = self.annotations.sasa[i]
                if (std_sasa is not None and res_sasa is not None and
                        res_sasa > std_sasa):
                    asa_code = 'e'
                _asa_list.append(asa_code)

        seq_dict["amino_acids"] = "".join(_aa_list)
        empty_seq_str = " " * len(seq_dict["res_ids"])
        seq_dict["secondary_structure"] = ("".join(_ss_list)
                                           if with_ss else empty_seq_str)
        seq_dict["flexibility"] = ("".join(_flex_list)
                                   if with_flex else empty_seq_str)
        seq_dict["solvent_accessibility"] = ("".join(_asa_list)
                                             if with_asa else empty_seq_str)

        self._pattern_seq_dict = seq_dict
        return self._pattern_seq_dict

    @QtCore.pyqtSlot()
    def _invalidatePatternSeqDict(self):
        """
        Invalidates the stored sequence dictionary for pattern search.
        """
        self._pattern_seq_dict = None

    # ==========================================================================
    # Caching
    # ==========================================================================

[docs]    def clearAllCaching(self):
        self.annotations.clearAllCaching()
        self.clearDisulfideBondsCache()
        self._invalidateSecondaryStructure()
        self._invalidateResiduePositionCache()
        self._invalidatePatternSeqDict()
        self._invalidatePredictionFlags()

[docs]    def setGPCRAnnotations(self, gpcr_annotation_map):
        for res, gpcr_anno in gpcr_annotation_map.items():
            res.gpcr_segment = gpcr_anno['gpcr_segment']
            res.gpcr_generic_number = gpcr_anno['gpcr_generic_number']


[docs]class NucleicSequenceMeta(ProteinSequenceMeta):
    ALPHABET = None


[docs]class NucleicAcidSequence(ProteinSequence, metaclass=NucleicSequenceMeta):
    # We only have "limited support" of nucleic acids, so this class is just a
    # stub.
    #
    # TODO (MSV-1504): Create proper nucleic acid domain objects

    AnnotationClass = annotation.NucleicAcidSequenceAnnotations
    ElementClass = residue.Nucleotide
    _unknown_res_type = residue.UNKNOWN_NA
    ALPHABET = None  # Cannot directly instantiate a NucleicAcidSequence

    @property
    def is_kinase_cons_annotated(self):
        return True

    @property
    def is_kinase_annotated(self):
        return True

[docs]    def isKinaseChain(self):
        return False

[docs]    def getTranslation(self):
        """
        Get a translated sequence. This method uses BioPython's translate
        method to convert a nucleic acid sequence into an amino acid sequence

        :return: A translated protein sequence. The name and chain from the
            nucleic acid sequence are copied over
        :rtype: ProteinSequence
        """
        seq_str = str(self).replace(self.gap_char, '')
        extra = len(seq_str) % 3
        if extra:
            # Pad to a multiple of 3
            seq_str += "N" * (3 - extra)
        bp_amino_str = translate(seq_str, stop_symbol=self.gap_char)
        protein_seq = ProteinSequence(bp_amino_str,
                                      name=self.name,
                                      chain=self.chain)
        return protein_seq


[docs]class DNASequence(NucleicAcidSequence):
    ALPHABET = types.MappingProxyType(residue.DNA_ALPHABET)


[docs]class RNASequence(NucleicAcidSequence):
    ALPHABET = types.MappingProxyType(residue.RNA_ALPHABET)


[docs]class CombinedChainSequenceMeta(msv_utils.DocstringWrapperMetaClass,
                                ProteinSequenceMeta):
    """
    The metaclass for `CombinedChainProteinSequence`.  This metaclass
    wraps the specified class attributes.
    """

    def __new__(metacls,
                cls,
                bases,
                classdict,
                *,
                wraps=None,
                wrapped_constants=(),
                wrapped_properties=(),
                wrapped_getters=(),
                wrapped_setters=()):
        """
        See Python documentation for positional argument documentation.

        :param wraps: The `AbstractSingleChainSequence` subclass to wrap.  If
            None, no wrapping will be done.  (This allows for subclassing of the
            wrapper class.)
        :type wraps: type or None

        :param wrapped_constants: Class constants to be wrapped.
        :type wrapped_constants: Iterable

        :param wrapped_properties: Properties to be wrapped.
        :type wrapped_properties: Iterable

        :param wrapped_getters: Getters to be wrapped.
        :type wrapped_getters: Iterable

        :param wrapped_setters: Setters to be wrapped.
        :type wrapped_setters: Iterable
        """
        if wraps is not None:
            for name in wrapped_constants:
                classdict[name] = getattr(wraps, name)
            for name in wrapped_properties:
                classdict[name] = metacls._wrapProperty(name)
            for name in wrapped_getters:
                classdict[name] = metacls._wrapGetterMethod(name)
            for name in wrapped_setters:
                classdict[name] = metacls._wrapSetterMethod(name)
        return super().__new__(metacls, cls, bases, classdict, wraps=wraps)

    @staticmethod
    def _wrapProperty(name):
        """
        Wrap the specified property.

        :param name: The name of the property to wrap.
        :type name: str

        :return: The wrapped property
        :rtype: property
        """

        def getter(self):
            return getattr(self._seqs[0], name)

        def setter(self, value):
            for chain in self._seqs:
                setattr(chain, name, value)

        def deleter(self):
            for chain in self._seqs:
                delattr(chain, name)

        return property(getter, setter, deleter)

    @staticmethod
    def _wrapGetterMethod(name):
        """
        Wrap the specified getter method.  The wrapper method will retrieved
        values from the first wrapped sequence, so this should only be used for
        methods where the return value will be identical across all wrapped
        sequences.

        :param name: The name of the method to wrap.
        :type name: str

        :return: The wrapped method
        :rtype: function
        """

        def getter(self, *args, **kwargs):
            return getattr(self._seqs[0], name)(*args, **kwargs)

        return getter

    @staticmethod
    def _wrapSetterMethod(name):
        """
        Wrap the specified setter method.  The wrapper method will set values on
        all wrapped sequences.

        :param name: The name of the method to wrap.
        :type name: str

        :return: The wrapped method
        :rtype: function
        """

        def setter(self, *args, **kwargs):
            for chain in self._seqs:
                getattr(chain, name)(*args, **kwargs)

        return setter


[docs]@dataclasses.dataclass
class GapRegion:
    """
    Container for information about gaps to add to or remove from the start and
    end of a chain
    """
    from_start: int
    from_end: int


GapRegionsForChains = typing.List[GapRegion]
GapRegionsForSeqs = typing.List[GapRegionsForChains]


[docs]class CombinedChainProteinSequence(
        AbstractProteinSequenceMixin,
        AbstractSequence,
        metaclass=CombinedChainSequenceMeta,
        wraps=ProteinSequence,
        wrapped_constants=("ElementClass", "_gap_chars"),
        wrapped_properties=("origin", "entry_name", "entry_id", "long_name",
                            "pdb_id", "visibility", "name"),
        wrapped_getters=("getStructure", "hasStructure"),
        wrapped_setters=("setStructure",)):
    """
    A sequence that contains multiple chains from the same protein.  Instances
    of this class do not directly contain any residues themselves and instead
    wrap one or several `ProteinSequence` objects.

    :note: `CombinedChainProteinSequence.visibility` properly reports entry
        inclusion state, but it may not correctly report entry visibility (e.g.
        partially visible vs. fully visible). The MSV structure icons only
        report inclusion state and the visibility of included entries isn't
        reported anywhere in the panel, though, so this limitation doesn't have
        any impact on functionality.
    """

    AnnotationClass = annotation.CombinedChainProteinSequenceAnnotations

    # ==========================================================================
    # Magic Methods
    # ==========================================================================

[docs]    def __init__(self, seqs):
        """
        :param seqs: A list of the split-chain sequences to wrap.
        :type seqs: list(ProteinSequence)
        """
        if not seqs:
            raise ValueError("ExtendedProteinSequence requires at least one "
                             "sequence.")
        self._seqs = list(seqs)
        self._updateSeqOffsets()
        self._updateSeqCaches()
        super().__init__()
        for seq in seqs:
            self._connectSignals(seq)

    def __deepcopy__(self, memo):
        copied_chains = copy.deepcopy(self._seqs, memo)
        return self.__class__(copied_chains)

    def __getitem__(self, index):
        if isinstance(index, slice):
            if index.step not in (None, 1):
                raise ValueError(f"{self.__class__.__name__} does not support "
                                 "slices with steps.")
            elif index.start == index.stop:
                return []

            if index.start is None:
                start_seq_num = 0
                start_index = 0
            else:
                start_seq_num, start_index = self._indexToSeqIndexAndIndex(
                    index.start)

            if index.stop is None:
                end_seq_num = len(self._seqs) - 1
                end_index = len(self._seqs[-1]) - 1
            else:
                end_seq_num, end_index = self._indexToSeqIndexAndIndex(
                    index.stop - 1)

            if start_seq_num == end_seq_num:
                residues = self._seqs[start_seq_num][start_index:end_index + 1]
            else:
                residues = self._seqs[start_seq_num][start_index:]
                for seq_num in range(start_seq_num + 1, end_seq_num):
                    residues.extend(self._seqs[seq_num][:])
                residues.extend(self._seqs[end_seq_num][:end_index + 1])
            return [
                residue.CombinedChainResidueWrapper(res, self)
                for res in residues
            ]
        else:
            seq, index_in_seq, _ = self.indexToSeqAndIndex(index)
            res = seq[index_in_seq]
            return residue.CombinedChainResidueWrapper(res, self)

    def __iter__(self):
        for res in itertools.chain.from_iterable(self._seqs):
            yield residue.CombinedChainResidueWrapper(res, self)

[docs]    def __len__(self):
        return self._seq_offsets[-1] + len(self._seqs[-1])

    def __str__(self):
        return "".join(str(seq) for seq in self._seqs)

    def __repr__(self):
        chain_reprs = ", ".join(map(repr, self._seqs))
        return f"{self.__class__.__name__}([{chain_reprs}])"

    def __hash__(self):
        return hash(self.chains)

    def __eq__(self, other):
        if isinstance(other, CombinedChainProteinSequence):
            return self.chains == other.chains
        else:
            return False

    def _getSignalsAndSlots(self, seq):
        """
        Return signals and slots for signals emitted by the given split-chain
        sequences.

        :param seq: The split-chain sequence
        :type seq: ProteinSequence

        :return: A list of tuples of (signal, slot)
        :rtype: list(tuple(QtCore.pyqtBoundSignal, method)
        """
        return [
            (seq.residuesChanged, self._onResiduesChanged),
            (seq.lengthAboutToChange, self._onLengthAboutToChange),
            (seq.lengthChanged, self._onLengthChanged),
            (seq.nameChanged, self.nameChanged),
            (seq.visibilityChanged, self.visibilityChanged),
            (seq.structureChanged, self.structureChanged),
            (seq.annotationTitleChanged, self.annotationTitleChanged),
            (seq.disulfideBondsCacheCleared, self.clearDisulfideBondsCache),
            (seq.secondaryStructureCacheCleared,
             self._clearSecondaryStructureCache)
        ]  # yapf: disable

    def _connectSignals(self, seq):
        for signal, slot in self._getSignalsAndSlots(seq):
            signal.connect(slot)

    def _disconnectSignals(self, seq):
        for signal, slot in self._getSignalsAndSlots(seq):
            signal.disconnect(slot)

    # ==========================================================================
    # Sequence-Level Properties
    # ==========================================================================

    @property
    def fullname(self):
        return f"{self.name}_All"

    # ==========================================================================
    # Residues
    # ==========================================================================

[docs]    def index(self, res):
        chain = res.split_sequence
        try:
            chain_index = self._chain_indices[chain]
        except KeyError:
            err_msg = f"Specified residue {res} not found in sequence {self}"
            raise ValueError(err_msg)
        offset = self._seq_offsets[chain_index]
        return offset + chain.index(res.split_res)

[docs]    def indices(self, residues):
        """
        Returns the indices of all specified residues.  Note that the returned
        integers will likely *not* be in the same order as the input residues.

        :param res: The residues to find indices of
        :type res: Iterable(residue.CombinedChainResidueWrapper)

        :return: The indices of the residues
        :rtype: list[int]
        """
        by_chain = {}
        for cur_res in residues:
            split_res = cur_res.split_res
            by_chain.setdefault(split_res.sequence, []).append(split_res)
        indices = []
        for chain, cur_residues in by_chain.items():
            offset = self.offsetForChain(chain)
            cur_indices = chain.indices(cur_residues)
            indices.extend(i + offset for i in cur_indices)
        return indices

[docs]    def insertElements(self, index, elements):
        # See parent class for method documentation
        chain, chain_index, _ = self.indexToSeqAndIndex(index)
        chain.insertElements(chain_index, elements)

[docs]    def mutate(self, start, end, elements):
        """
        Mutate sequence elements.  See parent class for additional method
        documentation.

        :raise MultipleChainsError: If the specified residue range spans
            multiple chains.
        """
        chain, start_chain_index, end_chain_index = \
            self._getMutationChainIndices(start, end)
        chain.mutate(start_chain_index, end_chain_index, elements)

[docs]    def assertCanMutateResidues(self, start, end):
        """
        Make sure that we can mutate the specified residues.  If not, raise an
        exception.

        :param start: The index at which to start mutating
        :type start: int

        :param end: The index of the last mutated element (exclusive)
        :type end: int

        :raise MultipleChainsError: If the specified residue range spans
            multiple chains.
        """
        self._getMutationChainIndices(start, end)

[docs]    def append(self, element):
        # See parent class for method documentation
        self._seqs[-1].append(element)

[docs]    def extend(self, elements):
        # See parent class for method documentation
        self._seqs[-1].extend(elements)

[docs]    def getSubsequence(self, start, end):
        """
        Return a sequence containing a subset of the elements in this one.  Note
        that the new sequence will be a split-chain sequence and will ignore
        any chain breaks present in the requested subset of elements.

        :param start: The index at which the subsequence should start
        :type start: int

        :param end: The index at which the subsequence should end (exclusive)
        :type end: int

        :return: The requested subsequence
        :rtype: ProteinSequence
        """
        return ProteinSequence(
            copy.copy(res.split_res) for res in self[start:end])

[docs]    def removeElements(self, eles):
        # See parent class for method documentation
        by_chain = {}
        for cur_ele in eles:
            by_chain.setdefault(cur_ele.split_sequence,
                                []).append(cur_ele.split_res)
        if not set(by_chain.keys()).issubset(self._seqs):
            raise ValueError("Elements must be contained by sequence in order "
                             "to be removed")
        for chain, cur_eles in by_chain.items():
            chain.removeElements(cur_eles)
        # note that the sequence offsets will be updated in response to the
        # lengthChanged signal, so we don't need to explicitly call
        # _updateSeqOffsets here

    def _onResiduesChanged(self, start, end):
        chain = self.sender()
        chain_i = self._seqs.index(chain)
        offset = self._seq_offsets[chain_i]
        self.residuesChanged.emit(start + offset, end + offset)

    def _onLengthAboutToChange(self, old_chain_len, new_chain_len):
        old_combined_len = len(self)
        new_combined_len = old_combined_len + new_chain_len - old_chain_len
        self.lengthAboutToChange.emit(old_combined_len, new_combined_len)

    def _onLengthChanged(self, old_chain_len, new_chain_len):
        self._updateSeqOffsets()
        new_combined_len = len(self)
        old_combined_len = new_combined_len - new_chain_len + old_chain_len
        self.lengthChanged.emit(old_combined_len, new_combined_len)

    def _removeFromSequence(self, keep_func, start=0, end=None):
        # See parent class for method documentation
        if end is None:
            end = len(self)
        for chain, offset in zip(self._seqs, self._seq_offsets):
            if end < offset or start >= offset + len(chain):
                continue
            if start <= offset:
                cur_start = 0
            else:
                cur_start = start - offset
            if end >= offset + len(chain):
                cur_end = None
            else:
                cur_end = end - offset
            chain._removeFromSequence(keep_func, cur_start, cur_end)

[docs]    def getStructureResForRes(self, res):
        # See parent class for method documentation
        chain = res.split_sequence
        if chain not in self._seqs:
            raise ValueError("Residue not part of this sequence")
        if not chain.hasStructure():
            return None
        return chain.getStructureResForRes(res.split_res)

    # ==========================================================================
    # Gaps
    # ==========================================================================

[docs]    def getGaplessLength(self):
        # See parent class for method documentation
        return sum(chain.getGaplessLength() for chain in self._seqs)

[docs]    def addGapsByIndices(self, gap_idxs):
        # See parent class for method documentation
        # Adding a gap is ambiguous at chain boundaries.  We always add to
        # the beginning of the later sequence.
        chain_i = 0
        gaps_by_chain = [[] for _ in range(len(self._seqs))]
        num_chains = len(self._seqs)
        for gap_i, cur_index in enumerate(sorted(gap_idxs)):
            while chain_i < num_chains:
                adjusted = cur_index - gap_i - self._seq_offsets[chain_i]
                # figure out the index of last allowable gap for this chain
                last_allowable_gap = len(self._seqs[chain_i])
                if chain_i < num_chains - 1:
                    last_allowable_gap -= 1
                if adjusted > last_allowable_gap:
                    chain_i += 1
                    continue
                else:
                    gaps_by_chain[chain_i].append(adjusted)
                    break
            else:
                raise ValueError("Invalid gap index")
        for split_seq, gaps in zip(self._seqs, gaps_by_chain):
            if gaps:
                split_seq.addGapsBeforeIndices(gaps)
        # note that the sequence offsets will be updated in response to the
        # lengthChanged signal, so we don't need to explicitly call
        # _updateSeqOffsets here

[docs]    def addGapsToChainStartsAndEnds(self, gaps: GapRegionsForChains):
        """
        Add the specified numbers of gaps to the starts and ends of each chain.

        :param gaps: The numbers of gaps to add
        """
        assert len(self._seqs) == len(gaps)
        for chain, gap_region in zip(self._seqs, gaps):
            start_gaps = [residue.Gap() for _ in range(gap_region.from_start)]
            chain.insertElements(0, start_gaps)
            end_gaps = [residue.Gap() for _ in range(gap_region.from_end)]
            chain.extend(end_gaps)

[docs]    def removeGapsFromChainStartsAndEnds(self, gaps: GapRegionsForChains):
        """
        Remove the specified numbers of gaps from the starts and ends of each
        chain.

        :param gaps: The numbers of gaps to remove
        """
        assert len(self._seqs) == len(gaps)
        elems = self._getGapsFromChainStartsAndEnds(gaps)
        elems = [
            residue.CombinedChainResidueWrapper(gap, self) for gap in elems
        ]
        self.removeElements(elems)

[docs]    def validateGapsToRemoveFromChainStartAndEnds(self,
                                                  gaps: GapRegionsForChains):
        """
        Make sure that we can remove the specified numbers of gaps from the
        starts and ends of each chain.

        :param gaps: The numbers of gaps to remove
        :raise AssertionError: If some of the sequence elements to be removed
            aren't actually gaps.
        """
        self._getGapsFromChainStartsAndEnds(gaps)

    def _getGapsFromChainStartsAndEnds(self, gaps: GapRegionsForChains):
        """
        Fetch the sequence elements representing the specified numbers of gaps
        from the starts and ends of each chain.

        :param gaps: The numbers of gaps to remove
        :raise AssertionError: If some of the sequence elements to be removed
            aren't actually gaps.
        """
        elems = []
        for chain, gap_region in zip(self._seqs, gaps):
            elems.extend(chain[0:gap_region.from_start])
            if gap_region.from_end:
                elems.extend(chain[-gap_region.from_end:])
        assert all(e.is_gap for e in elems)
        return elems

    # ==========================================================================
    # Annotations
    # ==========================================================================

    @property
    def disulfide_bonds(self):
        """
        :return: A sorted tuple of the valid disulfide bonds.
        :rtype: tuple(residue.CombinedChainDisulfideBond)
        """
        if self._disulfide_bonds is None:
            bonds = {
                residue.CombinedChainDisulfideBond(cur_bond, self)
                for chain in self._seqs
                for cur_bond in chain.disulfide_bonds
            }
            self._disulfide_bonds = tuple(
                sorted((bond for bond in bonds if bond.isValid()),
                       key=lambda bond: min(self.index(res) for res in bond)))
        return self._disulfide_bonds

    def _updateSecondaryStructure(self):
        # See AbstractProteinSequenceMixin for documentation
        self._secondary_structures = []
        for chain, offset in zip(self._seqs, self._seq_offsets):
            for split_ss in chain.secondary_structures:
                split_start, split_end = split_ss.limits
                combined_start = split_start + offset
                combined_end = split_end + offset
                combined_ss = _SecondaryStructure(
                    (combined_start, combined_end), split_ss.ssa_type)
                self._secondary_structures.append(combined_ss)

    def _clearSecondaryStructureCache(self):
        self._secondary_structures = None

[docs]    def isKinaseChain(self):
        return any(chain.isKinaseChain() for chain in self.chains)

    # ==========================================================================
    # Predictions
    # ==========================================================================

    @property
    def pred_disulfide_bonds(self):
        # See AbstractProteinSequenceMixin for documentation
        if self._pred_disulfide_bonds is None:
            bonds = {
                residue.CombinedChainDisulfideBond(cur_bond, self)
                for chain in self._seqs
                for cur_bond in chain.pred_disulfide_bonds
            }
            key = lambda bond: min(
                self.index(res) for res in bond if res in self)
            self._pred_disulfide_bonds = tuple(sorted(bonds, key=key))
        return self._pred_disulfide_bonds

    def _updatePredSecondaryStructure(self):
        # See AbstractProteinSequenceMixin for documentation
        self._pred_secondary_structures = []
        for chain, offset in zip(self._seqs, self._seq_offsets):
            for split_ss in chain.pred_secondary_structures:
                split_start, split_end = split_ss.limits
                combined_start = split_start + offset
                combined_end = split_end + offset
                combined_ss = _SecondaryStructure(
                    (combined_start, combined_end), split_ss.ssa_type)
                self._pred_secondary_structures.append(combined_ss)

    # ==========================================================================
    # Chains and Sequences
    # ==========================================================================

    def _updateSeqOffsets(self):
        """
        Update `self._seq_offsets`, which gives the residue index for the start
        of each chain.  Should be called when `self._seqs` changes.
        """
        self._seq_offsets = [0] + list(
            itertools.accumulate(len(seq) for seq in self._seqs[:-1]))
        self._ro_seq_offsets = tuple(self._seq_offsets)

    def _updateSeqCaches(self):
        """
        Update the read-only copy of the sequence and sequence offsets.  These
        copies are used to ensure that clients of this class can't modify
        internal data.
        """
        self._ro_seqs = tuple(self._seqs)
        self._chain_indices = types.MappingProxyType(
            {seq: i for i, seq in enumerate(self._seqs)})

    def _indexToSeqIndexAndIndex(self, index):
        """
        Convert a combined-chain residue index to a sequence index and a residue
        index within the specified sequence.

        :param index: A valid combined-chain residue index
        :type index: int

        :return: The split-chain sequence index and residue index
        :rtype: tuple(int, int)
        """
        if index < 0:
            index = len(self) + index
        seq_index = bisect.bisect_right(self._seq_offsets, index) - 1
        index_in_seq = index - self._seq_offsets[seq_index]
        return seq_index, index_in_seq

[docs]    def indexToSeqAndIndex(self, index):
        """
        Convert a combined-chain residue index to a split-chain sequence and a
        residue index within the specified sequence.

        :param index: A valid combined-chain residue index
        :type index: int

        :return: A tuple of
          - the split-chain sequence
          - residue index
          - the starting index of the split-chain sequence
        :rtype: tuple(ProteinSequence, int, int)
        """
        seq_index, index_in_seq = self._indexToSeqIndexAndIndex(index)
        return self._seqs[seq_index], index_in_seq, self._seq_offsets[seq_index]

    @property
    def chain(self):
        return "All"

    @property
    def chains(self):
        return self._ro_seqs

    @property
    def chain_offsets(self):
        return self._ro_seq_offsets

[docs]    def hasChain(self, chain_name):
        """
        Does this sequence contain a chain with the specified name?

        :param chain_name: The chain name to check
        :type chain_name: str

        :rtype: bool
        """
        return any(chain_name == seq.chain for seq in self._seqs)

[docs]    def addChain(self, seq):
        """
        Add a new chain to this sequence.

        :param seq: The chain to add
        :type seq: ProteinSequence
        """
        old_len = len(self)
        new_len = old_len + len(seq)
        self.lengthAboutToChange.emit(old_len, new_len)
        self._seqs.append(seq)
        self._updateSeqOffsets()
        self._updateSeqCaches()
        self.clearDisulfideBondsCache()
        self._connectSignals(seq)
        self._clearSecondaryStructureCache()
        self.annotations.chainAdded(seq)
        self.lengthChanged.emit(old_len, new_len)

[docs]    def removeChain(self, seq):
        """
        Remove a chain from this sequence.  Note that you should not remove the
        last chain; instead, remove this sequence from the alignment.

        :param seq: The chain to remove
        :type seq: ProteinSequence
        """
        if len(self._seqs) == 1:
            raise ValueError(f"Cannot remove last chain from a "
                             f"{self.__class__.__name__}")
        index = self._seqs.index(seq)
        old_len = len(self)
        new_len = old_len - len(seq)
        self.lengthAboutToChange.emit(old_len, new_len)
        del self._seqs[index]
        self._updateSeqOffsets()
        self._updateSeqCaches()
        self._disconnectSignals(seq)
        self._clearSecondaryStructureCache()
        self.clearDisulfideBondsCache()
        self.annotations.chainRemoved(seq)
        self.lengthChanged.emit(old_len, new_len)

[docs]    def removeChains(self, seqs):
        """
        Remove multiple chains from this sequence.  Note that you should not all
        chain from a combine-chain sequence; instead, remove the sequence from
        the alignment.

        :param seqs: The chains to remove
        :type seqs: list[ProteinSequence]
        """
        for seq in seqs:
            self.removeChain(seq)

[docs]    def insertElementByChain(self, index, chain, element):
        """
        Add the given element to the specified chain of a sequence.

        :param index: The index to insert the element at.  Note that this is the
            index in the sequence, not the chain.
        :type index: int

        :param chain: The chain to insert into
        :type chain: sequence.ProteinSequence

        :param element: The element to insert
        :type element: str or residue.AbstractSequenceElement
        """
        chain_index = self._chain_indices[chain]
        index_in_chain = index - self._seq_offsets[chain_index]
        assert 0 <= index_in_chain <= len(chain)
        chain.insertElements(index_in_chain, [element])

[docs]    def offsetForChain(self, chain):
        """
        Get the combined-chain residue index for the first residue of the
        specified chain.

        :param chain: The chain
        :type chain: ProteinSequence

        :return: The offset
        :rtype: int
        """
        chain_index = self._chain_indices[chain]
        return self._seq_offsets[chain_index]

    def _getMutationChainIndices(self, start, end):
        """
        Get the chain and chain indices referred to by the specified combined-
        chain indices.

        :param start: The index at which to start mutating
        :type start: int

        :param end: The index of the last mutated element (exclusive)
        :type end: int

        :return: A tuple of
            - the chain that the indices refer to
            - The start index within the chain
            - The end index within the chain
        :rtype: tuple(sequence.ProteinSequence, int, int)

        :raise MultipleChainsError: If the specified residue range spans
            multiple chains.
        """
        start_chain, start_chain_index, _ = self.indexToSeqAndIndex(start)
        # since end is exclusive, we want the chain of the previous residue
        if end > start:
            end -= 1
            offset = 1
        elif end < start:
            raise RuntimeError("End must be greater than or equal to start")
        else:
            offset = 0
        end_chain, end_chain_index, _ = self.indexToSeqAndIndex(end)
        end_chain_index += offset
        if start_chain != end_chain:
            raise MultipleChainsError("Cannot mutate multiple chains.")
        return start_chain, start_chain_index, end_chain_index

    # ==========================================================================
    # Caching
    # ==========================================================================

[docs]    def clearAllCaching(self):
        # we intentionally don't clear the read-only sequence and sequence-
        # offset caches, since those aren't regenerated as needed.
        self.annotations.clearAllCaching()
        self._clearSecondaryStructureCache()
        self.clearDisulfideBondsCache()
        for chain in self._seqs:
            chain.clearAllCaching()


[docs]class MultipleChainsError(ValueError):
    """
    An exception raised when the specified indices span multiple chains but the
    operation can only be carried out on a single chain.
    """
    # This class intentionally left blank


[docs]class StructureSequence(structure._AtomCollection):
    """
    Class representing a sequence of protein residues.
    """

    def _getResidueIterator(self):
        return structure._structure._get_residues_by(self)

    residue = property(
        _getResidueIterator,
        doc="Returns residue iterator for all residues in the sequence")


[docs]def get_structure_sequences(st):
    """
    Iterates over all sequences in the given structure.
    """

    mm.mmpdb_initialize(mm.error_handler)
    new_indices = mm.mmpdb_get_sequence_order(st.handle,
                                              mm.MMPDB_SEQUENCE_ORDER)
    mm.mmpdb_terminate()

    curr_sequence_atoms = []
    prev_res_atom = None  # An atom of the previous residue

    prev_residue_id = (None, None, None)
    # Skip atom 0, as it is simply a place holder, because atoms are 1-indexed.
    for ai in new_indices[1:]:
        atom = st.atom[ai]
        residue_id = (atom.resnum, atom.chain, atom.inscode)
        # Check to see whether we reached the end of this sequence
        if residue_id != prev_residue_id and prev_res_atom is not None and not mm.mmct_res_connected(
                st.handle, prev_res_atom, ai):
            # This is the start of a new sequence
            yield StructureSequence(st, curr_sequence_atoms)
            curr_sequence_atoms = []

        prev_res_atom = ai
        prev_residue_id = residue_id
        # Whether new sequence or not, we need to update these:
        curr_sequence_atoms.append(ai)

    if curr_sequence_atoms:
        yield StructureSequence(st, curr_sequence_atoms)


[docs]def find_generalized_pattern(sequence_list, pattern, validate_pattern=False):
    """
    Finds a generalized sequence pattern within specified sequences.
    NOTE: The search is performed in the forward direction only.

    :param sequence_list: list of sequence dictionaries to search.
    :type sequence_list: list(dict)

           Each sequence is a dictionary including amino acid strings
           and associated data. The amino acid string should be ungapped.
           The dictionary should include 'amino_acids' key and optionally
           'secondary_structure', 'solvent_accessibility' and 'flexibility'
           keys:

           - sequence is expressed as uppercase 1-letter code:

               sequence['amino_acids'] = 'ACDEFGHIKLMNPQRSTVWY'
                   (sequence in 1-letter code, mandatory)

           - secondary structure usese 'h', 'e' and ' ' symbols:

               sequence['secondary_structure'] = 'hhhhh     eeeee     '

           - solvent accessibility uses 'e' (exposed) and ' ' (buried)
               symbols:

               sequence['solvent_accessibiliy' = 'eeeee     eeeee     '

           - flexibility uses 'f' (flexible) and ' ' (not flexible)
               symbols:

               sequence['flexibility'] = '     fffff     fffff'

    :type pattern: str
    :param pattern: Pattern defined using extended PROSITE syntax.

           - standard IUPAC one-letter codes are used for all amino acids

           - each element in a pattern is separated using '-' symbol

           - symbol 'x' is used for position where any amino acid is accepted

           - ambiguities are listed using the acceptable amino acids between
             square brackets, e.g. [ACT] means Ala, Cys or Thr

           - amino acids not accepted for a given position are indicated
             by listing them between curly brackets, e.g. {GP} means 'not Gly
             and not Pro'

           - repetition is indicated using parentheses, e.g. A(3) means
             Ala-Ala-Ala, x(2,4) means between 2 to 4 any residues

           - the following lowercase characters can be used as additional
             flags:

               - 'x' means any amino acid
               - 'a' means acidic residue: [DE]
               - 'b' means basic residue: [KR]
               - 'o' means hydrophobic residue: [ACFILPWVY]
               - 'p' means aromatic residue: [WYF]
               - 's' means solvent exposed residue
               - 'h' means helical residue
               - 'e' means extended residue
               - 'f' means flexible residue

           - Each position can optionally by followed by @<res_num> expression
             that will match the position with a given residue number.

           - Entire pattern can be followed by :<index> expression that defines
             a 'hotspot' in the pattern. When the hotspot is defined, only
             a single residue corresponding to (pattern_match_start+index-1)
             will be returned as a match. The index is 1-based and can be used
             to place the hotspot outside of the pattern (can also be
             a negative number).

        Pattern examples:

            - N-{P}-[ST] : Asn-X-Ser or Thr (X != Pro)
            - N[sf]-{P}[sf]-[ST][sf] : as above, but all residues flexible
              OR solvent exposed
            - Nsf-{P}sf-[ST]sf : as above, but all residues flexible
              AND solvent exposed
            - Ns{f} : Asn solvent exposed AND not in flexible region
            - N[s{f}] : Asn solvent exposed OR not in flexible region
            - [ab]{K}{s}f : acidic OR basic, with exception of Lys,
              flexible AND not solvent exposed
            - Ahe : Ala helical AND extended - no match possible
            - A[he] : Ala helical OR extended
            - A{he} : Ala coiled chain conformation (not helical nor extended)
            - [ST] : Ser OR Thr
            - ST : Ser AND Thr - no match possible

    :type validate_pattern: boolean
    :param validate_pattern: If True, the function will validate the pattern
             without performing the search (the sequences parameter will be
             ignored) and return True if the pattern is valid, or False
             otherwise. The default is False.

    :rtype: list of lists of integer tuples or False if the pattern is invalid
    :return: False if the specified input pattern was incorrect.
             Otherwise, it returns a list of lists of matches for each
             input sequence. Each match is a (start, end) tuple where
             start and end are matching sequence positions.
    """
    '''
    FIXME This will modify the user's input. Not good. Intead we can expect
    the user to do this themselves.
    if invert and sequence_list:
        for k in sequences.values():
            sequences[k] = sequences[k][::-1]
    '''
    # Cast unicode pattern to str to avoid b''.translate() exception
    pattern = str(pattern)

    # Remove color designator if present
    if '#' in pattern:
        pattern = pattern[:pattern.find('#')]

    hotspot = None
    # Extract a hotspot marker
    if ':' in pattern:
        pos = pattern.find(':')
        try:
            hotspot = int(pattern[pos + 1:])
        except ValueError:
            hotspot = None
        else:
            pattern = pattern[:pos]

    # Split the pattern by dash symbols
    plist = [p for p in pattern.split('-') if p]

    pattern = ""

    idlist = []

    # Extract residue number matches
    for index, p in enumerate(plist):
        _id = ""
        match = re.match(r"""
                         (.*)             # everything before the @
                         @(\d+[a-zA-Z]?)  # the @ and residue number with optional insertion code
                         (.*)$            # everything after the residue number
                         """,
                         p,
                         flags=re.VERBOSE)
        if match:
            # Fail if a residue number match is requested but
            # the sequence list does not have residue number information
            if sequence_list and not sequence_list[0].get("res_ids"):
                return False
            p = match.group(1) + match.group(3)
            _id = match.group(2)
        else:
            _id = ""
            if p == "":
                p = "x"

        plist[index] = p
        idlist.append(_id)

    replace_wildcards = lambda x: x.replace('a', 'ED'). \
        replace('b', 'KR').replace('o', 'WILYFVPCA').replace('p', 'WYF')

    # Convert the pattern to a regular expression (a bit clunky code)
    for p in plist:
        s = p.split('(')
        p = s[0]
        pattern += '('
        # Extract included, excluded and mandatory amino acids
        incl_aa = delete_from_str(''.join(re.findall(r"\[[abopA-Z]+\]", p)),
                                  '[]')
        excl_aa = delete_from_str(''.join(re.findall(r"\{[A-Zabop]+\}", p)),
                                  '{}')
        mand_aa = delete_from_str(p, incl_aa + excl_aa + "[]{}efhs")
        incl_aa = replace_wildcards(incl_aa)
        excl_aa = replace_wildcards(excl_aa)
        if 'x' in incl_aa + mand_aa:
            pattern += '.'
        else:
            if 'x' in excl_aa:
                # Exclude everything
                return False
            if len(mand_aa) > 1:
                # Wrong pattern, no match possible
                return False
            if mand_aa:
                if mand_aa in "abop":
                    incl_aa = replace_wildcards(mand_aa)
                else:
                    pattern += mand_aa
            if incl_aa or excl_aa:
                pattern += '['
                if incl_aa:
                    pattern += delete_from_str(incl_aa, excl_aa)
                else:
                    pattern += '^' + excl_aa
                pattern += ']'
        # Extract included, excluded and mandatory flags
        incl_flag = delete_from_str(''.join(re.findall(r"\[[efhs\{\}]+\]", p)),
                                    '[]{}')
        excl_flag = delete_from_str(''.join(re.findall(r"\{[efhs\[\]]+\}", p)),
                                    '{}[]')
        mand_flag = [
            c for c in delete_from_str(p, incl_flag + excl_flag + "[]{}")
            if c in "efhs"
        ]
        incl_flag = delete_from_str(incl_flag, excl_flag)

        if 'e' in mand_flag and 'h' in mand_flag:
            # Can't be simultaneously helix and extended
            return False
        p = ""
        for f in ['eh', 's', 'f']:
            p = ""
            for c in f:
                if c in mand_flag:
                    p = c
                elif c in incl_flag:
                    if '[' in p:
                        p = '[' + c + p[1:]
                    else:
                        p = '[' + c + ']'
                elif c in excl_flag:
                    if '[' in p:
                        p = '[^' + c + p[2:]
                    else:
                        p = '[^' + c + ']'
            if not p:
                p = '.'
            pattern += p

        pattern += '-)'

        # Add number of pattern repetitions
        if len(s) > 1:
            pattern += '{' + s[1].replace(')', '}')

    # Compile the pattern
    try:
        pattern = re.compile(pattern)
    except Exception:
        # Syntax error: quit
        return False

    if validate_pattern:
        return True

    results = []

    # Use the pattern to search the sequences
    for sequence in sequence_list:
        match_list = []
        if 'amino_acids' in sequence:
            length = len(sequence['amino_acids'])
            if 'secondary_structure' not in sequence:
                sequence['secondary_structure'] = ' ' * length
            if 'solvent_accessibility' not in sequence:
                sequence['solvent_accessibility'] = ' ' * length
            else:
                # replace "e" for "exposed" with "s" for "solvent-exposed"
                sequence['solvent_accessibility'] = sequence[
                    'solvent_accessibility'].replace('e', 's')
            if 'flexibility' not in sequence:
                sequence['flexibility'] = ' ' * length
            # Convert the sequence to flat format: Ahsf-
            # (5 characters per residue)
            seq_string = ''.join([
                ''.join(s) for s in
                zip(sequence['amino_acids'], sequence['secondary_structure'],
                    sequence['solvent_accessibility'], sequence['flexibility'],
                    '-' * length)
            ])

            # Build list of matches
            match_list = []
            match = pattern.search(seq_string)
            while match:
                start = match.start()
                end = match.end()
                # Now test residue IDs
                has_match = True
                if 'res_ids' in sequence and any(idlist):
                    res_ids = sequence['res_ids']
                    for index, id in enumerate(idlist):
                        if id and id != res_ids[old_div(start, 5) + index]:
                            has_match = False
                            break
                if has_match:
                    if hotspot is None:
                        match_list.append((old_div(start, 5), old_div(end, 5)))
                    else:
                        match_list.append(((old_div(start, 5)) + hotspot - 1,
                                           (old_div(start, 5)) + hotspot - 1))
                        # Why is the second item needed?
                        # It is only needed because MSV expects a range
                        # of residues returned for each match, so it knows
                        # start and end position. If the hotspot is defined,
                        # start == end. Thus MSV doesn't have to know
                        # if the pattern included hotspot definition or not.
                        # If the caller knows the hotspot was defined,
                        # it can simply ignore the second value.

                match = pattern.search(seq_string, start + 1)

            # Append the result even if it's []
            results.append(match_list)

    return results


[docs]def convert_structure_sequence_for_pattern_search(seq, sasa_by_atom=None):
    """
    Converts a StructureSequence object to dictionary required
    by find_generalized_pattern function. Because the conversion
    can be time consuming, it should be done once per sequence.

    Optionally a list of atom SASAs for each atom in the CT can be specified.
    If it's not specified, it will get calculated by calling
    analyze.calculate_sasa_by_atom().

    :type seq: `StructureSequence`
    :param seq: StructureSequence object

    :type sasa_by_atom: list
    :param sasa_by_atom: list of atom SASAs

    :rtype: dict
    :return: Dictionary of sequence information
    """
    valid_amino_acids = list(constants.AMINO_ACIDS_3_TO_1)

    seq_dict = {}
    seq_dict["amino_acids"] = ""
    seq_dict["secondary_structure"] = ""
    seq_dict["flexibility"] = ""
    seq_dict["solvent_accessibility"] = ""
    seq_dict["res_ids"] = []

    if len(seq.residue) == 0:
        return None
    avg_b_factor = 0.0
    n = 0
    for res in seq.residue:
        b_factor = res.temperature_factor
        if b_factor > 0:
            avg_b_factor += b_factor
            n += 1
    if n:
        avg_b_factor /= n

    # Calculates the std SASAs. The calculations are made only the first time
    # this function is called:
    std_sasa_dict = calculate_sasa_dict()

    # Calculate SASA only once for all atoms (and only if it wasn't specified):
    if sasa_by_atom is None:
        sasa_by_atom = analyze.calculate_sasa_by_atom(seq._st)

    for res in seq.residue:
        # sequence
        code = 'X'
        if res.pdbres[:3] in valid_amino_acids:
            code = constants.AMINO_ACIDS_3_TO_1[res.pdbres[:3]]
        seq_dict["amino_acids"] += code

        # residue numbers
        res_id = "%s%s" % (res.resnum, res.inscode.strip())
        seq_dict["res_ids"].append(res_id)

        # secondary structure
        code = ' '
        if res.secondary_structure == 1:
            code = 'h'
        elif res.secondary_structure == 2:
            code = 'e'
        seq_dict["secondary_structure"] += code

        # flexibility
        code = ' '
        if res.temperature_factor > avg_b_factor:
            code = 'f'
        seq_dict["flexibility"] += code

        # solvent accessibility
        code = ' '
        # sasa = analyze.calculate_sasa(seq._st, res.getAtomIndices())
        sasa = sum([sasa_by_atom[a.index - 1] for a in res.atom])

        if res.pdbres[:3] in std_sasa_dict:
            sasa /= std_sasa_dict[res.pdbres[:3]]
            if sasa > 1.0:
                code = 'e'
        seq_dict["solvent_accessibility"] += code

    return seq_dict


[docs]def find_pattern(seq, pattern):
    """
    Find pattern matches in a specified StructureSequence object.
    Returns a list of matching positions.

    :type seq: `StructureSequence`
    :param seq: StructureSequence object

    :type pattern: string
    :param pattern: Sequence pattern. The syntax is described in
                    find_generalized_pattern.

    :rtype: list of lists of integer tuples or None
    :return: None if the specified input pattern was incorrect.
             Otherwise, it returns a list of lists of matches for each
             residue position in the input structure. Each match is a
             (start, end) tuple where start and end are matching sequence
             positions. If 'hotspot' is specified then start = end.

    """
    seq_dict = convert_structure_sequence_for_pattern_search(seq)

    return find_generalized_pattern([seq_dict], pattern)


[docs]def assign_residue_numbers(residues, start_res=None, end_res=None):
    """
    Assign residue numbers to the given residues based on the residues before
    and after

    :param residues: Residues that need numbering. Will be modified in-place.
    :type residues: list[residue.Residue]

    :param start_res: Previous residue. Pass None if the residues are N-terminal
    :type start_res: residue.Residue or NoneType

    :param end_res: Next residue. Pass None if the residues are C-terminal
    :type end_res: residue.Residue or NoneType
    """
    # Make shallow copy without gaps
    residues = [elem for elem in residues if elem.is_res]

    start_num = None
    resnum_and_inscode = None
    num_to_renumber = len(residues)
    if start_res is None:
        if end_res is None:
            # Both are None - number starting from 1
            start_num = 1
        else:
            # Number N-terminal residues
            start_num = end_res.resnum - num_to_renumber
    elif end_res is None or (end_res.resnum - start_res.resnum - 1 >=
                             num_to_renumber):
        # Residues are C-terminal or there are enough missing residue numbers
        # that each residue can have its own number without the need for
        # insertion codes.
        start_num = start_res.resnum + 1
    else:
        # There are more residues than residue numbers, so we need
        # to use insertion codes.
        resnum_and_inscode = gen_resnums_and_inscodes(start_res.resnum,
                                                      start_res.inscode,
                                                      end_res.resnum,
                                                      end_res.inscode)
        if len(resnum_and_inscode) < num_to_renumber:
            # There are more residues than we can possibly fit even
            # using every number + insertion code combination, so
            # pad with Nones
            nones = [None] * (num_to_renumber - len(resnum_and_inscode))
            resnum_and_inscode.extend(nones)

    if resnum_and_inscode is not None:
        for resnum, res in zip(resnum_and_inscode, residues):
            if resnum is None:
                inscode = None
            else:
                resnum, inscode = resnum
            res.resnum = resnum
            res.inscode = inscode
    else:
        for idx, res in enumerate(residues, start=start_num):
            res.resnum = idx
            res.inscode = None


[docs]def gen_resnums_and_inscodes(start_resnum, start_inscode, end_resnum,
                             end_inscode):
    """
    Create a list of all residue numbers/insertion code combinations
    possible between the given endpoints.  If the ending residue number and
    insertion code are less than or equal to the starting residue number and
    insertion code, then an empty list will be returned.

    :param start_resnum: The starting residue number.
    :type start_resnum: int

    :param start_inscode: The starting insertion code.
    :type start_inscode: str

    :param end_resnum: The ending residue number.
    :type end_resnum: int

    :param end_inscode: The ending insertion code.
    :type end_inscode: str

    :return: A list of residue numbers and insertion codes
    :rtype: list[tuple[int, str]]
    """
    # Make sure that the insertion codes are ASCII letters or blanks
    start_inscode = start_inscode.upper() or " "
    end_inscode = end_inscode.upper() or " "
    for resnum, inscode in ((start_resnum, start_inscode), (end_resnum,
                                                            end_inscode)):
        if inscode not in string.ascii_uppercase + " ":
            raise IOError("Invalid insertion code for residue %s%s." %
                          (resnum, inscode))
    if end_resnum < start_resnum or (end_resnum == start_resnum and
                                     (end_inscode <= start_inscode or
                                      end_inscode == " ")):
        # the end point is before the start point
        return []
    cur_resnum = start_resnum
    cur_inscode = start_inscode
    resnum_and_inscode = []
    while True:
        if cur_inscode == " ":
            cur_inscode = "A"
        elif cur_inscode == "Z":
            cur_inscode = " "
            cur_resnum += 1
        else:
            cur_inscode = chr(ord(cur_inscode) + 1)
        if cur_resnum == end_resnum and cur_inscode == end_inscode:
            return resnum_and_inscode
        resnum_and_inscode.append((cur_resnum, cur_inscode))


def _get_protein_sequence(chain):
    """
    Converts a protein chain to a ProteinSequence object.

    :param chain: Protein chain to get the sequence of.
    :type chain: `structure._Chain`

    :return: A protein sequence.
    :rtype: `ProteinSequence`
    """
    sequence = ProteinSequence([res.getCode() for res in chain.residue])
    sequence.chain = chain.name
    return sequence


[docs]def get_pairwise_sequence_similarity(chain1,
                                     chain2,
                                     consider_gap=True,
                                     method='muscle'):
    """
    Given two single chain sequences, align them,
    and return sequence similarity among them.

    :param chain1: The first sequence chain.
    :type chain1: `structure._Chain`

    :param chain2: The second sequence chain.
    :type chain2: `structure._Chain`

    :param consider_gap: Whether or not to consider gaps in the alignment,
                         default to true.
    :type consider_gap: bool

    :param method: Which alignment method to use ('muscle' or 'clustalw')
    :type  method: string

    :return: Sequence similarity of the alignment of the two.
    :rtype: float, between 0.0 and 1.0
    """
    with ioredirect.IOSilence():
        aln = align_from_chains([chain1, chain2], method=method)
    return aln[0].getSimilarity(aln[1], consider_gap)


[docs]def create_alignment_from_chains(chains):
    """
    Return `ProteinAlignment` object comprised of two chains

    :param chains: Chains to be aligned
    :type  chains: iterable(structure._Chain)
    """
    # Dev note: Consider moving this to a ProteinSequence class method

    # To avoid a circular import
    from schrodinger.protein.alignment import ProteinAlignment

    aln = ProteinAlignment()
    for chain in chains:
        seq = _get_protein_sequence(chain)
        aln.addSeq(seq)
    return aln


[docs]def align_alignment(aln, second_aln=None, method='muscle'):
    """
    Perform alignment from an ProteinAlignment object

    :param aln: Alignment data
    :type  aln: `ProteinAlignment`
    :param method: Which method/program to use
    :type  method: string

    :return: Aligned sequences
    :rtype: `ProteinAlignment`
    """

    if method == 'muscle':
        from schrodinger.protein.tasks import muscle
        job = muscle.MuscleJob(aln, second_aln=second_aln)
        new_aln = job.run()
    elif method == 'clustalw':
        from schrodinger.protein.tasks import clustal
        if second_aln:
            raise NotImplementedError("""
            ClustalW2 alignment of alignments (profiling)
            is not supported directly via this API. Consider using
            MUSCLE instead (method='muscle')
            """)
        job = clustal.ClustalJob(aln)
        new_aln = job.run()
    else:
        raise ValueError("Unsupported alignment method: {}".format(method))

    return new_aln


[docs]def align_from_chains(chains, method='muscle'):
    """
    Perform alignment on a series of chains

    :param chains: Chains to be aligned
    :type  chains: iterable(structure._Chain)

    :param method: Which method/program to use (choices 'muscle', 'clustalw')
    :type  method: string

    :return: Aligned sequences
    :rtype: `ProteinAlignment`
    """
    aln = create_alignment_from_chains(chains)
    return align_alignment(aln, method=method)


[docs]def get_aligned_residues(st1, st2, method='muscle'):
    """
    This generator will yield 2 structure._Residue objects - one from each
    structure - for each position in aligned sequences.

    :param st1: First structure.
    :type st1: `structure.Structure`

    :param st2: Second structure
    :type st2: `structure.Structure`

    :return: Generates tuples of 2 residues that align at each position.
    :rtype: generator(structure._Residue or None, structure._Residue or None)

    :raises ValueError: if structures don't have equivalent chains.

    """
    if st1.chain.chainNames() != st2.chain.chainNames():
        raise ValueError("Input structures must have same chains")

    for chain1, chain2 in zip(st1.chain, st2.chain):
        alignment = align_from_chains([chain1, chain2], method=method)

        residues1 = list(chain1.residue)
        residues2 = list(chain2.residue)

        for seq_res1, seq_res2 in zip(*alignment):
            st_res1 = residues1[
                seq_res1.gapless_idx_in_seq] if seq_res1.is_res else None
            st_res2 = residues2[
                seq_res2.gapless_idx_in_seq] if seq_res2.is_res else None
            yield st_res1, st_res2


[docs]def get_aligned_structure_residues(sts, method='muscle'):
    """
    This generator will yield 2 structure._Residue objects - one from each
    structure for each position in aligned sequences.

    :param sts: Structures to align
    :type sts: list(structure.Structure)

    :return: Generates lists of residues that align at each position.
    :rtype: generator(list[structure._Residue or None])
    """

    # To avoid a circular import
    from schrodinger.protein.alignment import ProteinAlignment

    aln = ProteinAlignment()
    for st in sts:
        seq = ProteinSequence([res.getCode() for res in st.residue])
        # sequence.chain = chain.name
        aln.addSeq(seq)

    alignment = align_alignment(aln, method=method)

    residues_by_st = [list(st.residue) for st in sts]

    for aligned_residues in zip(*alignment):
        residues = [
            residues_by_st[i][seq_res.gapless_idx_in_seq]
            if seq_res.is_res else None
            for i, seq_res in enumerate(aligned_residues)
        ]
        yield residues


[docs]def offset_indices(indices):
    """
    Offset insertion indices based on numbering before insertion to reflect
    numbering after insertion.

    For example, [1, 1, 2, 3, 5, 8] would be changed to [1, 2, 4, 6, 9, 13]

    :type indices: list[int]
    """
    sorted_indices = sorted(indices)
    for i, cur_index in enumerate(sorted_indices):
        sorted_indices[i] = cur_index + i
    return sorted_indices