Source code for schrodinger.ui.sequencealignment.align

"""
Sequence alignment routines for the multiple sequence viewer.

Copyright Schrodinger, LLC. All rights reserved.

"""

# Contributors: Piotr Rotkiewicz

from . import constants
from . import utils
from .residue import Residue


[docs]def align(first_sequence,
          second_sequence,
          gap_open_penalty=-1.0,
          gap_extend_penalty=0.0,
          scoring_matrix=None,
          direct_scores=False,
          constraints=None,
          merge=False,
          merge_selected=False,
          last_to_merge=None,
          ss_constraints=False,
          sequence_group=None):
    """
    This is an implementation of constrained dynamic programming algorithm
    for pairwise sequence alignment.

    :type first_sequence: `Sequence`
    :param first_sequence: first sequence to be aligned

    :type second_sequence: `Sequence`
    :param second_sequence: second sequence to be aligned

    :type gap_open_penalty: float
    :param gap_open_penalty: Gap opening penalty.

    :type gap_extend_penalty: float
    :param gap_extend_penalty: Gap extension penalty.

    :type scoring_matrix: 2D float array
    :param scoring_matrix: Scoring matrix to be used for the alignment.
        If no matrix is specified, this method uses residue identity measure.

    :type constraints: list of (int, int)
    :param constraints: Optional list of pairwise constraints.

    :type merge: boolean
    :param merge: Merge the aligned sequence with the exisiting alignment.

    :type direct_scores: boolean
    :param direct_scores: Use scoring matrix directly as (NxM) where N, M are
        lengths of both sequences rather than default 20x20 substitution matrix.

    :type ss_constraints: boolean
    :param ss_constraints: Impose secondary structure constraints
        during the alignment.
    """
    # Make sequence copies for later gaps merging.
    if merge:
        first_original = first_sequence.copyForUndo(deep_copy=False)
    first_sequence.removeAllGaps()
    second_sequence.removeAllGaps()
    len1 = first_sequence.length()
    len2 = second_sequence.length()
    # Allocate DP matrices.
    score_matrix = []
    trace_matrix = []
    for y in range(0, len2 + 1):
        score_matrix.append([0] * (len1 + 2))
        trace_matrix.append([0] * (len1 + 2))

    for y in range(1, len2 + 1):
        trace_matrix[y][0] = 2
        trace_matrix[y][len1] = 2

    for x in range(1, len1 + 1):
        trace_matrix[0][x] = 1
        trace_matrix[len2][x] = 1
    best_x = 0
    best_y = 0
    best_score = 0

    # Locate SSA or SSP if there are any.
    first_ss = None
    for child in first_sequence.children:
        if child.type == constants.SEQ_SECONDARY:
            first_ss = child
            break
        if child.type == constants.SEQ_ANNOTATION and \
           child.annotation_type == constants.ANNOTATION_SSP:
            first_ss = child
            break
    second_ss = None
    for child in second_sequence.children:
        if child.type == constants.SEQ_SECONDARY:
            second_ss = child
            break
        if child.type == constants.SEQ_ANNOTATION and \
           child.annotation_type == constants.ANNOTATION_SSP:
            second_ss = child
            break
    # Score matrix build step.
    for y in range(1, len2 + 1):
        for x in range(1, len1 + 1):
            h_score = score_matrix[y][x - 1]
            if trace_matrix[y][x - 1] != 1:
                h_score += gap_open_penalty
            else:
                h_score += gap_extend_penalty
            v_score = score_matrix[y - 1][x]

            if trace_matrix[y - 1][x] != 2:
                v_score += gap_open_penalty
            else:
                v_score += gap_extend_penalty
            code1 = first_sequence.residues[x - 1].code
            code2 = second_sequence.residues[y - 1].code
            score = 0

            if scoring_matrix:
                if direct_scores:
                    score = scoring_matrix[x - 1][y - 1]
                else:
                    score = utils.matrixValue(scoring_matrix, code1, code2)
            else:
                # Use residue identity criteria if no matrix is specified.
                if code1 == code2:
                    score = 1
            if constraints:
                for constraint in constraints:
                    start, end, seq, prime = constraint
                    if start == x - 1 and \
                       end == y - 1:
                        score += 1e6
            sec = False
            if ss_constraints:
                if first_ss and \
                   (first_ss.residues[x - 1].code == 'H' or
                        first_ss.residues[x - 1].code == 'E'):
                    sec = True
                if second_ss and \
                   (second_ss.residues[y - 1].code == 'H' or
                        second_ss.residues[y - 1].code == 'E'):
                    sec = True
            d_score = score_matrix[y - 1][x - 1] + score
            max_score = max(d_score, h_score, v_score)

            if (max_score == d_score) or sec:
                score_matrix[y][x] = d_score
                trace_matrix[y][x] = 0
            elif max_score == h_score:
                score_matrix[y][x] = h_score
                trace_matrix[y][x] = 1
            else:
                score_matrix[y][x] = v_score
                trace_matrix[y][x] = 2

            if max_score > best_score:
                best_score = max_score
                best_x = x
                best_y = y
    # Traceback step.
    new_sequence_1 = []
    new_sequence_2 = []
    y = best_y
    x = best_x
    while x < len1:
        x += 1
        trace_matrix[y][x] = 1
    while y < len2:
        y += 1
        trace_matrix[y][x] = 2
    while x > 0 or y > 0:
        if trace_matrix[y][x] == 0:
            y -= 1
            x -= 1
            new_sequence_1.append(first_sequence.residues[x])
            new_sequence_2.append(second_sequence.residues[y])
        elif trace_matrix[y][x] == 1:
            if x >= 0:
                x -= 1
                new_sequence_1.append(first_sequence.residues[x])
                gap = Residue()
                gap.is_gap = True
                gap.code = constants.UNLOCKED_GAP_SYMBOL
                gap.sequence = second_sequence
                new_sequence_2.append(gap)
        else:
            if y >= 0:
                y -= 1
                new_sequence_2.append(second_sequence.residues[y])
                gap = Residue()
                gap.is_gap = True
                gap.code = constants.UNLOCKED_GAP_SYMBOL
                gap.sequence = first_sequence
                new_sequence_1.append(gap)
    # The traceback step creates reverted sequences, so revert them back.
    first_sequence.residues = new_sequence_1[::-1]
    second_sequence.residues = new_sequence_2[::-1]

    # Merge original gaps with the existing alignment if necessary.
    if merge and sequence_group:

        # There are two steps:
        # 1) copy gaps from aligned query to other sequence in the alignment
        # 2) copy gaps from original query to second aligned sequence
        # Thus the aligned residue correspondence doesn't change
        # and the new alignment is merged with the original one.

        # Build an insertion position list for the first aligned sequence.
        insertion_list = []
        last_nongap = None
        for pos, res in enumerate(first_sequence.residues):
            if res.is_gap:
                # This is a gap. Find a corresponding alignment index
                # in the first original sequence.
                if last_nongap:
                    index = first_original.residues.index(last_nongap)
                    insertion_list.append(index + 1)
                else:
                    insertion_list.append(0)
            else:
                last_nongap = res
        # Build an insertion position list for the original sequence.
        second_insertion_list = []
        last_nongap = None
        for pos, res in enumerate(first_original.residues):
            if res.is_gap:
                # This is a gap. Find a corresponding alignment index
                # in the first original sequence.
                if last_nongap:
                    index = first_sequence.residues.index(last_nongap)
                    second_insertion_list.append(index + 1)
                else:
                    second_insertion_list.append(0)
            else:
                last_nongap = res
        # We are going to insert these gaps anyway.
        first_sequence.residues = first_original.residues
        # Now insert gaps at the insertion positions in the alignment
        # in all sequences except for the merged one.
        for seq in sequence_group.sequences:
            if seq.type == constants.SEQ_AMINO_ACIDS and \
               seq != second_sequence and \
               (seq.selected or (not merge_selected) or seq == first_sequence):
                offset = 0
                for gap_pos in insertion_list:
                    gap = Residue()
                    gap.is_gap = True
                    gap.code = constants.UNLOCKED_GAP_SYMBOL
                    gap.sequence = seq
                    seq.residues.insert(gap_pos + offset, gap)
                    offset += 1
                seq.propagateGapsToChildren()
                if last_to_merge == seq:
                    break
        # Now insert the original gaps into the second aligned sequence
        seq = second_sequence
        offset = 0
        for gap_pos in second_insertion_list:
            gap = Residue()
            gap.is_gap = True
            gap.code = constants.UNLOCKED_GAP_SYMBOL
            gap.sequence = seq
            seq.residues.insert(gap_pos + offset, gap)
            offset += 1
        second_sequence.propagateGapsToChildren()
    else:
        # No merging - just propagate gaps to children.
        first_sequence.propagateGapsToChildren()
        second_sequence.propagateGapsToChildren()