Source code for schrodinger.ui.sequencealignment.align

"""
Sequence alignment routines for the multiple sequence viewer.

Copyright Schrodinger, LLC. All rights reserved.

"""

# Contributors: Piotr Rotkiewicz

from . import constants
from . import utils
from .residue import Residue


[docs]def align(first_sequence, second_sequence, gap_open_penalty=-1.0, gap_extend_penalty=0.0, scoring_matrix=None, direct_scores=False, constraints=None, merge=False, merge_selected=False, last_to_merge=None, ss_constraints=False, sequence_group=None): """ This is an implementation of constrained dynamic programming algorithm for pairwise sequence alignment. :type first_sequence: `Sequence` :param first_sequence: first sequence to be aligned :type second_sequence: `Sequence` :param second_sequence: second sequence to be aligned :type gap_open_penalty: float :param gap_open_penalty: Gap opening penalty. :type gap_extend_penalty: float :param gap_extend_penalty: Gap extension penalty. :type scoring_matrix: 2D float array :param scoring_matrix: Scoring matrix to be used for the alignment. If no matrix is specified, this method uses residue identity measure. :type constraints: list of (int, int) :param constraints: Optional list of pairwise constraints. :type merge: boolean :param merge: Merge the aligned sequence with the exisiting alignment. :type direct_scores: boolean :param direct_scores: Use scoring matrix directly as (NxM) where N, M are lengths of both sequences rather than default 20x20 substitution matrix. :type ss_constraints: boolean :param ss_constraints: Impose secondary structure constraints during the alignment. """ # Make sequence copies for later gaps merging. if merge: first_original = first_sequence.copyForUndo(deep_copy=False) first_sequence.removeAllGaps() second_sequence.removeAllGaps() len1 = first_sequence.length() len2 = second_sequence.length() # Allocate DP matrices. score_matrix = [] trace_matrix = [] for y in range(0, len2 + 1): score_matrix.append([0] * (len1 + 2)) trace_matrix.append([0] * (len1 + 2)) for y in range(1, len2 + 1): trace_matrix[y][0] = 2 trace_matrix[y][len1] = 2 for x in range(1, len1 + 1): trace_matrix[0][x] = 1 trace_matrix[len2][x] = 1 best_x = 0 best_y = 0 best_score = 0 # Locate SSA or SSP if there are any. first_ss = None for child in first_sequence.children: if child.type == constants.SEQ_SECONDARY: first_ss = child break if child.type == constants.SEQ_ANNOTATION and \ child.annotation_type == constants.ANNOTATION_SSP: first_ss = child break second_ss = None for child in second_sequence.children: if child.type == constants.SEQ_SECONDARY: second_ss = child break if child.type == constants.SEQ_ANNOTATION and \ child.annotation_type == constants.ANNOTATION_SSP: second_ss = child break # Score matrix build step. for y in range(1, len2 + 1): for x in range(1, len1 + 1): h_score = score_matrix[y][x - 1] if trace_matrix[y][x - 1] != 1: h_score += gap_open_penalty else: h_score += gap_extend_penalty v_score = score_matrix[y - 1][x] if trace_matrix[y - 1][x] != 2: v_score += gap_open_penalty else: v_score += gap_extend_penalty code1 = first_sequence.residues[x - 1].code code2 = second_sequence.residues[y - 1].code score = 0 if scoring_matrix: if direct_scores: score = scoring_matrix[x - 1][y - 1] else: score = utils.matrixValue(scoring_matrix, code1, code2) else: # Use residue identity criteria if no matrix is specified. if code1 == code2: score = 1 if constraints: for constraint in constraints: start, end, seq, prime = constraint if start == x - 1 and \ end == y - 1: score += 1e6 sec = False if ss_constraints: if first_ss and \ (first_ss.residues[x - 1].code == 'H' or first_ss.residues[x - 1].code == 'E'): sec = True if second_ss and \ (second_ss.residues[y - 1].code == 'H' or second_ss.residues[y - 1].code == 'E'): sec = True d_score = score_matrix[y - 1][x - 1] + score max_score = max(d_score, h_score, v_score) if (max_score == d_score) or sec: score_matrix[y][x] = d_score trace_matrix[y][x] = 0 elif max_score == h_score: score_matrix[y][x] = h_score trace_matrix[y][x] = 1 else: score_matrix[y][x] = v_score trace_matrix[y][x] = 2 if max_score > best_score: best_score = max_score best_x = x best_y = y # Traceback step. new_sequence_1 = [] new_sequence_2 = [] y = best_y x = best_x while x < len1: x += 1 trace_matrix[y][x] = 1 while y < len2: y += 1 trace_matrix[y][x] = 2 while x > 0 or y > 0: if trace_matrix[y][x] == 0: y -= 1 x -= 1 new_sequence_1.append(first_sequence.residues[x]) new_sequence_2.append(second_sequence.residues[y]) elif trace_matrix[y][x] == 1: if x >= 0: x -= 1 new_sequence_1.append(first_sequence.residues[x]) gap = Residue() gap.is_gap = True gap.code = constants.UNLOCKED_GAP_SYMBOL gap.sequence = second_sequence new_sequence_2.append(gap) else: if y >= 0: y -= 1 new_sequence_2.append(second_sequence.residues[y]) gap = Residue() gap.is_gap = True gap.code = constants.UNLOCKED_GAP_SYMBOL gap.sequence = first_sequence new_sequence_1.append(gap) # The traceback step creates reverted sequences, so revert them back. first_sequence.residues = new_sequence_1[::-1] second_sequence.residues = new_sequence_2[::-1] # Merge original gaps with the existing alignment if necessary. if merge and sequence_group: # There are two steps: # 1) copy gaps from aligned query to other sequence in the alignment # 2) copy gaps from original query to second aligned sequence # Thus the aligned residue correspondence doesn't change # and the new alignment is merged with the original one. # Build an insertion position list for the first aligned sequence. insertion_list = [] last_nongap = None for pos, res in enumerate(first_sequence.residues): if res.is_gap: # This is a gap. Find a corresponding alignment index # in the first original sequence. if last_nongap: index = first_original.residues.index(last_nongap) insertion_list.append(index + 1) else: insertion_list.append(0) else: last_nongap = res # Build an insertion position list for the original sequence. second_insertion_list = [] last_nongap = None for pos, res in enumerate(first_original.residues): if res.is_gap: # This is a gap. Find a corresponding alignment index # in the first original sequence. if last_nongap: index = first_sequence.residues.index(last_nongap) second_insertion_list.append(index + 1) else: second_insertion_list.append(0) else: last_nongap = res # We are going to insert these gaps anyway. first_sequence.residues = first_original.residues # Now insert gaps at the insertion positions in the alignment # in all sequences except for the merged one. for seq in sequence_group.sequences: if seq.type == constants.SEQ_AMINO_ACIDS and \ seq != second_sequence and \ (seq.selected or (not merge_selected) or seq == first_sequence): offset = 0 for gap_pos in insertion_list: gap = Residue() gap.is_gap = True gap.code = constants.UNLOCKED_GAP_SYMBOL gap.sequence = seq seq.residues.insert(gap_pos + offset, gap) offset += 1 seq.propagateGapsToChildren() if last_to_merge == seq: break # Now insert the original gaps into the second aligned sequence seq = second_sequence offset = 0 for gap_pos in second_insertion_list: gap = Residue() gap.is_gap = True gap.code = constants.UNLOCKED_GAP_SYMBOL gap.sequence = seq seq.residues.insert(gap_pos + offset, gap) offset += 1 second_sequence.propagateGapsToChildren() else: # No merging - just propagate gaps to children. first_sequence.propagateGapsToChildren() second_sequence.propagateGapsToChildren()