Source code for schrodinger.ui.sequencealignment.align
"""
Sequence alignment routines for the multiple sequence viewer.
Copyright Schrodinger, LLC. All rights reserved.
"""
# Contributors: Piotr Rotkiewicz
from . import constants
from . import utils
from .residue import Residue
[docs]def align(first_sequence,
second_sequence,
gap_open_penalty=-1.0,
gap_extend_penalty=0.0,
scoring_matrix=None,
direct_scores=False,
constraints=None,
merge=False,
merge_selected=False,
last_to_merge=None,
ss_constraints=False,
sequence_group=None):
"""
This is an implementation of constrained dynamic programming algorithm
for pairwise sequence alignment.
:type first_sequence: `Sequence`
:param first_sequence: first sequence to be aligned
:type second_sequence: `Sequence`
:param second_sequence: second sequence to be aligned
:type gap_open_penalty: float
:param gap_open_penalty: Gap opening penalty.
:type gap_extend_penalty: float
:param gap_extend_penalty: Gap extension penalty.
:type scoring_matrix: 2D float array
:param scoring_matrix: Scoring matrix to be used for the alignment.
If no matrix is specified, this method uses residue identity measure.
:type constraints: list of (int, int)
:param constraints: Optional list of pairwise constraints.
:type merge: boolean
:param merge: Merge the aligned sequence with the exisiting alignment.
:type direct_scores: boolean
:param direct_scores: Use scoring matrix directly as (NxM) where N, M are
lengths of both sequences rather than default 20x20 substitution matrix.
:type ss_constraints: boolean
:param ss_constraints: Impose secondary structure constraints
during the alignment.
"""
# Make sequence copies for later gaps merging.
if merge:
first_original = first_sequence.copyForUndo(deep_copy=False)
first_sequence.removeAllGaps()
second_sequence.removeAllGaps()
len1 = first_sequence.length()
len2 = second_sequence.length()
# Allocate DP matrices.
score_matrix = []
trace_matrix = []
for y in range(0, len2 + 1):
score_matrix.append([0] * (len1 + 2))
trace_matrix.append([0] * (len1 + 2))
for y in range(1, len2 + 1):
trace_matrix[y][0] = 2
trace_matrix[y][len1] = 2
for x in range(1, len1 + 1):
trace_matrix[0][x] = 1
trace_matrix[len2][x] = 1
best_x = 0
best_y = 0
best_score = 0
# Locate SSA or SSP if there are any.
first_ss = None
for child in first_sequence.children:
if child.type == constants.SEQ_SECONDARY:
first_ss = child
break
if child.type == constants.SEQ_ANNOTATION and \
child.annotation_type == constants.ANNOTATION_SSP:
first_ss = child
break
second_ss = None
for child in second_sequence.children:
if child.type == constants.SEQ_SECONDARY:
second_ss = child
break
if child.type == constants.SEQ_ANNOTATION and \
child.annotation_type == constants.ANNOTATION_SSP:
second_ss = child
break
# Score matrix build step.
for y in range(1, len2 + 1):
for x in range(1, len1 + 1):
h_score = score_matrix[y][x - 1]
if trace_matrix[y][x - 1] != 1:
h_score += gap_open_penalty
else:
h_score += gap_extend_penalty
v_score = score_matrix[y - 1][x]
if trace_matrix[y - 1][x] != 2:
v_score += gap_open_penalty
else:
v_score += gap_extend_penalty
code1 = first_sequence.residues[x - 1].code
code2 = second_sequence.residues[y - 1].code
score = 0
if scoring_matrix:
if direct_scores:
score = scoring_matrix[x - 1][y - 1]
else:
score = utils.matrixValue(scoring_matrix, code1, code2)
else:
# Use residue identity criteria if no matrix is specified.
if code1 == code2:
score = 1
if constraints:
for constraint in constraints:
start, end, seq, prime = constraint
if start == x - 1 and \
end == y - 1:
score += 1e6
sec = False
if ss_constraints:
if first_ss and \
(first_ss.residues[x - 1].code == 'H' or
first_ss.residues[x - 1].code == 'E'):
sec = True
if second_ss and \
(second_ss.residues[y - 1].code == 'H' or
second_ss.residues[y - 1].code == 'E'):
sec = True
d_score = score_matrix[y - 1][x - 1] + score
max_score = max(d_score, h_score, v_score)
if (max_score == d_score) or sec:
score_matrix[y][x] = d_score
trace_matrix[y][x] = 0
elif max_score == h_score:
score_matrix[y][x] = h_score
trace_matrix[y][x] = 1
else:
score_matrix[y][x] = v_score
trace_matrix[y][x] = 2
if max_score > best_score:
best_score = max_score
best_x = x
best_y = y
# Traceback step.
new_sequence_1 = []
new_sequence_2 = []
y = best_y
x = best_x
while x < len1:
x += 1
trace_matrix[y][x] = 1
while y < len2:
y += 1
trace_matrix[y][x] = 2
while x > 0 or y > 0:
if trace_matrix[y][x] == 0:
y -= 1
x -= 1
new_sequence_1.append(first_sequence.residues[x])
new_sequence_2.append(second_sequence.residues[y])
elif trace_matrix[y][x] == 1:
if x >= 0:
x -= 1
new_sequence_1.append(first_sequence.residues[x])
gap = Residue()
gap.is_gap = True
gap.code = constants.UNLOCKED_GAP_SYMBOL
gap.sequence = second_sequence
new_sequence_2.append(gap)
else:
if y >= 0:
y -= 1
new_sequence_2.append(second_sequence.residues[y])
gap = Residue()
gap.is_gap = True
gap.code = constants.UNLOCKED_GAP_SYMBOL
gap.sequence = first_sequence
new_sequence_1.append(gap)
# The traceback step creates reverted sequences, so revert them back.
first_sequence.residues = new_sequence_1[::-1]
second_sequence.residues = new_sequence_2[::-1]
# Merge original gaps with the existing alignment if necessary.
if merge and sequence_group:
# There are two steps:
# 1) copy gaps from aligned query to other sequence in the alignment
# 2) copy gaps from original query to second aligned sequence
# Thus the aligned residue correspondence doesn't change
# and the new alignment is merged with the original one.
# Build an insertion position list for the first aligned sequence.
insertion_list = []
last_nongap = None
for pos, res in enumerate(first_sequence.residues):
if res.is_gap:
# This is a gap. Find a corresponding alignment index
# in the first original sequence.
if last_nongap:
index = first_original.residues.index(last_nongap)
insertion_list.append(index + 1)
else:
insertion_list.append(0)
else:
last_nongap = res
# Build an insertion position list for the original sequence.
second_insertion_list = []
last_nongap = None
for pos, res in enumerate(first_original.residues):
if res.is_gap:
# This is a gap. Find a corresponding alignment index
# in the first original sequence.
if last_nongap:
index = first_sequence.residues.index(last_nongap)
second_insertion_list.append(index + 1)
else:
second_insertion_list.append(0)
else:
last_nongap = res
# We are going to insert these gaps anyway.
first_sequence.residues = first_original.residues
# Now insert gaps at the insertion positions in the alignment
# in all sequences except for the merged one.
for seq in sequence_group.sequences:
if seq.type == constants.SEQ_AMINO_ACIDS and \
seq != second_sequence and \
(seq.selected or (not merge_selected) or seq == first_sequence):
offset = 0
for gap_pos in insertion_list:
gap = Residue()
gap.is_gap = True
gap.code = constants.UNLOCKED_GAP_SYMBOL
gap.sequence = seq
seq.residues.insert(gap_pos + offset, gap)
offset += 1
seq.propagateGapsToChildren()
if last_to_merge == seq:
break
# Now insert the original gaps into the second aligned sequence
seq = second_sequence
offset = 0
for gap_pos in second_insertion_list:
gap = Residue()
gap.is_gap = True
gap.code = constants.UNLOCKED_GAP_SYMBOL
gap.sequence = seq
seq.residues.insert(gap_pos + offset, gap)
offset += 1
second_sequence.propagateGapsToChildren()
else:
# No merging - just propagate gaps to children.
first_sequence.propagateGapsToChildren()
second_sequence.propagateGapsToChildren()