Source code for schrodinger.protein.tasks.muscle
# Runs MUSCLE alignment jobs
# This module provides an interface similar to clustal.py's.
# They might be merged in the future to reduce code duplication
import copy
import glob
import os
import tempfile
from schrodinger.application.msv import seqio
from schrodinger.protein.tasks import clustal
from schrodinger.utils import fileutils
[docs]def get_muscle_path():
"""
Returns the path to the MUSCLE executable.
:rtype: str
:return: path to MUSCLE executable file, or None if the executable could
not be located.
"""
pattern = os.path.join(fileutils.get_mmshare_dir(), "bin", "*", "muscle*")
results = glob.glob(pattern)
if results:
return results[0]
print("No MUSCLE exec found!")
return None
[docs]class MuscleJob(clustal.AbstractAlignmentJob):
"""
Runs alignment via MUSCLE program
:note: Muscle by default rearranges the order to put similar sequences
together
"""
[docs] def __init__(self, aln, second_aln=None, quiet=True, maintain_order=True):
"""
:param aln: Input sequences
:type aln: ProteinAlignment
:param second_aln: Second alignment for aligning multiple MSAs
:type second_aln: ProteinAlignment
:param maintain_order: Whether to maintain input order of sequences
:type maintain_order: bool
Dev note:
- Unimplemented features:
-- Speedup options
"""
super().__init__(aln, second_aln)
self.quiet = quiet
self.maintain_order = maintain_order
[docs] def run(self):
"""
Run the MUSCLE alignment.
:return: Aligned sequences
:rtype: ProteinAlignment
"""
muscle_exe = get_muscle_path()
if not muscle_exe:
raise RuntimeError("Could not locate MUSCLE executable!")
to_write = self.aln
if self.second_aln:
# Pad alignments to prevent muscle from adding trailing X
self.aln.padAlignment()
self.second_aln.padAlignment()
else:
# When only one input alignment is passed to Muscle, it ignores
# gap-only sequences and excludes them from the output. To work
# around this, we remove these sequences and manually add them back
# to the output alignment.
empty_seq_indices = [
i for i, seq in enumerate(self.aln)
if not seq.getGaplessLength()
]
if empty_seq_indices:
to_write = copy.deepcopy(self.aln)
empty_seqs = [to_write[i] for i in empty_seq_indices]
to_write.removeSeqs(empty_seqs)
tmpdir_obj = tempfile.TemporaryDirectory()
with tmpdir_obj as tmpdir:
inp_fname = os.path.join(tmpdir, 'input.fasta')
inp_fname2 = None
out_fname = os.path.join(tmpdir, 'output.fasta')
if self.second_aln:
inp_fname2 = os.path.join(tmpdir, 'input2.fasta')
seqio.FastaAlignmentWriter.write(self.second_aln, inp_fname2)
in_names = seqio.FastaAlignmentWriter.write(to_write, inp_fname)
command = self._makeCommand(muscle_exe=muscle_exe,
inp_fname=inp_fname,
out_fname=out_fname,
inp_fname2=inp_fname2)
# execute MUSCLE
self._run(command)
if self.canceled:
return None
# read the aligned sequences
new_alignment = seqio.FastaAlignmentReader.read(out_fname)
try:
tmpdir_obj.cleanup()
except OSError:
# Debugging for MSV-3768
print(os.listdir(tmpdir))
if not self.second_aln and self.maintain_order: # Reorder as input
# profile mode does not change the sequence order in each input MSA
# Note MUSCLE has a built in -stable option which is buggy (and
# disabled). See http://www.drive5.com/muscle/manual/stable.html
seqio.reorder_fasta_alignment(new_alignment, in_names)
if not self.second_aln and empty_seq_indices:
# add gap-only sequences back in
for i, seq in zip(empty_seq_indices, empty_seqs):
# muscle pads sequences with trailing gaps, so we do the same
# for the gap-only sequences
seq.removeElements(seq)
seq.extend(seq.gap_char * new_alignment.num_columns)
new_alignment.addSeq(seq, i)
return new_alignment
def _makeCommand(self, muscle_exe, inp_fname, out_fname, inp_fname2=None):
command = [muscle_exe, '-out', out_fname]
if inp_fname2 is None:
command.extend(['-in', inp_fname])
else: # If aligning alignments
command.extend(['-profile', '-in1', inp_fname, '-in2', inp_fname2])
if self.gapopen:
command.extend(['-gapopen', str(-self.gapopen)])
if self.gapext:
command.extend(['-gapextend', str(-self.gapext)])
if self.quiet:
command.extend(['-quiet'])
return command