Source code for schrodinger.ui.sequencealignment.fileio

"""
File I/O handling routines for the multiple sequence viewer.

Copyright Schrodinger, LLC. All rights reserved.
"""

# Contributors: Piotr Rotkiewicz

import gzip
import itertools
import os
import string

from schrodinger import structure
from schrodinger.application.msv.seqio import parse_fasta_header
from schrodinger.infra import mm
from schrodinger.structutils.analyze import evaluate_asl

from . import constants
from .residue import Residue
from .sequence import Sequence
from .tree_node import TreeNode


[docs]def partition_by_predicate(arr, pred):
    """
    Utility function to groups a list into lists, with each sublist beginning
    with an element that matches the supplied predicate

    Note that many file reading functions below would benefit from using this
    function.

    :param arr: A list to split into sublists
    :type arr: list

    :param pred: A function that takes a list item and returns True if the list
        item meets a criteria and False otherwise
    :type pred: function

    This is not efficient, since we loop through the array twice, but it
    probably doesn't matter.
    """
    counter = 0
    group_info = []
    for item in arr:
        if pred(item):
            counter += 1
        group_info.append(counter)
    array_with_grouping_info = list(zip(arr, group_info))

    for _, group in itertools.groupby(array_with_grouping_info, lambda x: x[1]):
        yield [item for (item, info) in group]


[docs]def load_fasta_file(sequence_group, file_name, text=None):
    """
    Load a sequence file in FASTA format, create sequences and append them
    to the sequence group. Splits sequence name from the FASTA header.

    :param sequence_group: sequence group to which the sequences will be added
    :type sequence_group: SequenceGroup

    :param file_name: name of input FASTA file
    :type file_name: str

    :param text: optional text in FASTA format used instead of the input file,
        split by newline char into a list of lines
    :type text: list of str
    """
    lines = text
    if not lines:
        try:
            with open(file_name) as f:
                lines = [line for line in f.readlines() if line]
        except IOError:
            return False

    for group in partition_by_predicate(lines, lambda x: x.startswith('>')):
        seq = Sequence()
        header = group.pop(0)
        fasta_parts = parse_fasta_header(header)
        split_name = fasta_parts.name
        chain_id = fasta_parts.chain
        residues = "".join(group)
        if not residues:
            continue
        seq.name = header[1:].rstrip()
        seq.chain_id = chain_id
        seq.makeShortName(name=split_name)
        seq.appendResidues(residues, use_numbers=True)
        sequence_group.sequences.append(seq)
    return True


[docs]def load_PIR_file(sequence_group, file_name):
    """
    Load a sequence file in PIR format, create sequences and append them
    to the sequence group.

    :param sequence_group: sequence group to which the sequences will be added
    :type sequence_group: SequenceGroup

    :param file_name: name of input PIR file
    :type file_name: string
    """
    try:
        inp_file = open(file_name, "r")
    except:
        return False
    if not inp_file:
        return False
    lines = inp_file.readlines()
    inp_file.close()
    if not lines or len(lines) == 0:
        return False
    seq = None
    skip = False
    string = ""

    for line in lines:
        if len(line) == 0:
            continue
        if skip:
            skip = False
            continue
        if line[0] == '>':
            if seq:
                seq.appendResidues(string)
            seq = Sequence()
            string = ""
            pos = line.find("P1;")
            if pos >= 0:
                line = line[pos + 2:]
            seq.name = [ch for ch in line[1:] if ch >= ' ']
            seq.makeShortName()
            sequence_group.sequences.append(seq)
            skip = True
        elif seq:
            string = string + line
    if seq:
        seq.appendResidues(string)
    return True


[docs]def load_GCG_file(sequence_group, file_name):
    """
    Load a sequence file in GCG format, create sequences and append them
    to the sequence group.

    :param sequence_group: sequence group to which the sequences will be added
    :type sequence_group: SequenceGroup

    :param file_name: name of input PIR file
    :type file_name: string
    """
    try:
        inp_file = open(file_name, "r")
    except:
        return False
    if not inp_file:
        return False
    lines = inp_file.readlines()
    inp_file.close()
    if not lines or len(lines) == 0:
        return False
    seq = None
    sequences = {}
    name_list = []
    for line in lines:
        if len(line) < 2:
            continue
        if "\\\\" in line:
            continue
        name, sep, seq = line.partition(" ")
        if len(name) > 0 and len(seq) > 0:
            seq = [ch for ch in seq if ch > ' ']
            if name in sequences:
                sequences[name] += seq
            else:
                sequences[name] = seq
                name_list.append(name)
    for name in name_list:
        seq = Sequence()
        seq.appendResidues(sequences[name])
        seq.name = name
        seq.makeShortName()
        sequence_group.sequences.append(seq)
    return True


[docs]def load_EMBL_file(sequence_group, file_name):
    """
    Load a sequence file in EMBL format, create sequences and append them
    to the sequence group.

    :param sequence_group: sequence group to which the sequences will be added
    :type sequence_group: SequenceGroup

    :param file_name: name of input PIR file
    :type file_name: string
    """
    try:
        inp_file = open(file_name, "r")
    except:
        return False
    if not inp_file:
        return False
    lines = inp_file.readlines()
    inp_file.close()
    if not lines or len(lines) == 0:
        return False
    in_seq = False
    codes = ""
    name = None
    for line in lines:
        if line[:2] == "ID":
            if in_seq and name:
                seq = Sequence()
                seq.appendResidues(seq)
                seq.name = name
                seq.makeShortName()
                sequence_group.sequences.append(seq)
                seq = ""
                name = None
                in_seq = False
            split_line = line.split()
            if (len(split_line) > 1 and len(split_line[1])):
                name = split_line[1]
        if in_seq:
            codes += [ch for ch in line if ch > ' ']
        if line[:2] == "SQ":
            in_seq = True
    if in_seq and name:
        seq = Sequence()
        seq.appendResidues(codes)
        seq.name = name
        seq.makeShortName()
        sequence_group.sequences.append(seq)
    return True


[docs]def load_swissprot_file(sequence_group, file_name, text=None):
    """
    Load a sequence file in SWISSPROT format, create sequences and append them
    to the sequence group. Tries to split sequence name from the

    :param sequence_group: sequence group to which the sequences will be added
    :type sequence_group: SequenceGroup

    :param file_name: name of input SWISSPROT file
    :type file_name: string

    :param text: optional text in SWISSPROT format used instead of the input file
    :type text: string
    """
    if text or not file_name:
        lines = text
    else:
        try:
            inp_file = open(file_name, "r")
        except:
            return False
        if not inp_file:
            return False
        lines = inp_file.readlines()
        inp_file.close()
    if not lines or len(lines) == 0:
        return False
    seq = None
    read_sequence = False
    string = ""
    for line in lines:
        if len(line) < 2:
            continue
        id = line[:2]
        if id == 'ID':
            seq = Sequence()
            string = ""
            seq.name = line[2:]
        elif id == 'AC' and seq:
            seq.short_name = line[2:]
        elif id == 'SQ' and seq:
            read_sequence = True
        elif id == '//' and seq:
            if len(string) > 0 and \
               seq.name and \
               seq.short_name:
                seq.appendResidues(string)
                sequence_group.sequences.append(seq)
                seq = None
                string = ""
                read_sequence = False
        elif read_sequence:
            string += line.replace(" ", "")
    return True


[docs]def save_fasta_file(sequence_group,
                    file_name,
                    for_clustal=False,
                    file=None,
                    target_sequence=None,
                    skip_gaps=False,
                    save_annotations=False,
                    selected_only=False,
                    start=-1,
                    end=-1,
                    as_text=False,
                    save_similarity=False):
    """
    Writes a contents of sequence group to a file.

    :param sequence_group: Sequence group to be written to a file
    :type sequence_group: SequenceGroup

    :param file_name: Name of the output file
    :type file_name: string

    :param for_clustal: Optional parameter indicating if the output file
        will be used for Clustal alignment
    :type for_clustal: bool

    :param file: Optional file handle, if not None, this handle will be used
        to write the sequences rather than creating a new file (file_name parameter
        would be ignored)
    :type file: file

    :param target_sequence: Optional sequence to be saved. If not specified,
        all sequences will be written to the output file.
    :type target_sequence: `Sequence`

    :param skip_gaps: Optional parameter deciding if gaps should be written
        to the FASTA file.
    :type skip_gaps: bool

    :param save_annotations: Optional parameter for saving annotations (default
        is False).
    :type save_annotations: bool

    :param save_similarity: Saves similarity when set to True.
    :type save_similarity: bool

    :param start: Optional starting position of the subset residues to save.
    :type start: int

    :param end: Optional ending position of the subset residues to save.
    :type end: int

    :param selected_only: Save only (partially) selected columns if True
    :type selected_only: True
    """
    if not file:
        out_handle = open(file_name, "w")
    else:
        out_handle = file
    if not out_handle:
        return False
    have_selected = sequence_group.hasSelectedSequences()
    for seq in sequence_group.sequences:
        if target_sequence and target_sequence != seq:
            continue
        index = sequence_group.sequences.index(seq)
        if seq.visible and seq.length() > 0 and \
            (not have_selected or seq.selected):
            fasta_name = seq.short_name
            if seq.chain_id > ' ':
                fasta_name += '_' + seq.chain_id
            if save_similarity:
                fasta_name += "|ID:%.2f" % (100.0 * seq.identity) + \
                    "|SIM:%.2f" % (100.0 * seq.similarity) + \
                    "|HOM:%.2f" % (100.0 * seq.homology)
            if as_text:
                out_handle.write(fasta_name + ", ")
            elif for_clustal:
                if seq.type != constants.SEQ_AMINO_ACIDS:
                    continue
                out_handle.write(">SEQ" + str(index) + "\n")
            else:
                out_handle.write(">" + fasta_name + "\n")
            n_char = 0
            for index, res in enumerate(seq.residues):
                if selected_only and not res.selected:
                    continue
                if (for_clustal or skip_gaps) and res.is_gap:
                    # Skip gaps for Clustal.
                    continue
                if start >= 0 and end >= 0 and (index < start or index > end):
                    continue
                out_handle.write(res.code)
                n_char += 1
                if not as_text and n_char == constants.FASTA_LINE_LENGTH:
                    out_handle.write("\n")
                    n_char = 0
            if not as_text and n_char < constants.FASTA_LINE_LENGTH:
                out_handle.write("\n")
            out_handle.write("\n")
            if save_annotations and not for_clustal:
                for child in seq.children:
                    if child.annotation_type == constants.ANNOTATION_SSP or \
                       child.type == constants.SEQ_SECONDARY or \
                       child.annotation_type == constants.ANNOTATION_PFAM:
                        if child.annotation_type == constants.ANNOTATION_SSP:
                            out_handle.write(">" + fasta_name + "_SSP\n")
                        elif child.annotation_type == constants.ANNOTATION_PFAM:
                            out_handle.write(">" + fasta_name + "_PFAM\n")
                        else:
                            out_handle.write(">" + fasta_name + "_SSA\n")
                        n_char = 0
                        for index, res in enumerate(child.residues):
                            if selected_only and not seq.residues[
                                    index].selected:
                                continue
                            if (skip_gaps) and res.is_gap:
                                continue
                            if start >= 0 and end >= 0 and \
                            (index < start or index > end):
                                continue
                            code = res.code
                            if code == ' ':
                                code = '-'
                            elif res.is_gap:
                                code = '~'
                            out_handle.write(code)
                            n_char += 1
                            if n_char == constants.FASTA_LINE_LENGTH:
                                out_handle.write("\n")
                                n_char = 0
                        if n_char < constants.FASTA_LINE_LENGTH:
                            out_handle.write("\n")
                        out_handle.write("\n")
            index += 1
    if not file:
        out_handle.close()
    return True


[docs]def save_clustal_file(sequence_group,
                      file_name,
                      file=None,
                      start=-1,
                      end=-1,
                      ss_constraints=False,
                      subset=None,
                      ignore_selection=False):
    """
    Writes a contents of sequence group to a Clustal ALN file.

    :param sequence_group: Sequence group to be written to a file
    :type sequence_group: SequenceGroup

    :param file_name: Name of the output file
    :type file_name: string

    :param start: Optional starting position of the subset residues to save.
    :type start: int

    :param end: Optional ending position of the subset residues to save.
    :type end: int

    :param ss_constraints: Optional secondary structure constraints.
    :type ss_constraints: True
    """
    if not file:
        out_handle = open(file_name, "w")
    else:
        out_handle = file
    if not out_handle:
        return False
    have_selected = sequence_group.hasSelectedSequences()
    if ignore_selection:
        have_selected = False
    seq_list = []
    ss_list = []
    for seq in sequence_group.sequences:
        if seq.isValidProtein() and seq.length() > 0 and \
           ((subset == "selected" and seq.selected) or
                (subset == "unselected" and seq.unselected) or
                (subset is None and (not have_selected or seq.selected))):
            seq_list.append(seq)
            ss = None
            if ss_constraints:
                for child in seq.children:
                    if child.type == constants.SEQ_SECONDARY or \
                       child.annotation_type == constants.ANNOTATION_SSP:
                        ss = child
                        break
            ss_list.append(ss)
    if len(seq_list) == 0:
        return False
    out_handle.write("CLUSTAL W (2.0) multiple sequence alignment\n\n")
    saved = True
    pos = 0
    while saved:
        saved = False
        for index, seq in enumerate(seq_list):
            ss_residues = None
            if ss_list[index] is not None:
                if start >= 0 and end >= 0:
                    ss_residues = ss_list[index].residues[start:end]
                else:
                    ss_residues = ss_list[index].residues
                if ss_residues and pos < len(ss_residues):
                    ss_res_list = ss_residues[pos:pos + 60]
                    out_handle.write("!SS_SEQ" +
                                     str(sequence_group.sequences.index(seq)) +
                                     "\t")
                    for res in ss_res_list:
                        if res.code == 'H':
                            out_handle.write('A')
                        elif res.code == 'E':
                            out_handle.write('B')
                        else:
                            out_handle.write('.')
                    out_handle.write("\n")
                    out_handle.write("!GM_SEQ" +
                                     str(sequence_group.sequences.index(seq)) +
                                     "\t")
                    for res in ss_res_list:
                        if res.code == 'H':
                            out_handle.write('9')
                        elif res.code == 'E':
                            out_handle.write('9')
                        else:
                            out_handle.write('1')
                    out_handle.write("\n")
        for index, seq in enumerate(seq_list):
            if start >= 0 and end >= 0:
                residues = seq.residues[start:end]
            else:
                residues = seq.residues
            if pos < len(residues):
                res_list = residues[pos:pos + 60]
                out_handle.write("SEQ" +
                                 str(sequence_group.sequences.index(seq)) +
                                 "\t")
                saved = True
                for res in res_list:
                    if res.is_gap:
                        out_handle.write('.')
                    else:
                        out_handle.write(res.code)
                out_handle.write("\n")
        pos += 60
        out_handle.write("\n\n")
    if not file:
        out_handle.close()
    return True


[docs]def load_DND_tree(file_name, sequence_group):
    """
    Load Newick-formatted tree file outputted by multiple sequence alignment
    program. The function was tested using outputs of ClustalW and T-Coffee.

    :param file_name: name of the input file
    :type file_name: string

    :param sequence_group: target sequence group
    :type sequence_group: SequenceGroup

    :return: True if operation succeeded, False otherwise
    :rtype: boolean
    """
    try:
        # Load input file.
        inp_file = open(file_name, "r")
        if not inp_file:
            return False
    except IOError:
        return False
    lines = inp_file.readlines()
    inp_file.close()
    # Concatenate lines to create Newick-formatted string.
    dnd_string = "".join(lines).replace("\n", "").replace("\r", "").\
                     replace(" ", "")
    # Create an empty tree.
    tree = TreeNode(0)
    # Parse the DND string and build the NJ-tree.
    parse_DND_string(dnd_string, tree)
    # Find a root node.
    root = None
    tree = tree.branches[0]
    for branch in tree.branches:
        if len(branch.branches) == 0:
            if root:  # More than 1 leafs, quit.
                break
            root = branch
    # By default, Clustal generates a rooted tree. We need to unroot-it
    # to avoid triple-forked trees.
    if root:
        tree.branches.remove(root)
        new_tree = TreeNode(0)
        new_tree.branches.append(tree)
        new_tree.branches.append(root)
        tree = new_tree
    tree.buildSequenceList(sequence_group)
    sequence_group.tree = tree
    # sequence_group.sortByTreeOrder()
    return tree


[docs]def parse_DND_string(dndstring, tree):
    """
    Parse a dnd-formatted string, generate a tree and and append its branches
    to a given tree.

    :type dndstring: string
    :param dndstring: tree in DND format

    :type tree: `TreeNode`
    :param tree: target tree
    """
    level = 0
    while len(dndstring) > 0:
        if dndstring[0] == "(":
            level += 1
            dndstring = dndstring[1:]
            node = TreeNode(level)
            node.parent = tree
            tree.branches.append(node)
            tree = node
        elif dndstring[0] == ")":
            level -= 1
            dndstring = dndstring[1:]
            tree = tree.parent
        elif dndstring[0] == ";":
            break
        else:
            right_pos = dndstring.find('(')
            left_pos = dndstring.find(')')
            col_pos = dndstring.find(':')
            comma_pos = dndstring.find(',')
            if col_pos > 0 and (col_pos < right_pos or right_pos < 0) and \
               (col_pos < left_pos or left_pos < 0) and \
               (col_pos < comma_pos or comma_pos < 0):
                # Create a new node with empty branches to indicate a leaf.
                node = TreeNode(level)
                node.parent = tree
                node.name = dndstring[:col_pos]
                tree.branches.append(node)
                # Extract a branch score.
                if comma_pos < left_pos and comma_pos >= 0:
                    left_pos = comma_pos
                dndstring = dndstring[col_pos + 1:]
            else:
                dndstring = dndstring[1:]


[docs]def load_clustal_file(sequence_group,
                      file_name,
                      replace=False,
                      start=-1,
                      end=-1):
    """
    Load a sequence alignment in Clustal format. Add sequences to a specified
    sequence group. By default, this method doesn't replace the old residues,
    but only introduces gaps according to the alignment. Thus, all residue
    meta-data (e.g. Maestro information) will be preserved after doing the
    alignment.

    :type sequence_group: SequenceGroup
    :param sequence_group: target sequence

    :type file_name: string
    :param file_name: input file name

    :type replace: boolean (default=False)
    :param replace: optional parameter, if True, replace existing sequences

    :rtype: bool
    :return: True on success, False otherwise
    """
    try:
        inp_file = open(file_name, "r")
        if not inp_file:
            return False
    except IOError:
        return False
    lines = inp_file.readlines()
    inp_file.close()
    for seq in sequence_group.sequences:
        seq.processed = False
    if replace:
        for line in lines:
            if line[:3] == "SEQ":
                index = int(line[3:8])
                seq = sequence_group.sequences[index]
                if not seq.processed:
                    seq.processed = True
                    seq.residues = []
                seq.appendResidues(line[10:])
        for seq in sequence_group.sequences:
            if seq.processed:
                seq.propagateGapsToChildren()
    else:
        for line in lines:
            if line[:3] == "SEQ":
                index = int(line[3:8])
                seq = sequence_group.sequences[index]
                if not seq.processed:
                    seq.processed = True
                    seq._old_residues = seq.residues
                    seq.residues = []
                seq.appendResidues(line[10:])
        if start >= 0 and end >= 0:
            # Replace the residues between start and end with the residues
            # read from the ClustalW alignment file.
            for seq in sequence_group.sequences:
                if seq.processed:
                    new_residues = seq._old_residues[:]
                    for pos in range(start, end):
                        new_residues.pop(start)
                    for res in reversed(seq.residues):
                        new_residues.insert(start, res)
                    seq.residues = new_residues
        for seq in sequence_group.sequences:
            if seq.processed:
                ungapped = [res for res in seq._old_residues if not res.is_gap]
                if seq.gaplessLength() == len(ungapped):
                    ungapped = [
                        res for res in seq._old_residues if not res.is_gap
                    ]
                    old_index = 0
                    for index, res in enumerate(seq.residues):
                        if not res.is_gap:
                            seq.residues[index] = ungapped[old_index]
                            old_index += 1
                else:
                    # Something is very wrong, sequence lengths don't match.
                    # We replace the residues, so all residue-level
                    # metadata is going to be lost. We probably need to issue
                    # a warning message when this happens.
                    pass
                # Just get rid of the old sequence.
                seq._old_residues = None
                seq.propagateGapsToChildren()
    return True


[docs]def pdb_create_sequence(pdb_id, chain_id, sequence_string):
    """
    Creates a new sequence out of sequence string read from a PDB file.

    :type pdb_id: str
    :param pdb_id: PDB ID (4-letter code)

    :type chain_id: str
    :param chain_id: single-letter chain ID

    :type sequence_string: str
    :param sequence_string: single-letter code amino acid string to be converted
        to the sequence

    :rtype: `Sequence`
    :return: Created sequence.
    """
    seq = Sequence()
    seq.chain_id = chain_id
    seq.type = constants.SEQ_AMINO_ACIDS
    seq.name = pdb_id
    seq.short_name = pdb_id
    seq.makeShortName()
    seq.appendResidues(sequence_string)
    return seq


"""
ATOM      1  N   MET A   1     -14.195  -1.520   4.416
ATOM    447  CA  PHE A  30      -0.555   4.359   0.858  1.00  0.09           C
"""


[docs]def load_PDB_file(sequence_group,
                  file_name,
                  requested_chain_id=None,
                  given_pdb_id=None,
                  align_func=None):
    """
    Reads a PDB file, extracts relevant data and creates the sequence and
    annotations.

    :type sequence_group: `SequenceGroup`
    :param sequence_group: target sequence group

    :type file_name: str
    :param file_name: name of the file to be read

    :type requested_chain_id: str
    :param requested_chain_id: Optional parameter. If specified, only the
        chain ID equal to this parameter will be read.

    :rtype: bool
    :return: True on success, False otherwise.
    """
    if file_name[-3:] == ".gz":
        try:
            inp_file = gzip.open(file_name, "r")
            if not inp_file:
                return False
        except:
            return False
    else:
        try:
            inp_file = open(file_name, "r")
            if not inp_file:
                return False
        except IOError:
            return False
    lines = inp_file.readlines()
    inp_file.close()
    sequence_string = ""
    valid_amino_acids = list(constants.AMINO_ACIDS_3_TO_1)
    pdb_id = None
    record_index = 0
    last_record_index = 0
    if given_pdb_id:
        pdb_id = given_pdb_id
    else:
        pdb_id = os.path.basename(file_name)
        pdb_id, ext = os.path.splitext(pdb_id)
        if pdb_id[:3] == "pdb":
            pdb_id = pdb_id[3:]
    if len(pdb_id) >= 4 and pdb_id[0].isdigit():
        pdb_id = pdb_id[:4].upper()
    sequence_dict = {}
    secondary_dict = {}
    structure_dict = {}
    ssbond_dict = {}
    # Multipass processing of the PDB file.
    # Extract PDB ID.
    for line in lines:
        if line.startswith("HEADER"):
            # Try to extract PDB ID from HEADER record.
            new_pdb_id = line[62:66].replace(" ", "")
            if len(new_pdb_id) == 0:
                pdb_id = new_pdb_id
                break
        if line.startswith("ATOM") or line.startswith("END"):
            break
    # Extract SEQRES records.
    chain_id = ""
    for line in lines:
        if line.startswith("SEQRES"):
            # Read SEQRES records and split them into individual chains.
            record_index = int(line[6:10].replace(" ", ""))
            if record_index < last_record_index and \
               len(sequence_string) > 0:
                sequence_dict[chain_id] = pdb_create_sequence(
                    pdb_id, chain_id, sequence_string)
                sequence_string = ""
            last_record_index = record_index
            chain_id = line[11]
            residue_list = line[19:72].split(" ")
            seq_str = "".join([
                constants.AMINO_ACIDS_3_TO_1.get(res, "X")
                for res in residue_list
            ])
            for res in residue_list:
                if len(res) == 3:
                    if res in valid_amino_acids:
                        sequence_string += constants.AMINO_ACIDS_3_TO_1[res]
                    else:
                        sequence_string += "X"
    if len(sequence_string) > 0:
        sequence_dict[chain_id] = pdb_create_sequence(pdb_id, chain_id,
                                                      sequence_string)
        sequence_string = ""

    # Extract secondary structure assignments.
    # They need to be matched with residue numbers, so they will
    # be assigned later on. For now we are only creating
    # placeholders.
    for line in lines:
        if line.startswith("HELIX"):
            record_index = int(line[6:10].replace(" ", ""))
            chain_id = line[19]
            if chain_id not in secondary_dict and chain_id in sequence_dict:
                secondary_dict[chain_id] = \
                    sequence_dict[chain_id].createSecondaryAssignment()
                secondary_dict[chain_id]._tmp_ssa = []
            start = line[20:25].replace(" ", "")
            end = line[32:37].replace(" ", "")
            if chain_id in secondary_dict:
                ssa = secondary_dict[chain_id]
                ssa._tmp_ssa.append(('H', start, end))
        elif line.startswith("SHEET"):
            record_index = int(line[6:10].replace(" ", ""))
            chain_id = line[21]
            if chain_id not in secondary_dict and chain_id in sequence_dict:
                secondary_dict[chain_id] = \
                    sequence_dict[chain_id].createSecondaryAssignment()
                secondary_dict[chain_id]._tmp_ssa = []
            start = line[22:27].replace(" ", "")
            end = line[33:38].replace(" ", "")
            if chain_id in secondary_dict:
                ssa = secondary_dict[chain_id]
                ssa._tmp_ssa.append(('E', start, end))

        elif line.startswith("SSBOND"):
            """
            SSBOND   1 CYS A    3    CYS A   40                          1555   1555  2.00
            SSBOND   2 CYS A    4    CYS A   32                          1555   1555  2.04
            SSBOND   3 CYS A   16    CYS A   26                          1555   1555  2.05
            """
            chain1 = line[15]
            chain2 = line[29]
            resi1 = line[16:22].strip()
            resi2 = line[30:36].strip()
            if chain1 == chain2:
                # For now we only support inter-molecular disulfide bonds.
                chain_id = chain1
                if chain_id not in ssbond_dict and chain_id in sequence_dict:
                    ssbond_dict[chain_id] = \
                        sequence_dict[chain_id].createSSBondAssignment()
                ssbond_dict[chain_id]._tmp_bond_list.append((resi1, resi2))
    amino_acids = list(constants.AMINO_ACIDS_3_TO_1)
    previous_res_id = None
    current_residue = None
    # Should we ignore what we read
    ignore = False
    # Read ATOM data.
    for line in lines:
        if line.startswith("MODEL"):
            model_num = int(line[5:14])
            if model_num > 1:
                # Incorporate only the first model.
                ignore = True
        if line.startswith("ENDMDL"):
            ignore = False
            continue
        if ignore:
            continue
        if line.startswith("ATOM") or line.startswith("HETATM"):
            # Get a residue ID.
            res_id = line[22:27]
            if not previous_res_id or previous_res_id != res_id:
                previous_res_id = res_id
                # Get a chain ID.
                chain_id = line[21]
                if chain_id in structure_dict:
                    structure = structure_dict[chain_id]
                else:
                    structure = Sequence()
                    structure.name = "Atomic Coordinates"
                    structure.short_name = "Atomic Coordinates"
                    structure.type = constants.ANNOTATION_STRUCTURE
                    structure.chain_id = chain_id
                    structure_dict[chain_id] = structure
                # Get a residue name.
                res_name = line[17:20]
                bfactor = line[60:65]
                try:
                    bfactor = float(bfactor)
                except:
                    bfactor = -1.0
                # Convert the residue name to amino acid single letter code.
                if res_name in amino_acids:
                    res_code = constants.AMINO_ACIDS_3_TO_1[res_name]
                    res = Residue()
                    res.name = res_name
                    res.icode = res_id[-1:]
                    res.num = int(res_id[:-1])
                    res.code = res_code
                    res.sequence = structure
                    res.bfactor = bfactor
                    res.coordinates = []
                    current_residue = res
                    res.structureless = False
                    structure.residues.append(res)
            if current_residue:
                current_residue.structure.append(line)
    # Process the structures that don't have sequences (lack of SEQRES record)
    for structure in structure_dict.values():
        chain_id = structure.chain_id
        if chain_id not in sequence_dict:
            sequence_string = structure.text()
            sequence_dict[chain_id] = pdb_create_sequence(
                pdb_id, chain_id, sequence_string)
    # Append the read PDB sequences to the sequence group.
    for seq in sequence_dict.values():
        if not requested_chain_id or \
           seq.chain_id == requested_chain_id:
            seq.has_structure = True
            if seq.chain_id in structure_dict and align_func:
                align_func(seq, structure_dict[seq.chain_id])
            sequence_group.sequences.append(seq)
            if chain_id in ssbond_dict:
                ssb = ssbond_dict[seq.chain_id]
                bond_list = []
                for resi1, resi2 in ssb._tmp_bond_list:
                    ridx1 = seq.getResidueIndex(resi1)
                    ridx2 = seq.getResidueIndex(resi2)
                    if ridx1 is not None and ridx2 is not None:
                        bond_list.append((ridx1, ridx2))
                seq.ssb_bond_list = bond_list
                ssb._tmp_bond_list
                if bond_list:
                    seq.children.append(ssb)
                    ssb.parent_sequence = seq
                    ssb.height = int(0.5 * len(bond_list) + 0.5)
                    for pos in range(len(ssb.residues)):
                        res = seq.residues[pos]
            if seq.chain_id in secondary_dict:
                ssa = secondary_dict[seq.chain_id]
                # Actually assign secondary structure from the read
                # assignments.
                seq.children.append(ssa)
                ssa.parent_sequence = seq
                for pos in range(len(ssa.residues)):
                    ss_res = ssa.residues[pos]
                    res = seq.residues[pos]
                    for ss in ssa._tmp_ssa:
                        code, start, end = ss
                        start_i = int(start.rstrip(string.ascii_letters))
                        end_i = int(end.rstrip(string.ascii_letters))
                        if res.num >= start_i and res.num <= end_i:
                            ss_res.code = code
                    res.ss_code = ss_res.code
    return True


[docs]def save_PDB_file(sequence, file_name):
    lines = [res.structure for res in sequence.residues if res.structure]
    if not lines:
        return False

    output_file = open(file_name, "w")
    output_file.writelines(lines)
    output_file.write("TER\nEND\n")
    output_file.close()

    return True


[docs]def load_file(sequence_group, file_name, format=None, align_func=None):
    """
    Loads a file. The file format can be inferred from the file name extension,
    or can be explictly given.

    :type file_name: string
    :param file_name: input file name

    :type format: string
    :param format: format of the input file

    :rtype: bool
    :return: True if file successfully read; otherwise False
    """
    root, ext = os.path.splitext(file_name)
    ext = ext.lower()
    if format == "fasta" or ext == ".fasta" or ext == ".fst" or \
          ext == ".fas" or ext == ".seq" or ext == ".txt":
        return load_fasta_file(sequence_group, file_name)

    elif format == "pdb" or ext == ".pdb" or ext == ".ent":
        return load_PDB_file(sequence_group, file_name, align_func=align_func)

    elif format == "mae" or ext == ".mae":
        return (load_maestro_file(sequence_group,
                                  file_name,
                                  align_func=align_func))

    elif format == "sp" or ext == ".sp" or ext == ".sw" or \
            ext == ".swiss" or ext == ".swissprot":
        return load_swissprot_file(sequence_group, file_name)

    elif format == "pir" or ext == ".pir":
        return load_PIR_file(sequence_group, file_name)

    elif format == "gcg" or ext == ".msf" or ext == ".gcg":
        return load_GCG_file(sequence_group, file_name)

    elif format == "embl" or ext == ".embl" or ext == ".emb":
        return load_EMBL_file(sequence_group, file_name)


[docs]def load_maestro_file(sequence_group, file_name, align_func=None):
    """
    Load a file in Maestro format.
    """
    valid_amino_acids = list(constants.AMINO_ACIDS_3_TO_1)
    reader = structure.MaestroReader(file_name)
    for st in reader:
        has_seqres = False
        name = None
        if "s_pdb_PDB_ID" in st.property:
            name = st.property["s_pdb_PDB_ID"]
        try:
            data_handle = mm.mmct_ct_m2io_get_unrequested_handle(st)
        except mm.MmException:
            try:
                data_handle = mm.mmct_ct_get_or_open_additional_data(st, True)
            except:
                pass
        chain_seq_dict = {}
        if has_seqres:
            try:
                mm.m2io_goto_block(data_handle, "m_PDB_SEQRES", 1)
                has_seqres = True
            except:
                pass
        for chain in st.chain:
            seq = None
            is_protein = False
            for residue in chain.residue:
                if residue.pdbres[:3] in valid_amino_acids:
                    is_protein = True
                    break
            if not is_protein:
                continue
            secondary = None
            ssb = None
            seq = Sequence()
            seq.type = constants.SEQ_AMINO_ACIDS
            seq.chain_id = chain.name[0]
            seq.color_scheme = constants.COLOR_MAESTRO
            seq.has_structure = True
            sequence_group.sequences.append(seq)
            ssb = seq.createSSBondAssignment()
            ssb.parent_sequence = seq
            seq.children.append(ssb)

            # This code comes from mcpro, zmat.py
            bond_list = []
            asl = "a.pt SG and (res. CYS or res. CYX)"
            if chain.name > ' ':
                asl += " and (chain.name %s)" % str(chain.name)
            s_atoms = evaluate_asl(st, asl)
            found = False
            for s_atom in s_atoms:
                for s2_atom in s_atoms:
                    if mm.mmct_is_atom_bonded(st, s_atom, s2_atom) == mm.TRUE:
                        found = True
                        if s_atom <= s2_atom:
                            resi1 = str(
                                st.atom[s_atom].resnum) + \
                                str(st.atom[s_atom].inscode)
                            resi2 = str(
                                st.atom[s2_atom].resnum) + \
                                str(st.atom[s2_atom].inscode)
                            bond_list.append((resi1, resi2))
            if not found:
                seq.children.remove(ssb)
            secondary = seq.createSecondaryAssignment()
            secondary.parent_sequence = seq
            seq.children.append(secondary)

            if not name:
                name = st.title
            name = name[:]
            short_name = name[:]
            seq.name = name
            seq.short_name = short_name

            residues = []
            ss_residues = []
            ssb_residues = []
            residue_list = structure.get_residues_unsorted(chain)
            for residue in residue_list:
                ca_atom = None
                for atom in residue.atom:
                    if "CA" in atom.pdbname:
                        ca_atom = atom
                        break
                if not ca_atom:
                    continue
                if residue.pdbres[:3] in valid_amino_acids:
                    code = constants.AMINO_ACIDS_3_TO_1[residue.pdbres[:3]]
                else:
                    code = 'X'
                res = Residue()
                res.sequence = seq
                res.code = code
                res.name = residue.pdbres[:3]
                res.num = residue.resnum
                res.icode = residue.inscode
                res.structureless = False
                res.bfactor = ca_atom.temperature_factor
                res.maestro_color = ca_atom.color.rgb
                res.maestro_atom_num = ca_atom.index

                ss_res = Residue()
                ss_res.sequence = seq
                if ssb:
                    ssb_res = Residue()
                    ssb_res.sequence = seq
                    ssb_res.code = ' '
                    ssb_residues.append(ssb_res)
                if ca_atom.secondary_structure == 1:
                    ss_res.code = 'H'
                    res.ss_code = 'H'
                elif ca_atom.secondary_structure == 2:
                    ss_res.code = 'E'
                    res.ss_code = 'E'
                else:
                    ss_res.code = ' '
                    res.ss_code = ' '
                residues.append(res)
                ss_residues.append(ss_res)
            if residues:
                seq.residues = residues
                secondary.residues = ss_residues
                if ssb and ssb in seq.children:
                    ssb.residues = ssb_residues
                    new_bond_list = []
                    for rid1, rid2 in bond_list:
                        ridx1 = seq.getResidueIndex(rid1)
                        ridx2 = seq.getResidueIndex(rid2)
                        if ridx1 is not None and ridx2 is not None:
                            new_bond_list.append((ridx1, ridx2))
                    seq.ssb_bond_list = new_bond_list
                    ssb.bond_list = new_bond_list
                    if not new_bond_list:
                        seq.children.remove(ssb)
                    else:
                        ssb.height = int(len(new_bond_list) * 0.5 + 0.5)
            else:
                sequence_group.sequences.remove(seq)

            if residues and chain.name in chain_seq_dict:
                # Use full SEQRES sequence.
                full_seq = Sequence()
                full_seq.appendResidues(chain_seq_dict[chain.name])
                if align_func:
                    align_func(full_seq, seq)
            for res in seq.residues:
                res.sequence = seq
            for child in seq.children:
                child.parent_sequence = seq
                for res in child.residues:
                    res.sequence = child
    return True