Source code for schrodinger.ui.sequencealignment.fileio

"""
File I/O handling routines for the multiple sequence viewer.

Copyright Schrodinger, LLC. All rights reserved.
"""

# Contributors: Piotr Rotkiewicz

import gzip
import itertools
import os
import string

from schrodinger import structure
from schrodinger.application.msv.seqio import parse_fasta_header
from schrodinger.infra import mm
from schrodinger.structutils.analyze import evaluate_asl

from . import constants
from .residue import Residue
from .sequence import Sequence
from .tree_node import TreeNode


[docs]def partition_by_predicate(arr, pred): """ Utility function to groups a list into lists, with each sublist beginning with an element that matches the supplied predicate Note that many file reading functions below would benefit from using this function. :param arr: A list to split into sublists :type arr: list :param pred: A function that takes a list item and returns True if the list item meets a criteria and False otherwise :type pred: function This is not efficient, since we loop through the array twice, but it probably doesn't matter. """ counter = 0 group_info = [] for item in arr: if pred(item): counter += 1 group_info.append(counter) array_with_grouping_info = list(zip(arr, group_info)) for _, group in itertools.groupby(array_with_grouping_info, lambda x: x[1]): yield [item for (item, info) in group]
[docs]def load_fasta_file(sequence_group, file_name, text=None): """ Load a sequence file in FASTA format, create sequences and append them to the sequence group. Splits sequence name from the FASTA header. :param sequence_group: sequence group to which the sequences will be added :type sequence_group: SequenceGroup :param file_name: name of input FASTA file :type file_name: str :param text: optional text in FASTA format used instead of the input file, split by newline char into a list of lines :type text: list of str """ lines = text if not lines: try: with open(file_name) as f: lines = [line for line in f.readlines() if line] except IOError: return False for group in partition_by_predicate(lines, lambda x: x.startswith('>')): seq = Sequence() header = group.pop(0) fasta_parts = parse_fasta_header(header) split_name = fasta_parts.name chain_id = fasta_parts.chain residues = "".join(group) if not residues: continue seq.name = header[1:].rstrip() seq.chain_id = chain_id seq.makeShortName(name=split_name) seq.appendResidues(residues, use_numbers=True) sequence_group.sequences.append(seq) return True
[docs]def load_PIR_file(sequence_group, file_name): """ Load a sequence file in PIR format, create sequences and append them to the sequence group. :param sequence_group: sequence group to which the sequences will be added :type sequence_group: SequenceGroup :param file_name: name of input PIR file :type file_name: string """ try: inp_file = open(file_name, "r") except: return False if not inp_file: return False lines = inp_file.readlines() inp_file.close() if not lines or len(lines) == 0: return False seq = None skip = False string = "" for line in lines: if len(line) == 0: continue if skip: skip = False continue if line[0] == '>': if seq: seq.appendResidues(string) seq = Sequence() string = "" pos = line.find("P1;") if pos >= 0: line = line[pos + 2:] seq.name = [ch for ch in line[1:] if ch >= ' '] seq.makeShortName() sequence_group.sequences.append(seq) skip = True elif seq: string = string + line if seq: seq.appendResidues(string) return True
[docs]def load_GCG_file(sequence_group, file_name): """ Load a sequence file in GCG format, create sequences and append them to the sequence group. :param sequence_group: sequence group to which the sequences will be added :type sequence_group: SequenceGroup :param file_name: name of input PIR file :type file_name: string """ try: inp_file = open(file_name, "r") except: return False if not inp_file: return False lines = inp_file.readlines() inp_file.close() if not lines or len(lines) == 0: return False seq = None sequences = {} name_list = [] for line in lines: if len(line) < 2: continue if "\\\\" in line: continue name, sep, seq = line.partition(" ") if len(name) > 0 and len(seq) > 0: seq = [ch for ch in seq if ch > ' '] if name in sequences: sequences[name] += seq else: sequences[name] = seq name_list.append(name) for name in name_list: seq = Sequence() seq.appendResidues(sequences[name]) seq.name = name seq.makeShortName() sequence_group.sequences.append(seq) return True
[docs]def load_EMBL_file(sequence_group, file_name): """ Load a sequence file in EMBL format, create sequences and append them to the sequence group. :param sequence_group: sequence group to which the sequences will be added :type sequence_group: SequenceGroup :param file_name: name of input PIR file :type file_name: string """ try: inp_file = open(file_name, "r") except: return False if not inp_file: return False lines = inp_file.readlines() inp_file.close() if not lines or len(lines) == 0: return False in_seq = False codes = "" name = None for line in lines: if line[:2] == "ID": if in_seq and name: seq = Sequence() seq.appendResidues(seq) seq.name = name seq.makeShortName() sequence_group.sequences.append(seq) seq = "" name = None in_seq = False split_line = line.split() if (len(split_line) > 1 and len(split_line[1])): name = split_line[1] if in_seq: codes += [ch for ch in line if ch > ' '] if line[:2] == "SQ": in_seq = True if in_seq and name: seq = Sequence() seq.appendResidues(codes) seq.name = name seq.makeShortName() sequence_group.sequences.append(seq) return True
[docs]def load_swissprot_file(sequence_group, file_name, text=None): """ Load a sequence file in SWISSPROT format, create sequences and append them to the sequence group. Tries to split sequence name from the :param sequence_group: sequence group to which the sequences will be added :type sequence_group: SequenceGroup :param file_name: name of input SWISSPROT file :type file_name: string :param text: optional text in SWISSPROT format used instead of the input file :type text: string """ if text or not file_name: lines = text else: try: inp_file = open(file_name, "r") except: return False if not inp_file: return False lines = inp_file.readlines() inp_file.close() if not lines or len(lines) == 0: return False seq = None read_sequence = False string = "" for line in lines: if len(line) < 2: continue id = line[:2] if id == 'ID': seq = Sequence() string = "" seq.name = line[2:] elif id == 'AC' and seq: seq.short_name = line[2:] elif id == 'SQ' and seq: read_sequence = True elif id == '//' and seq: if len(string) > 0 and \ seq.name and \ seq.short_name: seq.appendResidues(string) sequence_group.sequences.append(seq) seq = None string = "" read_sequence = False elif read_sequence: string += line.replace(" ", "") return True
[docs]def save_fasta_file(sequence_group, file_name, for_clustal=False, file=None, target_sequence=None, skip_gaps=False, save_annotations=False, selected_only=False, start=-1, end=-1, as_text=False, save_similarity=False): """ Writes a contents of sequence group to a file. :param sequence_group: Sequence group to be written to a file :type sequence_group: SequenceGroup :param file_name: Name of the output file :type file_name: string :param for_clustal: Optional parameter indicating if the output file will be used for Clustal alignment :type for_clustal: bool :param file: Optional file handle, if not None, this handle will be used to write the sequences rather than creating a new file (file_name parameter would be ignored) :type file: file :param target_sequence: Optional sequence to be saved. If not specified, all sequences will be written to the output file. :type target_sequence: `Sequence` :param skip_gaps: Optional parameter deciding if gaps should be written to the FASTA file. :type skip_gaps: bool :param save_annotations: Optional parameter for saving annotations (default is False). :type save_annotations: bool :param save_similarity: Saves similarity when set to True. :type save_similarity: bool :param start: Optional starting position of the subset residues to save. :type start: int :param end: Optional ending position of the subset residues to save. :type end: int :param selected_only: Save only (partially) selected columns if True :type selected_only: True """ if not file: out_handle = open(file_name, "w") else: out_handle = file if not out_handle: return False have_selected = sequence_group.hasSelectedSequences() for seq in sequence_group.sequences: if target_sequence and target_sequence != seq: continue index = sequence_group.sequences.index(seq) if seq.visible and seq.length() > 0 and \ (not have_selected or seq.selected): fasta_name = seq.short_name if seq.chain_id > ' ': fasta_name += '_' + seq.chain_id if save_similarity: fasta_name += "|ID:%.2f" % (100.0 * seq.identity) + \ "|SIM:%.2f" % (100.0 * seq.similarity) + \ "|HOM:%.2f" % (100.0 * seq.homology) if as_text: out_handle.write(fasta_name + ", ") elif for_clustal: if seq.type != constants.SEQ_AMINO_ACIDS: continue out_handle.write(">SEQ" + str(index) + "\n") else: out_handle.write(">" + fasta_name + "\n") n_char = 0 for index, res in enumerate(seq.residues): if selected_only and not res.selected: continue if (for_clustal or skip_gaps) and res.is_gap: # Skip gaps for Clustal. continue if start >= 0 and end >= 0 and (index < start or index > end): continue out_handle.write(res.code) n_char += 1 if not as_text and n_char == constants.FASTA_LINE_LENGTH: out_handle.write("\n") n_char = 0 if not as_text and n_char < constants.FASTA_LINE_LENGTH: out_handle.write("\n") out_handle.write("\n") if save_annotations and not for_clustal: for child in seq.children: if child.annotation_type == constants.ANNOTATION_SSP or \ child.type == constants.SEQ_SECONDARY or \ child.annotation_type == constants.ANNOTATION_PFAM: if child.annotation_type == constants.ANNOTATION_SSP: out_handle.write(">" + fasta_name + "_SSP\n") elif child.annotation_type == constants.ANNOTATION_PFAM: out_handle.write(">" + fasta_name + "_PFAM\n") else: out_handle.write(">" + fasta_name + "_SSA\n") n_char = 0 for index, res in enumerate(child.residues): if selected_only and not seq.residues[ index].selected: continue if (skip_gaps) and res.is_gap: continue if start >= 0 and end >= 0 and \ (index < start or index > end): continue code = res.code if code == ' ': code = '-' elif res.is_gap: code = '~' out_handle.write(code) n_char += 1 if n_char == constants.FASTA_LINE_LENGTH: out_handle.write("\n") n_char = 0 if n_char < constants.FASTA_LINE_LENGTH: out_handle.write("\n") out_handle.write("\n") index += 1 if not file: out_handle.close() return True
[docs]def save_clustal_file(sequence_group, file_name, file=None, start=-1, end=-1, ss_constraints=False, subset=None, ignore_selection=False): """ Writes a contents of sequence group to a Clustal ALN file. :param sequence_group: Sequence group to be written to a file :type sequence_group: SequenceGroup :param file_name: Name of the output file :type file_name: string :param start: Optional starting position of the subset residues to save. :type start: int :param end: Optional ending position of the subset residues to save. :type end: int :param ss_constraints: Optional secondary structure constraints. :type ss_constraints: True """ if not file: out_handle = open(file_name, "w") else: out_handle = file if not out_handle: return False have_selected = sequence_group.hasSelectedSequences() if ignore_selection: have_selected = False seq_list = [] ss_list = [] for seq in sequence_group.sequences: if seq.isValidProtein() and seq.length() > 0 and \ ((subset == "selected" and seq.selected) or (subset == "unselected" and seq.unselected) or (subset is None and (not have_selected or seq.selected))): seq_list.append(seq) ss = None if ss_constraints: for child in seq.children: if child.type == constants.SEQ_SECONDARY or \ child.annotation_type == constants.ANNOTATION_SSP: ss = child break ss_list.append(ss) if len(seq_list) == 0: return False out_handle.write("CLUSTAL W (2.0) multiple sequence alignment\n\n") saved = True pos = 0 while saved: saved = False for index, seq in enumerate(seq_list): ss_residues = None if ss_list[index] is not None: if start >= 0 and end >= 0: ss_residues = ss_list[index].residues[start:end] else: ss_residues = ss_list[index].residues if ss_residues and pos < len(ss_residues): ss_res_list = ss_residues[pos:pos + 60] out_handle.write("!SS_SEQ" + str(sequence_group.sequences.index(seq)) + "\t") for res in ss_res_list: if res.code == 'H': out_handle.write('A') elif res.code == 'E': out_handle.write('B') else: out_handle.write('.') out_handle.write("\n") out_handle.write("!GM_SEQ" + str(sequence_group.sequences.index(seq)) + "\t") for res in ss_res_list: if res.code == 'H': out_handle.write('9') elif res.code == 'E': out_handle.write('9') else: out_handle.write('1') out_handle.write("\n") for index, seq in enumerate(seq_list): if start >= 0 and end >= 0: residues = seq.residues[start:end] else: residues = seq.residues if pos < len(residues): res_list = residues[pos:pos + 60] out_handle.write("SEQ" + str(sequence_group.sequences.index(seq)) + "\t") saved = True for res in res_list: if res.is_gap: out_handle.write('.') else: out_handle.write(res.code) out_handle.write("\n") pos += 60 out_handle.write("\n\n") if not file: out_handle.close() return True
[docs]def load_DND_tree(file_name, sequence_group): """ Load Newick-formatted tree file outputted by multiple sequence alignment program. The function was tested using outputs of ClustalW and T-Coffee. :param file_name: name of the input file :type file_name: string :param sequence_group: target sequence group :type sequence_group: SequenceGroup :return: True if operation succeeded, False otherwise :rtype: boolean """ try: # Load input file. inp_file = open(file_name, "r") if not inp_file: return False except IOError: return False lines = inp_file.readlines() inp_file.close() # Concatenate lines to create Newick-formatted string. dnd_string = "".join(lines).replace("\n", "").replace("\r", "").\ replace(" ", "") # Create an empty tree. tree = TreeNode(0) # Parse the DND string and build the NJ-tree. parse_DND_string(dnd_string, tree) # Find a root node. root = None tree = tree.branches[0] for branch in tree.branches: if len(branch.branches) == 0: if root: # More than 1 leafs, quit. break root = branch # By default, Clustal generates a rooted tree. We need to unroot-it # to avoid triple-forked trees. if root: tree.branches.remove(root) new_tree = TreeNode(0) new_tree.branches.append(tree) new_tree.branches.append(root) tree = new_tree tree.buildSequenceList(sequence_group) sequence_group.tree = tree # sequence_group.sortByTreeOrder() return tree
[docs]def parse_DND_string(dndstring, tree): """ Parse a dnd-formatted string, generate a tree and and append its branches to a given tree. :type dndstring: string :param dndstring: tree in DND format :type tree: `TreeNode` :param tree: target tree """ level = 0 while len(dndstring) > 0: if dndstring[0] == "(": level += 1 dndstring = dndstring[1:] node = TreeNode(level) node.parent = tree tree.branches.append(node) tree = node elif dndstring[0] == ")": level -= 1 dndstring = dndstring[1:] tree = tree.parent elif dndstring[0] == ";": break else: right_pos = dndstring.find('(') left_pos = dndstring.find(')') col_pos = dndstring.find(':') comma_pos = dndstring.find(',') if col_pos > 0 and (col_pos < right_pos or right_pos < 0) and \ (col_pos < left_pos or left_pos < 0) and \ (col_pos < comma_pos or comma_pos < 0): # Create a new node with empty branches to indicate a leaf. node = TreeNode(level) node.parent = tree node.name = dndstring[:col_pos] tree.branches.append(node) # Extract a branch score. if comma_pos < left_pos and comma_pos >= 0: left_pos = comma_pos dndstring = dndstring[col_pos + 1:] else: dndstring = dndstring[1:]
[docs]def load_clustal_file(sequence_group, file_name, replace=False, start=-1, end=-1): """ Load a sequence alignment in Clustal format. Add sequences to a specified sequence group. By default, this method doesn't replace the old residues, but only introduces gaps according to the alignment. Thus, all residue meta-data (e.g. Maestro information) will be preserved after doing the alignment. :type sequence_group: SequenceGroup :param sequence_group: target sequence :type file_name: string :param file_name: input file name :type replace: boolean (default=False) :param replace: optional parameter, if True, replace existing sequences :rtype: bool :return: True on success, False otherwise """ try: inp_file = open(file_name, "r") if not inp_file: return False except IOError: return False lines = inp_file.readlines() inp_file.close() for seq in sequence_group.sequences: seq.processed = False if replace: for line in lines: if line[:3] == "SEQ": index = int(line[3:8]) seq = sequence_group.sequences[index] if not seq.processed: seq.processed = True seq.residues = [] seq.appendResidues(line[10:]) for seq in sequence_group.sequences: if seq.processed: seq.propagateGapsToChildren() else: for line in lines: if line[:3] == "SEQ": index = int(line[3:8]) seq = sequence_group.sequences[index] if not seq.processed: seq.processed = True seq._old_residues = seq.residues seq.residues = [] seq.appendResidues(line[10:]) if start >= 0 and end >= 0: # Replace the residues between start and end with the residues # read from the ClustalW alignment file. for seq in sequence_group.sequences: if seq.processed: new_residues = seq._old_residues[:] for pos in range(start, end): new_residues.pop(start) for res in reversed(seq.residues): new_residues.insert(start, res) seq.residues = new_residues for seq in sequence_group.sequences: if seq.processed: ungapped = [res for res in seq._old_residues if not res.is_gap] if seq.gaplessLength() == len(ungapped): ungapped = [ res for res in seq._old_residues if not res.is_gap ] old_index = 0 for index, res in enumerate(seq.residues): if not res.is_gap: seq.residues[index] = ungapped[old_index] old_index += 1 else: # Something is very wrong, sequence lengths don't match. # We replace the residues, so all residue-level # metadata is going to be lost. We probably need to issue # a warning message when this happens. pass # Just get rid of the old sequence. seq._old_residues = None seq.propagateGapsToChildren() return True
[docs]def pdb_create_sequence(pdb_id, chain_id, sequence_string): """ Creates a new sequence out of sequence string read from a PDB file. :type pdb_id: str :param pdb_id: PDB ID (4-letter code) :type chain_id: str :param chain_id: single-letter chain ID :type sequence_string: str :param sequence_string: single-letter code amino acid string to be converted to the sequence :rtype: `Sequence` :return: Created sequence. """ seq = Sequence() seq.chain_id = chain_id seq.type = constants.SEQ_AMINO_ACIDS seq.name = pdb_id seq.short_name = pdb_id seq.makeShortName() seq.appendResidues(sequence_string) return seq
""" ATOM 1 N MET A 1 -14.195 -1.520 4.416 ATOM 447 CA PHE A 30 -0.555 4.359 0.858 1.00 0.09 C """
[docs]def load_PDB_file(sequence_group, file_name, requested_chain_id=None, given_pdb_id=None, align_func=None): """ Reads a PDB file, extracts relevant data and creates the sequence and annotations. :type sequence_group: `SequenceGroup` :param sequence_group: target sequence group :type file_name: str :param file_name: name of the file to be read :type requested_chain_id: str :param requested_chain_id: Optional parameter. If specified, only the chain ID equal to this parameter will be read. :rtype: bool :return: True on success, False otherwise. """ if file_name[-3:] == ".gz": try: inp_file = gzip.open(file_name, "r") if not inp_file: return False except: return False else: try: inp_file = open(file_name, "r") if not inp_file: return False except IOError: return False lines = inp_file.readlines() inp_file.close() sequence_string = "" valid_amino_acids = list(constants.AMINO_ACIDS_3_TO_1) pdb_id = None record_index = 0 last_record_index = 0 if given_pdb_id: pdb_id = given_pdb_id else: pdb_id = os.path.basename(file_name) pdb_id, ext = os.path.splitext(pdb_id) if pdb_id[:3] == "pdb": pdb_id = pdb_id[3:] if len(pdb_id) >= 4 and pdb_id[0].isdigit(): pdb_id = pdb_id[:4].upper() sequence_dict = {} secondary_dict = {} structure_dict = {} ssbond_dict = {} # Multipass processing of the PDB file. # Extract PDB ID. for line in lines: if line.startswith("HEADER"): # Try to extract PDB ID from HEADER record. new_pdb_id = line[62:66].replace(" ", "") if len(new_pdb_id) == 0: pdb_id = new_pdb_id break if line.startswith("ATOM") or line.startswith("END"): break # Extract SEQRES records. chain_id = "" for line in lines: if line.startswith("SEQRES"): # Read SEQRES records and split them into individual chains. record_index = int(line[6:10].replace(" ", "")) if record_index < last_record_index and \ len(sequence_string) > 0: sequence_dict[chain_id] = pdb_create_sequence( pdb_id, chain_id, sequence_string) sequence_string = "" last_record_index = record_index chain_id = line[11] residue_list = line[19:72].split(" ") seq_str = "".join([ constants.AMINO_ACIDS_3_TO_1.get(res, "X") for res in residue_list ]) for res in residue_list: if len(res) == 3: if res in valid_amino_acids: sequence_string += constants.AMINO_ACIDS_3_TO_1[res] else: sequence_string += "X" if len(sequence_string) > 0: sequence_dict[chain_id] = pdb_create_sequence(pdb_id, chain_id, sequence_string) sequence_string = "" # Extract secondary structure assignments. # They need to be matched with residue numbers, so they will # be assigned later on. For now we are only creating # placeholders. for line in lines: if line.startswith("HELIX"): record_index = int(line[6:10].replace(" ", "")) chain_id = line[19] if chain_id not in secondary_dict and chain_id in sequence_dict: secondary_dict[chain_id] = \ sequence_dict[chain_id].createSecondaryAssignment() secondary_dict[chain_id]._tmp_ssa = [] start = line[20:25].replace(" ", "") end = line[32:37].replace(" ", "") if chain_id in secondary_dict: ssa = secondary_dict[chain_id] ssa._tmp_ssa.append(('H', start, end)) elif line.startswith("SHEET"): record_index = int(line[6:10].replace(" ", "")) chain_id = line[21] if chain_id not in secondary_dict and chain_id in sequence_dict: secondary_dict[chain_id] = \ sequence_dict[chain_id].createSecondaryAssignment() secondary_dict[chain_id]._tmp_ssa = [] start = line[22:27].replace(" ", "") end = line[33:38].replace(" ", "") if chain_id in secondary_dict: ssa = secondary_dict[chain_id] ssa._tmp_ssa.append(('E', start, end)) elif line.startswith("SSBOND"): """ SSBOND 1 CYS A 3 CYS A 40 1555 1555 2.00 SSBOND 2 CYS A 4 CYS A 32 1555 1555 2.04 SSBOND 3 CYS A 16 CYS A 26 1555 1555 2.05 """ chain1 = line[15] chain2 = line[29] resi1 = line[16:22].strip() resi2 = line[30:36].strip() if chain1 == chain2: # For now we only support inter-molecular disulfide bonds. chain_id = chain1 if chain_id not in ssbond_dict and chain_id in sequence_dict: ssbond_dict[chain_id] = \ sequence_dict[chain_id].createSSBondAssignment() ssbond_dict[chain_id]._tmp_bond_list.append((resi1, resi2)) amino_acids = list(constants.AMINO_ACIDS_3_TO_1) previous_res_id = None current_residue = None # Should we ignore what we read ignore = False # Read ATOM data. for line in lines: if line.startswith("MODEL"): model_num = int(line[5:14]) if model_num > 1: # Incorporate only the first model. ignore = True if line.startswith("ENDMDL"): ignore = False continue if ignore: continue if line.startswith("ATOM") or line.startswith("HETATM"): # Get a residue ID. res_id = line[22:27] if not previous_res_id or previous_res_id != res_id: previous_res_id = res_id # Get a chain ID. chain_id = line[21] if chain_id in structure_dict: structure = structure_dict[chain_id] else: structure = Sequence() structure.name = "Atomic Coordinates" structure.short_name = "Atomic Coordinates" structure.type = constants.ANNOTATION_STRUCTURE structure.chain_id = chain_id structure_dict[chain_id] = structure # Get a residue name. res_name = line[17:20] bfactor = line[60:65] try: bfactor = float(bfactor) except: bfactor = -1.0 # Convert the residue name to amino acid single letter code. if res_name in amino_acids: res_code = constants.AMINO_ACIDS_3_TO_1[res_name] res = Residue() res.name = res_name res.icode = res_id[-1:] res.num = int(res_id[:-1]) res.code = res_code res.sequence = structure res.bfactor = bfactor res.coordinates = [] current_residue = res res.structureless = False structure.residues.append(res) if current_residue: current_residue.structure.append(line) # Process the structures that don't have sequences (lack of SEQRES record) for structure in structure_dict.values(): chain_id = structure.chain_id if chain_id not in sequence_dict: sequence_string = structure.text() sequence_dict[chain_id] = pdb_create_sequence( pdb_id, chain_id, sequence_string) # Append the read PDB sequences to the sequence group. for seq in sequence_dict.values(): if not requested_chain_id or \ seq.chain_id == requested_chain_id: seq.has_structure = True if seq.chain_id in structure_dict and align_func: align_func(seq, structure_dict[seq.chain_id]) sequence_group.sequences.append(seq) if chain_id in ssbond_dict: ssb = ssbond_dict[seq.chain_id] bond_list = [] for resi1, resi2 in ssb._tmp_bond_list: ridx1 = seq.getResidueIndex(resi1) ridx2 = seq.getResidueIndex(resi2) if ridx1 is not None and ridx2 is not None: bond_list.append((ridx1, ridx2)) seq.ssb_bond_list = bond_list ssb._tmp_bond_list if bond_list: seq.children.append(ssb) ssb.parent_sequence = seq ssb.height = int(0.5 * len(bond_list) + 0.5) for pos in range(len(ssb.residues)): res = seq.residues[pos] if seq.chain_id in secondary_dict: ssa = secondary_dict[seq.chain_id] # Actually assign secondary structure from the read # assignments. seq.children.append(ssa) ssa.parent_sequence = seq for pos in range(len(ssa.residues)): ss_res = ssa.residues[pos] res = seq.residues[pos] for ss in ssa._tmp_ssa: code, start, end = ss start_i = int(start.rstrip(string.ascii_letters)) end_i = int(end.rstrip(string.ascii_letters)) if res.num >= start_i and res.num <= end_i: ss_res.code = code res.ss_code = ss_res.code return True
[docs]def save_PDB_file(sequence, file_name): lines = [res.structure for res in sequence.residues if res.structure] if not lines: return False output_file = open(file_name, "w") output_file.writelines(lines) output_file.write("TER\nEND\n") output_file.close() return True
[docs]def load_file(sequence_group, file_name, format=None, align_func=None): """ Loads a file. The file format can be inferred from the file name extension, or can be explictly given. :type file_name: string :param file_name: input file name :type format: string :param format: format of the input file :rtype: bool :return: True if file successfully read; otherwise False """ root, ext = os.path.splitext(file_name) ext = ext.lower() if format == "fasta" or ext == ".fasta" or ext == ".fst" or \ ext == ".fas" or ext == ".seq" or ext == ".txt": return load_fasta_file(sequence_group, file_name) elif format == "pdb" or ext == ".pdb" or ext == ".ent": return load_PDB_file(sequence_group, file_name, align_func=align_func) elif format == "mae" or ext == ".mae": return (load_maestro_file(sequence_group, file_name, align_func=align_func)) elif format == "sp" or ext == ".sp" or ext == ".sw" or \ ext == ".swiss" or ext == ".swissprot": return load_swissprot_file(sequence_group, file_name) elif format == "pir" or ext == ".pir": return load_PIR_file(sequence_group, file_name) elif format == "gcg" or ext == ".msf" or ext == ".gcg": return load_GCG_file(sequence_group, file_name) elif format == "embl" or ext == ".embl" or ext == ".emb": return load_EMBL_file(sequence_group, file_name)
[docs]def load_maestro_file(sequence_group, file_name, align_func=None): """ Load a file in Maestro format. """ valid_amino_acids = list(constants.AMINO_ACIDS_3_TO_1) reader = structure.MaestroReader(file_name) for st in reader: has_seqres = False name = None if "s_pdb_PDB_ID" in st.property: name = st.property["s_pdb_PDB_ID"] try: data_handle = mm.mmct_ct_m2io_get_unrequested_handle(st) except mm.MmException: try: data_handle = mm.mmct_ct_get_or_open_additional_data(st, True) except: pass chain_seq_dict = {} if has_seqres: try: mm.m2io_goto_block(data_handle, "m_PDB_SEQRES", 1) has_seqres = True except: pass for chain in st.chain: seq = None is_protein = False for residue in chain.residue: if residue.pdbres[:3] in valid_amino_acids: is_protein = True break if not is_protein: continue secondary = None ssb = None seq = Sequence() seq.type = constants.SEQ_AMINO_ACIDS seq.chain_id = chain.name[0] seq.color_scheme = constants.COLOR_MAESTRO seq.has_structure = True sequence_group.sequences.append(seq) ssb = seq.createSSBondAssignment() ssb.parent_sequence = seq seq.children.append(ssb) # This code comes from mcpro, zmat.py bond_list = [] asl = "a.pt SG and (res. CYS or res. CYX)" if chain.name > ' ': asl += " and (chain.name %s)" % str(chain.name) s_atoms = evaluate_asl(st, asl) found = False for s_atom in s_atoms: for s2_atom in s_atoms: if mm.mmct_is_atom_bonded(st, s_atom, s2_atom) == mm.TRUE: found = True if s_atom <= s2_atom: resi1 = str( st.atom[s_atom].resnum) + \ str(st.atom[s_atom].inscode) resi2 = str( st.atom[s2_atom].resnum) + \ str(st.atom[s2_atom].inscode) bond_list.append((resi1, resi2)) if not found: seq.children.remove(ssb) secondary = seq.createSecondaryAssignment() secondary.parent_sequence = seq seq.children.append(secondary) if not name: name = st.title name = name[:] short_name = name[:] seq.name = name seq.short_name = short_name residues = [] ss_residues = [] ssb_residues = [] residue_list = structure.get_residues_unsorted(chain) for residue in residue_list: ca_atom = None for atom in residue.atom: if "CA" in atom.pdbname: ca_atom = atom break if not ca_atom: continue if residue.pdbres[:3] in valid_amino_acids: code = constants.AMINO_ACIDS_3_TO_1[residue.pdbres[:3]] else: code = 'X' res = Residue() res.sequence = seq res.code = code res.name = residue.pdbres[:3] res.num = residue.resnum res.icode = residue.inscode res.structureless = False res.bfactor = ca_atom.temperature_factor res.maestro_color = ca_atom.color.rgb res.maestro_atom_num = ca_atom.index ss_res = Residue() ss_res.sequence = seq if ssb: ssb_res = Residue() ssb_res.sequence = seq ssb_res.code = ' ' ssb_residues.append(ssb_res) if ca_atom.secondary_structure == 1: ss_res.code = 'H' res.ss_code = 'H' elif ca_atom.secondary_structure == 2: ss_res.code = 'E' res.ss_code = 'E' else: ss_res.code = ' ' res.ss_code = ' ' residues.append(res) ss_residues.append(ss_res) if residues: seq.residues = residues secondary.residues = ss_residues if ssb and ssb in seq.children: ssb.residues = ssb_residues new_bond_list = [] for rid1, rid2 in bond_list: ridx1 = seq.getResidueIndex(rid1) ridx2 = seq.getResidueIndex(rid2) if ridx1 is not None and ridx2 is not None: new_bond_list.append((ridx1, ridx2)) seq.ssb_bond_list = new_bond_list ssb.bond_list = new_bond_list if not new_bond_list: seq.children.remove(ssb) else: ssb.height = int(len(new_bond_list) * 0.5 + 0.5) else: sequence_group.sequences.remove(seq) if residues and chain.name in chain_seq_dict: # Use full SEQRES sequence. full_seq = Sequence() full_seq.appendResidues(chain_seq_dict[chain.name]) if align_func: align_func(full_seq, seq) for res in seq.residues: res.sequence = seq for child in seq.children: child.parent_sequence = seq for res in child.residues: res.sequence = child return True