"""
File I/O handling routines for the multiple sequence viewer.
Copyright Schrodinger, LLC. All rights reserved.
"""
# Contributors: Piotr Rotkiewicz
import gzip
import itertools
import os
import string
from schrodinger import structure
from schrodinger.application.msv.seqio import parse_fasta_header
from schrodinger.infra import mm
from schrodinger.structutils.analyze import evaluate_asl
from . import constants
from .residue import Residue
from .sequence import Sequence
from .tree_node import TreeNode
[docs]def partition_by_predicate(arr, pred):
"""
Utility function to groups a list into lists, with each sublist beginning
with an element that matches the supplied predicate
Note that many file reading functions below would benefit from using this
function.
:param arr: A list to split into sublists
:type arr: list
:param pred: A function that takes a list item and returns True if the list
item meets a criteria and False otherwise
:type pred: function
This is not efficient, since we loop through the array twice, but it
probably doesn't matter.
"""
counter = 0
group_info = []
for item in arr:
if pred(item):
counter += 1
group_info.append(counter)
array_with_grouping_info = list(zip(arr, group_info))
for _, group in itertools.groupby(array_with_grouping_info, lambda x: x[1]):
yield [item for (item, info) in group]
[docs]def load_fasta_file(sequence_group, file_name, text=None):
"""
Load a sequence file in FASTA format, create sequences and append them
to the sequence group. Splits sequence name from the FASTA header.
:param sequence_group: sequence group to which the sequences will be added
:type sequence_group: SequenceGroup
:param file_name: name of input FASTA file
:type file_name: str
:param text: optional text in FASTA format used instead of the input file,
split by newline char into a list of lines
:type text: list of str
"""
lines = text
if not lines:
try:
with open(file_name) as f:
lines = [line for line in f.readlines() if line]
except IOError:
return False
for group in partition_by_predicate(lines, lambda x: x.startswith('>')):
seq = Sequence()
header = group.pop(0)
fasta_parts = parse_fasta_header(header)
split_name = fasta_parts.name
chain_id = fasta_parts.chain
residues = "".join(group)
if not residues:
continue
seq.name = header[1:].rstrip()
seq.chain_id = chain_id
seq.makeShortName(name=split_name)
seq.appendResidues(residues, use_numbers=True)
sequence_group.sequences.append(seq)
return True
[docs]def load_PIR_file(sequence_group, file_name):
"""
Load a sequence file in PIR format, create sequences and append them
to the sequence group.
:param sequence_group: sequence group to which the sequences will be added
:type sequence_group: SequenceGroup
:param file_name: name of input PIR file
:type file_name: string
"""
try:
inp_file = open(file_name, "r")
except:
return False
if not inp_file:
return False
lines = inp_file.readlines()
inp_file.close()
if not lines or len(lines) == 0:
return False
seq = None
skip = False
string = ""
for line in lines:
if len(line) == 0:
continue
if skip:
skip = False
continue
if line[0] == '>':
if seq:
seq.appendResidues(string)
seq = Sequence()
string = ""
pos = line.find("P1;")
if pos >= 0:
line = line[pos + 2:]
seq.name = [ch for ch in line[1:] if ch >= ' ']
seq.makeShortName()
sequence_group.sequences.append(seq)
skip = True
elif seq:
string = string + line
if seq:
seq.appendResidues(string)
return True
[docs]def load_GCG_file(sequence_group, file_name):
"""
Load a sequence file in GCG format, create sequences and append them
to the sequence group.
:param sequence_group: sequence group to which the sequences will be added
:type sequence_group: SequenceGroup
:param file_name: name of input PIR file
:type file_name: string
"""
try:
inp_file = open(file_name, "r")
except:
return False
if not inp_file:
return False
lines = inp_file.readlines()
inp_file.close()
if not lines or len(lines) == 0:
return False
seq = None
sequences = {}
name_list = []
for line in lines:
if len(line) < 2:
continue
if "\\\\" in line:
continue
name, sep, seq = line.partition(" ")
if len(name) > 0 and len(seq) > 0:
seq = [ch for ch in seq if ch > ' ']
if name in sequences:
sequences[name] += seq
else:
sequences[name] = seq
name_list.append(name)
for name in name_list:
seq = Sequence()
seq.appendResidues(sequences[name])
seq.name = name
seq.makeShortName()
sequence_group.sequences.append(seq)
return True
[docs]def load_EMBL_file(sequence_group, file_name):
"""
Load a sequence file in EMBL format, create sequences and append them
to the sequence group.
:param sequence_group: sequence group to which the sequences will be added
:type sequence_group: SequenceGroup
:param file_name: name of input PIR file
:type file_name: string
"""
try:
inp_file = open(file_name, "r")
except:
return False
if not inp_file:
return False
lines = inp_file.readlines()
inp_file.close()
if not lines or len(lines) == 0:
return False
in_seq = False
codes = ""
name = None
for line in lines:
if line[:2] == "ID":
if in_seq and name:
seq = Sequence()
seq.appendResidues(seq)
seq.name = name
seq.makeShortName()
sequence_group.sequences.append(seq)
seq = ""
name = None
in_seq = False
split_line = line.split()
if (len(split_line) > 1 and len(split_line[1])):
name = split_line[1]
if in_seq:
codes += [ch for ch in line if ch > ' ']
if line[:2] == "SQ":
in_seq = True
if in_seq and name:
seq = Sequence()
seq.appendResidues(codes)
seq.name = name
seq.makeShortName()
sequence_group.sequences.append(seq)
return True
[docs]def load_swissprot_file(sequence_group, file_name, text=None):
"""
Load a sequence file in SWISSPROT format, create sequences and append them
to the sequence group. Tries to split sequence name from the
:param sequence_group: sequence group to which the sequences will be added
:type sequence_group: SequenceGroup
:param file_name: name of input SWISSPROT file
:type file_name: string
:param text: optional text in SWISSPROT format used instead of the input file
:type text: string
"""
if text or not file_name:
lines = text
else:
try:
inp_file = open(file_name, "r")
except:
return False
if not inp_file:
return False
lines = inp_file.readlines()
inp_file.close()
if not lines or len(lines) == 0:
return False
seq = None
read_sequence = False
string = ""
for line in lines:
if len(line) < 2:
continue
id = line[:2]
if id == 'ID':
seq = Sequence()
string = ""
seq.name = line[2:]
elif id == 'AC' and seq:
seq.short_name = line[2:]
elif id == 'SQ' and seq:
read_sequence = True
elif id == '//' and seq:
if len(string) > 0 and \
seq.name and \
seq.short_name:
seq.appendResidues(string)
sequence_group.sequences.append(seq)
seq = None
string = ""
read_sequence = False
elif read_sequence:
string += line.replace(" ", "")
return True
[docs]def save_fasta_file(sequence_group,
file_name,
for_clustal=False,
file=None,
target_sequence=None,
skip_gaps=False,
save_annotations=False,
selected_only=False,
start=-1,
end=-1,
as_text=False,
save_similarity=False):
"""
Writes a contents of sequence group to a file.
:param sequence_group: Sequence group to be written to a file
:type sequence_group: SequenceGroup
:param file_name: Name of the output file
:type file_name: string
:param for_clustal: Optional parameter indicating if the output file
will be used for Clustal alignment
:type for_clustal: bool
:param file: Optional file handle, if not None, this handle will be used
to write the sequences rather than creating a new file (file_name parameter
would be ignored)
:type file: file
:param target_sequence: Optional sequence to be saved. If not specified,
all sequences will be written to the output file.
:type target_sequence: `Sequence`
:param skip_gaps: Optional parameter deciding if gaps should be written
to the FASTA file.
:type skip_gaps: bool
:param save_annotations: Optional parameter for saving annotations (default
is False).
:type save_annotations: bool
:param save_similarity: Saves similarity when set to True.
:type save_similarity: bool
:param start: Optional starting position of the subset residues to save.
:type start: int
:param end: Optional ending position of the subset residues to save.
:type end: int
:param selected_only: Save only (partially) selected columns if True
:type selected_only: True
"""
if not file:
out_handle = open(file_name, "w")
else:
out_handle = file
if not out_handle:
return False
have_selected = sequence_group.hasSelectedSequences()
for seq in sequence_group.sequences:
if target_sequence and target_sequence != seq:
continue
index = sequence_group.sequences.index(seq)
if seq.visible and seq.length() > 0 and \
(not have_selected or seq.selected):
fasta_name = seq.short_name
if seq.chain_id > ' ':
fasta_name += '_' + seq.chain_id
if save_similarity:
fasta_name += "|ID:%.2f" % (100.0 * seq.identity) + \
"|SIM:%.2f" % (100.0 * seq.similarity) + \
"|HOM:%.2f" % (100.0 * seq.homology)
if as_text:
out_handle.write(fasta_name + ", ")
elif for_clustal:
if seq.type != constants.SEQ_AMINO_ACIDS:
continue
out_handle.write(">SEQ" + str(index) + "\n")
else:
out_handle.write(">" + fasta_name + "\n")
n_char = 0
for index, res in enumerate(seq.residues):
if selected_only and not res.selected:
continue
if (for_clustal or skip_gaps) and res.is_gap:
# Skip gaps for Clustal.
continue
if start >= 0 and end >= 0 and (index < start or index > end):
continue
out_handle.write(res.code)
n_char += 1
if not as_text and n_char == constants.FASTA_LINE_LENGTH:
out_handle.write("\n")
n_char = 0
if not as_text and n_char < constants.FASTA_LINE_LENGTH:
out_handle.write("\n")
out_handle.write("\n")
if save_annotations and not for_clustal:
for child in seq.children:
if child.annotation_type == constants.ANNOTATION_SSP or \
child.type == constants.SEQ_SECONDARY or \
child.annotation_type == constants.ANNOTATION_PFAM:
if child.annotation_type == constants.ANNOTATION_SSP:
out_handle.write(">" + fasta_name + "_SSP\n")
elif child.annotation_type == constants.ANNOTATION_PFAM:
out_handle.write(">" + fasta_name + "_PFAM\n")
else:
out_handle.write(">" + fasta_name + "_SSA\n")
n_char = 0
for index, res in enumerate(child.residues):
if selected_only and not seq.residues[
index].selected:
continue
if (skip_gaps) and res.is_gap:
continue
if start >= 0 and end >= 0 and \
(index < start or index > end):
continue
code = res.code
if code == ' ':
code = '-'
elif res.is_gap:
code = '~'
out_handle.write(code)
n_char += 1
if n_char == constants.FASTA_LINE_LENGTH:
out_handle.write("\n")
n_char = 0
if n_char < constants.FASTA_LINE_LENGTH:
out_handle.write("\n")
out_handle.write("\n")
index += 1
if not file:
out_handle.close()
return True
[docs]def save_clustal_file(sequence_group,
file_name,
file=None,
start=-1,
end=-1,
ss_constraints=False,
subset=None,
ignore_selection=False):
"""
Writes a contents of sequence group to a Clustal ALN file.
:param sequence_group: Sequence group to be written to a file
:type sequence_group: SequenceGroup
:param file_name: Name of the output file
:type file_name: string
:param start: Optional starting position of the subset residues to save.
:type start: int
:param end: Optional ending position of the subset residues to save.
:type end: int
:param ss_constraints: Optional secondary structure constraints.
:type ss_constraints: True
"""
if not file:
out_handle = open(file_name, "w")
else:
out_handle = file
if not out_handle:
return False
have_selected = sequence_group.hasSelectedSequences()
if ignore_selection:
have_selected = False
seq_list = []
ss_list = []
for seq in sequence_group.sequences:
if seq.isValidProtein() and seq.length() > 0 and \
((subset == "selected" and seq.selected) or
(subset == "unselected" and seq.unselected) or
(subset is None and (not have_selected or seq.selected))):
seq_list.append(seq)
ss = None
if ss_constraints:
for child in seq.children:
if child.type == constants.SEQ_SECONDARY or \
child.annotation_type == constants.ANNOTATION_SSP:
ss = child
break
ss_list.append(ss)
if len(seq_list) == 0:
return False
out_handle.write("CLUSTAL W (2.0) multiple sequence alignment\n\n")
saved = True
pos = 0
while saved:
saved = False
for index, seq in enumerate(seq_list):
ss_residues = None
if ss_list[index] is not None:
if start >= 0 and end >= 0:
ss_residues = ss_list[index].residues[start:end]
else:
ss_residues = ss_list[index].residues
if ss_residues and pos < len(ss_residues):
ss_res_list = ss_residues[pos:pos + 60]
out_handle.write("!SS_SEQ" +
str(sequence_group.sequences.index(seq)) +
"\t")
for res in ss_res_list:
if res.code == 'H':
out_handle.write('A')
elif res.code == 'E':
out_handle.write('B')
else:
out_handle.write('.')
out_handle.write("\n")
out_handle.write("!GM_SEQ" +
str(sequence_group.sequences.index(seq)) +
"\t")
for res in ss_res_list:
if res.code == 'H':
out_handle.write('9')
elif res.code == 'E':
out_handle.write('9')
else:
out_handle.write('1')
out_handle.write("\n")
for index, seq in enumerate(seq_list):
if start >= 0 and end >= 0:
residues = seq.residues[start:end]
else:
residues = seq.residues
if pos < len(residues):
res_list = residues[pos:pos + 60]
out_handle.write("SEQ" +
str(sequence_group.sequences.index(seq)) +
"\t")
saved = True
for res in res_list:
if res.is_gap:
out_handle.write('.')
else:
out_handle.write(res.code)
out_handle.write("\n")
pos += 60
out_handle.write("\n\n")
if not file:
out_handle.close()
return True
[docs]def load_DND_tree(file_name, sequence_group):
"""
Load Newick-formatted tree file outputted by multiple sequence alignment
program. The function was tested using outputs of ClustalW and T-Coffee.
:param file_name: name of the input file
:type file_name: string
:param sequence_group: target sequence group
:type sequence_group: SequenceGroup
:return: True if operation succeeded, False otherwise
:rtype: boolean
"""
try:
# Load input file.
inp_file = open(file_name, "r")
if not inp_file:
return False
except IOError:
return False
lines = inp_file.readlines()
inp_file.close()
# Concatenate lines to create Newick-formatted string.
dnd_string = "".join(lines).replace("\n", "").replace("\r", "").\
replace(" ", "")
# Create an empty tree.
tree = TreeNode(0)
# Parse the DND string and build the NJ-tree.
parse_DND_string(dnd_string, tree)
# Find a root node.
root = None
tree = tree.branches[0]
for branch in tree.branches:
if len(branch.branches) == 0:
if root: # More than 1 leafs, quit.
break
root = branch
# By default, Clustal generates a rooted tree. We need to unroot-it
# to avoid triple-forked trees.
if root:
tree.branches.remove(root)
new_tree = TreeNode(0)
new_tree.branches.append(tree)
new_tree.branches.append(root)
tree = new_tree
tree.buildSequenceList(sequence_group)
sequence_group.tree = tree
# sequence_group.sortByTreeOrder()
return tree
[docs]def parse_DND_string(dndstring, tree):
"""
Parse a dnd-formatted string, generate a tree and and append its branches
to a given tree.
:type dndstring: string
:param dndstring: tree in DND format
:type tree: `TreeNode`
:param tree: target tree
"""
level = 0
while len(dndstring) > 0:
if dndstring[0] == "(":
level += 1
dndstring = dndstring[1:]
node = TreeNode(level)
node.parent = tree
tree.branches.append(node)
tree = node
elif dndstring[0] == ")":
level -= 1
dndstring = dndstring[1:]
tree = tree.parent
elif dndstring[0] == ";":
break
else:
right_pos = dndstring.find('(')
left_pos = dndstring.find(')')
col_pos = dndstring.find(':')
comma_pos = dndstring.find(',')
if col_pos > 0 and (col_pos < right_pos or right_pos < 0) and \
(col_pos < left_pos or left_pos < 0) and \
(col_pos < comma_pos or comma_pos < 0):
# Create a new node with empty branches to indicate a leaf.
node = TreeNode(level)
node.parent = tree
node.name = dndstring[:col_pos]
tree.branches.append(node)
# Extract a branch score.
if comma_pos < left_pos and comma_pos >= 0:
left_pos = comma_pos
dndstring = dndstring[col_pos + 1:]
else:
dndstring = dndstring[1:]
[docs]def load_clustal_file(sequence_group,
file_name,
replace=False,
start=-1,
end=-1):
"""
Load a sequence alignment in Clustal format. Add sequences to a specified
sequence group. By default, this method doesn't replace the old residues,
but only introduces gaps according to the alignment. Thus, all residue
meta-data (e.g. Maestro information) will be preserved after doing the
alignment.
:type sequence_group: SequenceGroup
:param sequence_group: target sequence
:type file_name: string
:param file_name: input file name
:type replace: boolean (default=False)
:param replace: optional parameter, if True, replace existing sequences
:rtype: bool
:return: True on success, False otherwise
"""
try:
inp_file = open(file_name, "r")
if not inp_file:
return False
except IOError:
return False
lines = inp_file.readlines()
inp_file.close()
for seq in sequence_group.sequences:
seq.processed = False
if replace:
for line in lines:
if line[:3] == "SEQ":
index = int(line[3:8])
seq = sequence_group.sequences[index]
if not seq.processed:
seq.processed = True
seq.residues = []
seq.appendResidues(line[10:])
for seq in sequence_group.sequences:
if seq.processed:
seq.propagateGapsToChildren()
else:
for line in lines:
if line[:3] == "SEQ":
index = int(line[3:8])
seq = sequence_group.sequences[index]
if not seq.processed:
seq.processed = True
seq._old_residues = seq.residues
seq.residues = []
seq.appendResidues(line[10:])
if start >= 0 and end >= 0:
# Replace the residues between start and end with the residues
# read from the ClustalW alignment file.
for seq in sequence_group.sequences:
if seq.processed:
new_residues = seq._old_residues[:]
for pos in range(start, end):
new_residues.pop(start)
for res in reversed(seq.residues):
new_residues.insert(start, res)
seq.residues = new_residues
for seq in sequence_group.sequences:
if seq.processed:
ungapped = [res for res in seq._old_residues if not res.is_gap]
if seq.gaplessLength() == len(ungapped):
ungapped = [
res for res in seq._old_residues if not res.is_gap
]
old_index = 0
for index, res in enumerate(seq.residues):
if not res.is_gap:
seq.residues[index] = ungapped[old_index]
old_index += 1
else:
# Something is very wrong, sequence lengths don't match.
# We replace the residues, so all residue-level
# metadata is going to be lost. We probably need to issue
# a warning message when this happens.
pass
# Just get rid of the old sequence.
seq._old_residues = None
seq.propagateGapsToChildren()
return True
[docs]def pdb_create_sequence(pdb_id, chain_id, sequence_string):
"""
Creates a new sequence out of sequence string read from a PDB file.
:type pdb_id: str
:param pdb_id: PDB ID (4-letter code)
:type chain_id: str
:param chain_id: single-letter chain ID
:type sequence_string: str
:param sequence_string: single-letter code amino acid string to be converted
to the sequence
:rtype: `Sequence`
:return: Created sequence.
"""
seq = Sequence()
seq.chain_id = chain_id
seq.type = constants.SEQ_AMINO_ACIDS
seq.name = pdb_id
seq.short_name = pdb_id
seq.makeShortName()
seq.appendResidues(sequence_string)
return seq
"""
ATOM 1 N MET A 1 -14.195 -1.520 4.416
ATOM 447 CA PHE A 30 -0.555 4.359 0.858 1.00 0.09 C
"""
[docs]def load_PDB_file(sequence_group,
file_name,
requested_chain_id=None,
given_pdb_id=None,
align_func=None):
"""
Reads a PDB file, extracts relevant data and creates the sequence and
annotations.
:type sequence_group: `SequenceGroup`
:param sequence_group: target sequence group
:type file_name: str
:param file_name: name of the file to be read
:type requested_chain_id: str
:param requested_chain_id: Optional parameter. If specified, only the
chain ID equal to this parameter will be read.
:rtype: bool
:return: True on success, False otherwise.
"""
if file_name[-3:] == ".gz":
try:
inp_file = gzip.open(file_name, "r")
if not inp_file:
return False
except:
return False
else:
try:
inp_file = open(file_name, "r")
if not inp_file:
return False
except IOError:
return False
lines = inp_file.readlines()
inp_file.close()
sequence_string = ""
valid_amino_acids = list(constants.AMINO_ACIDS_3_TO_1)
pdb_id = None
record_index = 0
last_record_index = 0
if given_pdb_id:
pdb_id = given_pdb_id
else:
pdb_id = os.path.basename(file_name)
pdb_id, ext = os.path.splitext(pdb_id)
if pdb_id[:3] == "pdb":
pdb_id = pdb_id[3:]
if len(pdb_id) >= 4 and pdb_id[0].isdigit():
pdb_id = pdb_id[:4].upper()
sequence_dict = {}
secondary_dict = {}
structure_dict = {}
ssbond_dict = {}
# Multipass processing of the PDB file.
# Extract PDB ID.
for line in lines:
if line.startswith("HEADER"):
# Try to extract PDB ID from HEADER record.
new_pdb_id = line[62:66].replace(" ", "")
if len(new_pdb_id) == 0:
pdb_id = new_pdb_id
break
if line.startswith("ATOM") or line.startswith("END"):
break
# Extract SEQRES records.
chain_id = ""
for line in lines:
if line.startswith("SEQRES"):
# Read SEQRES records and split them into individual chains.
record_index = int(line[6:10].replace(" ", ""))
if record_index < last_record_index and \
len(sequence_string) > 0:
sequence_dict[chain_id] = pdb_create_sequence(
pdb_id, chain_id, sequence_string)
sequence_string = ""
last_record_index = record_index
chain_id = line[11]
residue_list = line[19:72].split(" ")
seq_str = "".join([
constants.AMINO_ACIDS_3_TO_1.get(res, "X")
for res in residue_list
])
for res in residue_list:
if len(res) == 3:
if res in valid_amino_acids:
sequence_string += constants.AMINO_ACIDS_3_TO_1[res]
else:
sequence_string += "X"
if len(sequence_string) > 0:
sequence_dict[chain_id] = pdb_create_sequence(pdb_id, chain_id,
sequence_string)
sequence_string = ""
# Extract secondary structure assignments.
# They need to be matched with residue numbers, so they will
# be assigned later on. For now we are only creating
# placeholders.
for line in lines:
if line.startswith("HELIX"):
record_index = int(line[6:10].replace(" ", ""))
chain_id = line[19]
if chain_id not in secondary_dict and chain_id in sequence_dict:
secondary_dict[chain_id] = \
sequence_dict[chain_id].createSecondaryAssignment()
secondary_dict[chain_id]._tmp_ssa = []
start = line[20:25].replace(" ", "")
end = line[32:37].replace(" ", "")
if chain_id in secondary_dict:
ssa = secondary_dict[chain_id]
ssa._tmp_ssa.append(('H', start, end))
elif line.startswith("SHEET"):
record_index = int(line[6:10].replace(" ", ""))
chain_id = line[21]
if chain_id not in secondary_dict and chain_id in sequence_dict:
secondary_dict[chain_id] = \
sequence_dict[chain_id].createSecondaryAssignment()
secondary_dict[chain_id]._tmp_ssa = []
start = line[22:27].replace(" ", "")
end = line[33:38].replace(" ", "")
if chain_id in secondary_dict:
ssa = secondary_dict[chain_id]
ssa._tmp_ssa.append(('E', start, end))
elif line.startswith("SSBOND"):
"""
SSBOND 1 CYS A 3 CYS A 40 1555 1555 2.00
SSBOND 2 CYS A 4 CYS A 32 1555 1555 2.04
SSBOND 3 CYS A 16 CYS A 26 1555 1555 2.05
"""
chain1 = line[15]
chain2 = line[29]
resi1 = line[16:22].strip()
resi2 = line[30:36].strip()
if chain1 == chain2:
# For now we only support inter-molecular disulfide bonds.
chain_id = chain1
if chain_id not in ssbond_dict and chain_id in sequence_dict:
ssbond_dict[chain_id] = \
sequence_dict[chain_id].createSSBondAssignment()
ssbond_dict[chain_id]._tmp_bond_list.append((resi1, resi2))
amino_acids = list(constants.AMINO_ACIDS_3_TO_1)
previous_res_id = None
current_residue = None
# Should we ignore what we read
ignore = False
# Read ATOM data.
for line in lines:
if line.startswith("MODEL"):
model_num = int(line[5:14])
if model_num > 1:
# Incorporate only the first model.
ignore = True
if line.startswith("ENDMDL"):
ignore = False
continue
if ignore:
continue
if line.startswith("ATOM") or line.startswith("HETATM"):
# Get a residue ID.
res_id = line[22:27]
if not previous_res_id or previous_res_id != res_id:
previous_res_id = res_id
# Get a chain ID.
chain_id = line[21]
if chain_id in structure_dict:
structure = structure_dict[chain_id]
else:
structure = Sequence()
structure.name = "Atomic Coordinates"
structure.short_name = "Atomic Coordinates"
structure.type = constants.ANNOTATION_STRUCTURE
structure.chain_id = chain_id
structure_dict[chain_id] = structure
# Get a residue name.
res_name = line[17:20]
bfactor = line[60:65]
try:
bfactor = float(bfactor)
except:
bfactor = -1.0
# Convert the residue name to amino acid single letter code.
if res_name in amino_acids:
res_code = constants.AMINO_ACIDS_3_TO_1[res_name]
res = Residue()
res.name = res_name
res.icode = res_id[-1:]
res.num = int(res_id[:-1])
res.code = res_code
res.sequence = structure
res.bfactor = bfactor
res.coordinates = []
current_residue = res
res.structureless = False
structure.residues.append(res)
if current_residue:
current_residue.structure.append(line)
# Process the structures that don't have sequences (lack of SEQRES record)
for structure in structure_dict.values():
chain_id = structure.chain_id
if chain_id not in sequence_dict:
sequence_string = structure.text()
sequence_dict[chain_id] = pdb_create_sequence(
pdb_id, chain_id, sequence_string)
# Append the read PDB sequences to the sequence group.
for seq in sequence_dict.values():
if not requested_chain_id or \
seq.chain_id == requested_chain_id:
seq.has_structure = True
if seq.chain_id in structure_dict and align_func:
align_func(seq, structure_dict[seq.chain_id])
sequence_group.sequences.append(seq)
if chain_id in ssbond_dict:
ssb = ssbond_dict[seq.chain_id]
bond_list = []
for resi1, resi2 in ssb._tmp_bond_list:
ridx1 = seq.getResidueIndex(resi1)
ridx2 = seq.getResidueIndex(resi2)
if ridx1 is not None and ridx2 is not None:
bond_list.append((ridx1, ridx2))
seq.ssb_bond_list = bond_list
ssb._tmp_bond_list
if bond_list:
seq.children.append(ssb)
ssb.parent_sequence = seq
ssb.height = int(0.5 * len(bond_list) + 0.5)
for pos in range(len(ssb.residues)):
res = seq.residues[pos]
if seq.chain_id in secondary_dict:
ssa = secondary_dict[seq.chain_id]
# Actually assign secondary structure from the read
# assignments.
seq.children.append(ssa)
ssa.parent_sequence = seq
for pos in range(len(ssa.residues)):
ss_res = ssa.residues[pos]
res = seq.residues[pos]
for ss in ssa._tmp_ssa:
code, start, end = ss
start_i = int(start.rstrip(string.ascii_letters))
end_i = int(end.rstrip(string.ascii_letters))
if res.num >= start_i and res.num <= end_i:
ss_res.code = code
res.ss_code = ss_res.code
return True
[docs]def save_PDB_file(sequence, file_name):
lines = [res.structure for res in sequence.residues if res.structure]
if not lines:
return False
output_file = open(file_name, "w")
output_file.writelines(lines)
output_file.write("TER\nEND\n")
output_file.close()
return True
[docs]def load_file(sequence_group, file_name, format=None, align_func=None):
"""
Loads a file. The file format can be inferred from the file name extension,
or can be explictly given.
:type file_name: string
:param file_name: input file name
:type format: string
:param format: format of the input file
:rtype: bool
:return: True if file successfully read; otherwise False
"""
root, ext = os.path.splitext(file_name)
ext = ext.lower()
if format == "fasta" or ext == ".fasta" or ext == ".fst" or \
ext == ".fas" or ext == ".seq" or ext == ".txt":
return load_fasta_file(sequence_group, file_name)
elif format == "pdb" or ext == ".pdb" or ext == ".ent":
return load_PDB_file(sequence_group, file_name, align_func=align_func)
elif format == "mae" or ext == ".mae":
return (load_maestro_file(sequence_group,
file_name,
align_func=align_func))
elif format == "sp" or ext == ".sp" or ext == ".sw" or \
ext == ".swiss" or ext == ".swissprot":
return load_swissprot_file(sequence_group, file_name)
elif format == "pir" or ext == ".pir":
return load_PIR_file(sequence_group, file_name)
elif format == "gcg" or ext == ".msf" or ext == ".gcg":
return load_GCG_file(sequence_group, file_name)
elif format == "embl" or ext == ".embl" or ext == ".emb":
return load_EMBL_file(sequence_group, file_name)
[docs]def load_maestro_file(sequence_group, file_name, align_func=None):
"""
Load a file in Maestro format.
"""
valid_amino_acids = list(constants.AMINO_ACIDS_3_TO_1)
reader = structure.MaestroReader(file_name)
for st in reader:
has_seqres = False
name = None
if "s_pdb_PDB_ID" in st.property:
name = st.property["s_pdb_PDB_ID"]
try:
data_handle = mm.mmct_ct_m2io_get_unrequested_handle(st)
except mm.MmException:
try:
data_handle = mm.mmct_ct_get_or_open_additional_data(st, True)
except:
pass
chain_seq_dict = {}
if has_seqres:
try:
mm.m2io_goto_block(data_handle, "m_PDB_SEQRES", 1)
has_seqres = True
except:
pass
for chain in st.chain:
seq = None
is_protein = False
for residue in chain.residue:
if residue.pdbres[:3] in valid_amino_acids:
is_protein = True
break
if not is_protein:
continue
secondary = None
ssb = None
seq = Sequence()
seq.type = constants.SEQ_AMINO_ACIDS
seq.chain_id = chain.name[0]
seq.color_scheme = constants.COLOR_MAESTRO
seq.has_structure = True
sequence_group.sequences.append(seq)
ssb = seq.createSSBondAssignment()
ssb.parent_sequence = seq
seq.children.append(ssb)
# This code comes from mcpro, zmat.py
bond_list = []
asl = "a.pt SG and (res. CYS or res. CYX)"
if chain.name > ' ':
asl += " and (chain.name %s)" % str(chain.name)
s_atoms = evaluate_asl(st, asl)
found = False
for s_atom in s_atoms:
for s2_atom in s_atoms:
if mm.mmct_is_atom_bonded(st, s_atom, s2_atom) == mm.TRUE:
found = True
if s_atom <= s2_atom:
resi1 = str(
st.atom[s_atom].resnum) + \
str(st.atom[s_atom].inscode)
resi2 = str(
st.atom[s2_atom].resnum) + \
str(st.atom[s2_atom].inscode)
bond_list.append((resi1, resi2))
if not found:
seq.children.remove(ssb)
secondary = seq.createSecondaryAssignment()
secondary.parent_sequence = seq
seq.children.append(secondary)
if not name:
name = st.title
name = name[:]
short_name = name[:]
seq.name = name
seq.short_name = short_name
residues = []
ss_residues = []
ssb_residues = []
residue_list = structure.get_residues_unsorted(chain)
for residue in residue_list:
ca_atom = None
for atom in residue.atom:
if "CA" in atom.pdbname:
ca_atom = atom
break
if not ca_atom:
continue
if residue.pdbres[:3] in valid_amino_acids:
code = constants.AMINO_ACIDS_3_TO_1[residue.pdbres[:3]]
else:
code = 'X'
res = Residue()
res.sequence = seq
res.code = code
res.name = residue.pdbres[:3]
res.num = residue.resnum
res.icode = residue.inscode
res.structureless = False
res.bfactor = ca_atom.temperature_factor
res.maestro_color = ca_atom.color.rgb
res.maestro_atom_num = ca_atom.index
ss_res = Residue()
ss_res.sequence = seq
if ssb:
ssb_res = Residue()
ssb_res.sequence = seq
ssb_res.code = ' '
ssb_residues.append(ssb_res)
if ca_atom.secondary_structure == 1:
ss_res.code = 'H'
res.ss_code = 'H'
elif ca_atom.secondary_structure == 2:
ss_res.code = 'E'
res.ss_code = 'E'
else:
ss_res.code = ' '
res.ss_code = ' '
residues.append(res)
ss_residues.append(ss_res)
if residues:
seq.residues = residues
secondary.residues = ss_residues
if ssb and ssb in seq.children:
ssb.residues = ssb_residues
new_bond_list = []
for rid1, rid2 in bond_list:
ridx1 = seq.getResidueIndex(rid1)
ridx2 = seq.getResidueIndex(rid2)
if ridx1 is not None and ridx2 is not None:
new_bond_list.append((ridx1, ridx2))
seq.ssb_bond_list = new_bond_list
ssb.bond_list = new_bond_list
if not new_bond_list:
seq.children.remove(ssb)
else:
ssb.height = int(len(new_bond_list) * 0.5 + 0.5)
else:
sequence_group.sequences.remove(seq)
if residues and chain.name in chain_seq_dict:
# Use full SEQRES sequence.
full_seq = Sequence()
full_seq.appendResidues(chain_seq_dict[chain.name])
if align_func:
align_func(full_seq, seq)
for res in seq.residues:
res.sequence = seq
for child in seq.children:
child.parent_sequence = seq
for res in child.residues:
res.sequence = child
return True