Source code for schrodinger.application.bioluminate.propka_parse
"""
Module used to parse the output from a propka job.
"""
#- Imports -------------------------------------------------------------------
import re
#- Globals -------------------------------------------------------------------
# Compiled patterns used to locate sections of a PROPKA output file. Every
# pattern is applied to one line at a time (lines still carry their trailing
# newline when read by iterating over the file).
REGEXS = {
    # A dashed rule or an empty/whitespace-only line. The parsers in this
    # module treat it as the end of whichever table they are reading.
    'break': re.compile(r'(^---+|^\s*$)'),
    # Column-title line of the detailed report. The character class after
    # BURIED needs a quantifier: DETAILED_HEADER lists further columns after
    # 'buried', so the line continues with more words. Without the `*` the
    # pattern only accepted exactly one character between BURIED and the
    # newline and could not match such a line.
    'detailed': re.compile(r'^\s*RESIDUE\s+pKa\s+BURIED[ \t\w]*\s*$'),
    # Column-title line that opens the summary table.
    'summary': re.compile(r'^\s*Group\s+pKa\s+model-pKa'
                          r'\s+ligand atom-type\s*$'),
    # First line of the free energy section.
    'free_ene': re.compile(r'^Free energy of'),
    # Column-title line that opens the charge table.
    'charge': re.compile(r'^\s*pH\s+unfolded\s+folded\s*$'),
    # Line reporting the isoelectric points; the two values are exposed as
    # the named groups `folded_pI` and `unfolded_pI` (captured as text).
    'pI': re.compile(r'^\s*The\s+pI\s+is\s+'
                     r'(?P<folded_pI>-?\d*\.\d*)'
                     r'\s+\(folded\)\s+and\s+'
                     r'(?P<unfolded_pI>-?\d*\.\d*)'
                     r'\s+\(unfolded\)\s*$'),
}
# Column names for the detailed report, in positional order.
DETAILED_HEADER = [
    'resname',
    'resnum',
    'chain',
    'pKa',
    'buried',
    'desolvation regular',
    'effects re',
    'sidechain h-bond',
    'backbone h-bond',
    'coulombic interaction',
]

# Column names for the summary table, in positional order. `get_summary`
# uses the position of each name here to pick values out of a parsed row.
SUMMARY_HEADER = [
    'resname',
    'resnum',
    'chain',
    'pKa',
    'pKmodel',
    'ligand atom-type',
]

# Column names for the free energy table (see `get_free_energy`).
FREE_ENERGY_HEADER = [
    'pH',
    'free energy',
]

# Column names for the charge table (see `get_charge`).
CHARGE_HEADER = [
    'pH',
    'unfolded',
    'folded',
]
#- Functions -----------------------------------------------------------------
def get_detailed(pka_file, headers=None):
    """
    Placeholder for reading the detailed report of a PROPKA output file.

    No parsing is implemented here: neither argument is used, the file is
    never opened, and every call evaluates to ``None``.

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type pka_file: string

    :param headers: Reserved for a list of column names to select; currently
                    ignored.
    :type headers: list of strings or None

    :return: Always ``None``.

    :see: `DETAILED_HEADER`
    """
    # Spell out the result so the stub's behavior is visible at a glance.
    return None
def get_summary(pka_file, headers=None):
    """
    Get the summary table from the PROPKA output file.

    Rows are collected from the line after the summary column titles (the
    'summary' pattern in `REGEXS`) up to the next dashed rule or blank line
    (the 'break' pattern).

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type pka_file: string

    :param headers: A list of header to return in the summary. Only these
                    headers and their corresponding data are returned, in
                    the order given. If this is None (or empty), all headers
                    and values are returned.
    :type headers: list of strings or None

    :return: A tuple ``(headers, table_data)``. ``table_data`` holds one list
             per summary row with the values for the requested headers.
             Values are the whitespace-separated strings from the file, with
             two exceptions: a residue number split off a fused
             name/number token is an ``int``, and a missing
             'ligand atom-type' is ``None``.
    :rtype: tuple

    :raise ValueError: If an entry of ``headers`` is not in `SUMMARY_HEADER`.
    :raise IndexError: If a row has too few columns for a requested header.

    :see: `SUMMARY_HEADER`
    """
    headers = headers or SUMMARY_HEADER
    indices = [SUMMARY_HEADER.index(h) for h in headers]

    # Look the patterns up once instead of once per line.
    table_start = REGEXS['summary']
    table_end = REGEXS['break']

    table_data = []
    with open(pka_file) as lines:
        get_data = False
        for line in lines:
            if table_start.search(line):
                get_data = True
                continue
            if table_end.search(line):
                get_data = False
                continue
            if not get_data:
                continue

            # str.split() with no arguments already drops empty fields and
            # surrounding whitespace. Blank lines never reach this point
            # (they match the 'break' pattern), so there is at least one
            # token.
            tokens = line.split()

            # Catch 4-digit residue numbers: the name and number are then
            # written without a separating space, giving a single token of
            # 7 or more characters whose last four characters are the number.
            fused = tokens[0]
            if len(fused) >= 7:
                try:
                    resnum = int(fused[-4:])
                except ValueError:
                    # A long first token that does not end in a number is
                    # not a fused name/number. Keep the row untouched; the
                    # token must not be dropped, or every following column
                    # would shift one position to the left.
                    pass
                else:
                    # NOTE: resnum is an int here while rows that did not
                    # need splitting carry the number as a string. Kept
                    # as-is for backward compatibility with existing callers.
                    tokens[:1] = [fused[:-4], resnum]

            # Make sure to add a value for "ligand atom-type" if none is
            # reported in the summary.
            if len(tokens) < 6:
                tokens.append(None)

            # Filter out only the tokens we need
            table_data.append([tokens[i] for i in indices])

    return (headers, table_data)
def get_free_energy(pka_file, headers=None):
    """
    Get the free energy table from the PROPKA output file.

    Rows are collected from the line after the one matching the 'free_ene'
    pattern in `REGEXS` up to the next dashed rule or blank line (the
    'break' pattern).

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type pka_file: string

    :param headers: A list of header to return. Only these headers and
                    their corresponding data are returned, in the order
                    given. If this is None (or empty), all headers and
                    values are returned.
    :type headers: list of strings or None

    :return: A tuple ``(headers, table_data)`` where ``table_data`` holds one
             list per table row with the values for the requested headers.
             Values are returned as the strings found in the file.
    :rtype: tuple

    :see: `FREE_ENERGY_HEADER`
    """
    if not headers:
        headers = FREE_ENERGY_HEADER
    wanted = [FREE_ENERGY_HEADER.index(name) for name in headers]

    section_start = REGEXS['free_ene']
    section_end = REGEXS['break']

    rows = []
    in_table = False
    with open(pka_file) as handle:
        for raw in handle:
            if section_start.search(raw):
                # Title line itself carries no data; rows start after it.
                in_table = True
            elif section_end.search(raw):
                in_table = False
            elif in_table:
                fields = raw.split()
                rows.append([fields[pos] for pos in wanted])

    return (headers, rows)
def get_charge(pka_file, headers=None):
    """
    Get the charge table and the pI values from the PROPKA output file.

    Rows are collected from the line after the charge column titles (the
    'charge' pattern in `REGEXS`) up to the next dashed rule or blank line
    (the 'break' pattern). Reading stops at the first line matching the
    'pI' pattern.

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type pka_file: string

    :param headers: A list of header to return. Only these headers and
                    their corresponding data are returned, in the order
                    given. If this is None (or empty), all headers and
                    values are returned.
    :type headers: list of strings or None

    :return: A tuple ``(headers, table_data, pI_data)``. ``table_data`` holds
             one list per table row with the values for the requested
             headers, as strings. ``pI_data`` is ``[folded, unfolded]`` (both
             strings), or None when the file has no pI line.
    :rtype: tuple

    :see: `CHARGE_HEADER`
    """
    if not headers:
        headers = CHARGE_HEADER
    columns = [CHARGE_HEADER.index(name) for name in headers]

    table_start = REGEXS['charge']
    table_end = REGEXS['break']
    pi_line = REGEXS['pI']

    rows = []
    pI_values = None
    inside_table = False
    with open(pka_file) as handle:
        for raw in handle:
            if table_start.search(raw):
                inside_table = True
                continue
            if table_end.search(raw):
                inside_table = False
                continue

            # The pI line is checked whether or not a table is open, and
            # nothing after it is read.
            found = pi_line.search(raw)
            if found is not None:
                pI_values = [
                    found.group('folded_pI'),
                    found.group('unfolded_pI'),
                ]
                break

            if inside_table:
                fields = raw.split()
                rows.append([fields[col] for col in columns])

    return (headers, rows, pI_values)
if __name__ == '__main__':
    import sys

    # Quick manual check: parse the file named by the first command-line
    # argument and print the first four summary columns.
    ifile = sys.argv[1]
    selected = SUMMARY_HEADER[:4]
    headers, summary = get_summary(ifile, selected)
    print('Headers', headers)
    print('\nSummary:\n', summary)