# Source code for schrodinger.application.bioluminate.propka_parse

"""
Module used to parse the output from a propka job.


"""

#- Imports -------------------------------------------------------------------

import re

#- Globals -------------------------------------------------------------------

# Compiled patterns used to recognise section boundaries in a PROPKA output
# file.  Each pattern is applied to one line at a time (lines still carry
# their trailing newline when read by iterating over the file).
REGEXS = {
    # A dashed rule or a blank line; ends whichever table is being read.
    'break': re.compile(r'(^---+|^\s*$)'),
    # Column-title line of the detailed report.  Any run of further column
    # titles may follow "BURIED"; the previous pattern allowed exactly one
    # trailing character and therefore could not match a multi-column header.
    'detailed': re.compile(r'^\s*RESIDUE\s+pKa\s+BURIED[ \t\w]*\s*$'),
    # Column-title line of the summary table.
    'summary': re.compile(r'^\s*Group\s+pKa\s+model-pKa'
                          r'\s+ligand atom-type\s*$'),
    # First line of the free energy section.
    'free_ene': re.compile(r'^Free energy of'),
    # Column-title line of the protein charge table.
    'charge': re.compile(r'^\s*pH\s+unfolded\s+folded\s*$'),
    # Line reporting the isoelectric points; the two values are captured as
    # the named groups ``folded_pI`` and ``unfolded_pI``.
    'pI': re.compile(r'^\s*The\s+pI\s+is\s+'
                     r'(?P<folded_pI>-?\d*\.\d*)'
                     r'\s+\(folded\)\s+and\s+'
                     r'(?P<unfolded_pI>-?\d*\.\d*)'
                     r'\s+\(unfolded\)\s*$')
}

# Column names for the detailed report.
DETAILED_HEADER = [
    'resname',
    'resnum',
    'chain',
    'pKa',
    'buried',
    'desolvation regular',
    'effects re',
    'sidechain h-bond',
    'backbone h-bond',
    'coulombic interaction',
]

# Column names for the summary table; a row's position in this list is the
# index of the matching token on a summary line.
SUMMARY_HEADER = [
    'resname',
    'resnum',
    'chain',
    'pKa',
    'pKmodel',
    'ligand atom-type',
]

# Column names for the free energy of folding table.
FREE_ENERGY_HEADER = [
    'pH',
    'free energy',
]

# Column names for the protein charge table.
CHARGE_HEADER = [
    'pH',
    'unfolded',
    'folded',
]

#- Functions -----------------------------------------------------------------


def get_detailed(pka_file, headers=None):
    """
    Get the detailed report from the PROPKA output file.  The intended result
    is a list of lists: the first list is the "header" and the remaining
    lists are the values corresponding to the headers.

    NOTE(review): as written here this function has no implementation; it
    does not open `pka_file` and always returns None.

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type  pka_file: string
    :param  headers: A list of header to return in the summary. Only these
                     headers and their corresponding data are returned. If
                     this is None, all headers and values are returned.
    :type   headers: list of strings or None

    :return: None (parsing of the detailed report is not implemented)

    :see: `DETAILED_HEADER`

    """
    return None


def get_summary(pka_file, headers=None):
    """
    Get the summary from the PROPKA output file.  This will return a list of
    headers and a list of lists. The list of lists will be the values
    corresponding to the headers.

    Rows are read from the line after the summary column titles up to the
    next dashed or blank line.

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type  pka_file: string
    :param  headers: A list of header to return in the summary. Only these
                     headers and their corresponding data are returned. If
                     this is None (or empty), all headers and values are
                     returned.
    :type   headers: list of strings or None

    :return: A tuple `(headers, table_data)`. Each row of `table_data` holds
             the values for `headers`, in the same order. Values are
             strings, except that a residue number split off a fused
             name+number token is an int, and a missing "ligand atom-type"
             is None.
    :rtype: tuple

    :raise ValueError: If a requested header is not in `SUMMARY_HEADER`.

    :see: `SUMMARY_HEADER`

    """
    # Copy the default so callers cannot mutate the module-level constant.
    headers = headers or list(SUMMARY_HEADER)
    indices = [SUMMARY_HEADER.index(h) for h in headers]

    summary_start = REGEXS['summary']
    section_break = REGEXS['break']

    table_data = []
    in_summary = False
    with open(pka_file) as lines:
        for line in lines:
            if summary_start.search(line):
                in_summary = True
                continue
            if section_break.search(line):
                in_summary = False
                continue
            if not in_summary:
                continue

            # Blank lines were consumed by the break check above, so there
            # is always at least one token here.
            tokens = line.split()

            # Catch 4-digit residue numbers: a first token of seven or more
            # characters whose last four characters parse as an integer is
            # the residue name fused with its number.  The token is only
            # replaced once the number has parsed, so a long first token
            # that is not name+number is left untouched.
            name = tokens[0]
            if len(name) >= 7:
                try:
                    resnum = int(name[-4:])
                except ValueError:
                    pass
                else:
                    tokens[0:1] = [name[:-4], resnum]

            # Make sure to add a value for "ligand atom-type" if none is
            # reported in the summary.
            if len(tokens) < 6:
                tokens.append(None)

            # Filter out only the tokens we need
            table_data.append([tokens[i] for i in indices])

    return (headers, table_data)


def get_free_energy(pka_file, headers=None):
    """
    Get the free energy of folding (kcal/mol) as a function of pH from the
    PROPKA output file.  This will return a list of headers and a list of
    lists. The list of lists will be the values corresponding to the headers.

    Rows are read from the line after the one starting with "Free energy of"
    up to the next dashed or blank line.

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type  pka_file: string
    :param  headers: A list of header to return in the summary. Only these
                     headers and their corresponding data are returned. If
                     this is None (or empty), all headers and values are
                     returned.
    :type   headers: list of strings or None

    :return: A tuple `(headers, table_data)`. Each row of `table_data` holds
             the values (as strings) for `headers`, in the same order.
    :rtype: tuple

    :raise ValueError: If a requested header is not in `FREE_ENERGY_HEADER`.

    :see: `FREE_ENERGY_HEADER`

    """
    # Copy the default so callers cannot mutate the module-level constant.
    headers = headers or list(FREE_ENERGY_HEADER)
    indices = [FREE_ENERGY_HEADER.index(h) for h in headers]

    section_start = REGEXS['free_ene']
    section_break = REGEXS['break']

    table_data = []
    in_section = False
    with open(pka_file) as lines:
        for line in lines:
            if section_start.search(line):
                in_section = True
                continue
            if section_break.search(line):
                in_section = False
                continue
            if not in_section:
                continue

            tokens = line.split()

            # Filter out only the tokens we need
            table_data.append([tokens[i] for i in indices])

    return (headers, table_data)


def get_charge(pka_file, headers=None):
    """
    Get the protein charge of folded and unfolded state as a function of pH
    from the PROPKA output file.  This will return a list of headers, a list
    of lists containing the data, and a list of pI values for folded and
    unfolded states.

    Rows are read from the line after the "pH  unfolded  folded" column
    titles up to the next dashed or blank line. Reading stops at the line
    that reports the pI values.

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type  pka_file: string
    :param  headers: A list of header to return in the summary. Only these
                     headers and their corresponding data are returned. If
                     this is None (or empty), all headers and values are
                     returned.
    :type   headers: list of strings or None

    :return: A tuple `(headers, table_data, pI_data)`. Each row of
             `table_data` holds the values (as strings) for `headers`, in
             the same order. `pI_data` is `[folded_pI, unfolded_pI]` as
             strings, or None if no pI line was found.
    :rtype: tuple

    :raise ValueError: If a requested header is not in `CHARGE_HEADER`.

    :see: `CHARGE_HEADER`

    """
    # Copy the default so callers cannot mutate the module-level constant.
    headers = headers or list(CHARGE_HEADER)
    indices = [CHARGE_HEADER.index(h) for h in headers]

    table_start = REGEXS['charge']
    section_break = REGEXS['break']
    pI_line = REGEXS['pI']

    table_data = []
    pI_data = None
    in_table = False
    with open(pka_file) as lines:
        for line in lines:
            if table_start.search(line):
                in_table = True
                continue
            if section_break.search(line):
                in_table = False
                continue

            # The pI line is checked before the table-state test so that it
            # is never parsed as a data row; nothing after it is read.
            match = pI_line.search(line)
            if match:
                pI_data = [match.group('folded_pI'),
                           match.group('unfolded_pI')]
                break

            if not in_table:
                continue

            tokens = line.split()

            # Filter out only the tokens we need
            table_data.append([tokens[i] for i in indices])

    return (headers, table_data, pI_data)


if __name__ == '__main__':
    # Command-line usage: print the first four summary columns of the given
    # PROPKA output file.

    import sys

    # Exit with a usage message instead of an IndexError traceback when the
    # input file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <jobname>.pka' % sys.argv[0])
    ifile = sys.argv[1]

    headers, summary = get_summary(ifile, SUMMARY_HEADER[:4])
    print('Headers', headers)
    print('\nSummary:\n', summary)