Source code for schrodinger.project.pandasutils

"""
A module which contains functions to convert between Schrodinger project data
and a Pandas data frame.
"""
import collections
import enum
import re
import typing

import pandas as pd
from rdkit.Chem import PandasTools

from schrodinger import project
from schrodinger.structutils.smiles import SmilesGenerator

class WhichRows(enum.Enum):
    # Selects which project rows are exported to the data frame.
    SELECTED = 1
    ALL = 2


class WhichColumns(enum.Enum):
    # Selects which project property columns are exported to the data frame.
    VISIBLE = 1
    ALL = 2


def get_data_frame_from_project(pt: project.Project,
                                which_rows: WhichRows = WhichRows.ALL,
                                which_columns: WhichColumns = WhichColumns.ALL,
                                prop_filter: typing.Optional[str] = None,
                                with_rdkit: bool = False,
                                with_smiles: bool = False) -> pd.DataFrame:
    """
    Return a Pandas frame given a Schrodinger project object (as might be
    returned from maestro.get_project_table()).

    The frame has one column per exported property dataname and one row per
    exported project row. When either `with_smiles` or `with_rdkit` is set, a
    "smiles" column is appended; when `with_rdkit` is set, an "RDKit Mol"
    column built from that "smiles" column is appended as well.

    :param pt: Project (already open via Maestro or standalone) to convert
    :param which_rows:  Which rows from the project are to be converted (all or
                        selected)
    :param which_columns: Which columns from the project are to be converted
                          (all or visible)
    :param prop_filter: A regular expression which, if defined, will restrict
                        the properties to datanames which match this expression
                        (matched from the start of the dataname, as with
                        `re.match`)
    :param with_rdkit: A flag which indicates if RdKit MOL objects should be
                       added (this also adds the "smiles" column they are
                       built from)
    :param with_smiles: A flag which indicates if a "smiles" column generated
                        from each row's structure should be added
    :return: A Pandas dataframe populated with data from the project
    """
    if which_columns == WhichColumns.ALL:
        prop_names = pt.getPropertyNames()
    else:
        prop_names = pt.getVisiblePropertyNames()

    if prop_filter:
        # Compile once instead of handing the raw pattern to re.match for
        # every property name.
        matcher = re.compile(prop_filter)
        prop_names = [p for p in prop_names if matcher.match(p)]

    need_smiles = with_rdkit or with_smiles

    # Pre-seed every column so that a project (or selection) with no rows
    # still yields a frame with the expected columns. Without this the
    # "smiles" column would be missing when the RDKit column is requested for
    # an empty row set.
    data_dict = {p: [] for p in prop_names}
    if need_smiles:
        data_dict.setdefault("smiles", [])
        # Only build the generator when SMILES are actually requested.
        sg = SmilesGenerator()

    rows = pt.all_rows if (which_rows == WhichRows.ALL) else pt.selected_rows

    for row in rows:
        for p in prop_names:
            data_dict[p].append(row.property[p])
        if need_smiles:
            data_dict["smiles"].append(sg.getSmiles(row.getStructure()))

    df = pd.DataFrame(data_dict)

    if with_rdkit:
        # Adds the "RDKit Mol" column to df in place, built from "smiles".
        PandasTools.AddMoleculeColumnToFrame(df, "smiles", "RDKit Mol")

    return df