Source code for schrodinger.application.glide.glideanalysis

"""
Functions and classes for calculating Glide Docking Enrichment Factors and
creating sorted pose libraries.

Copyright Schrodinger, LLC. All rights reserved.

"""

#Contributors: K Shawn Watts, Jeff Saunders

################################################################################
# Packages
################################################################################

import math
import os
import sys
import tempfile
from past.utils import old_div

import schrodinger.analysis.enrichment as enrichment
import schrodinger.structure as structure
import schrodinger.structutils.sort as sort
import schrodinger.utils.fileutils as fileutils

################################################################################
# Globals
################################################################################
_version = '$Revision: 1.21 $'


################################################################################
# Functions
################################################################################
[docs]def get_enrichment_calculator(
    file_name="",
    file_type=None,  # 'pv' or 'lib'
    actives=[],  # noqa: M511 # titles of known actives found in file_name
    report_header="",
    total_ligands=0,
    sort_property_1="r_i_glide_gscore",
    sort_property_2="r_i_glide_emodel",
):
    """
    Returns an schrodinger.analysis.enrichment.Calculator instance.

    :type file_name: string
    :param file_name:
            The path to file upon which to operate.  The 'file_name'
            pose file may contain multiple poses for each ligand, but
            the analysis requires culling to the best pose per ligand.
            Default is ''.

    :type file_type: string
    :param file_type:
            Should be 'pv' or 'lib'; used to determine the position of the
            first ligand pose.  Default is 'pv'.

    :type actives: list of strings
    :param actives:
            Each string corresponds to the title of a known active used in
            the docking experiment.  Default is [].

    :type total_ligands: int
    :param total_ligands:
            Integer number of the total number of ligands used in the
            experiment. Default is 0.

    :type report_header: string
    :param report_header:
            Not implemented.  This parameter is ignored.

    :type sort_property_1: string
    :param sort_property_1:
            An m2io dataname that identifies the values for single pose per
            ligand sorting.  Default is "r_i_glide_gscore".

    :type sort_property_2: string
    :param sort_property_2:
            An m2io dataname that identifies the values of intra-ligand
            group sorting.  SP and HTVS poses should use 'r_i_glide_emodel'
            to pick the best pose of the same ligand-title.  XP poses should
            use 'i_glide_XP_PoseRank' to determine the best pose within a
            ligand-title group.  Default is "r_i_glide_emodel".

    This is a convenience interface for
    schrodinger.analysis.enrichment.Calculator,
    which is the preferred Enrichment metric generator.
    schrodinger.analysis.enrichment.Calculator has more metrics
    (e.g. BEDROC, ROC, AUAC), support for interactive plotting, and
    can generate plot png format files.  This function also implements
    sort.py for better sorting performance.

    Two files are created as a result of running this function;
    a list of active compound titles in <root>_actives.txt and
    a sorted structure file in <root>_efcalc<ext>, where <root> is the
    basename and <ext> the extension of the filename provided.

    """

    (root, ext) = fileutils.splitext(file_name)

    # Sort the structure file and write out just the 'best' pose per block.
    # Assign the starting index, pv_files have a receptor, libs don't
    # if the type was not specified guess by the file ext
    file_index = None
    if file_type:
        file_type = file_type.lower()
        if file_type == 'pv':
            file_index = 2
        elif file_type == 'lib':
            file_index = 1
        else:
            raise ValueError("Invalid file type: %s" % file_type)
    else:
        # Note, this is a weak test based on file extention.
        if fileutils.is_poseviewer_file(file_name):
            file_index = 2
        else:
            file_index = 1

    sort_criteria = [(prop, sort.ASCENDING) for prop in sort_property_1.split()]
    intra_block_sort_criteria = [('s_m_title', sort.ASCENDING)]
    intra_block_sort_criteria.extend([
        (prop, sort.ASCENDING) for prop in sort_property_2.split()
    ])
    sfs = sort.StructureFileSorter(
        file_name,
        file_index,
        sort_criteria,
        intra_block_sort_criteria,
        keep_structures=sort._in_memory_sort_ok(file_name))
    sfs.sort()
    sorted_structure_file = '%s_efcalc%s' % (root, ext)
    sfs.writeTopNFromBlock(sorted_structure_file, max_per_block=1)

    # Write the active compound titles to disk.
    actives_file = '%s_actives.txt' % (root)
    actives_fh = open(actives_file, 'w')
    for title in actives:
        actives_fh.write("%s\n" % title)
    actives_fh.close()

    # Determine the number decoys used in the experiment.
    num_decoys = total_ligands - len(actives)
    ef_calc = enrichment.Calculator(actives=actives_file,
                                    results=sorted_structure_file,
                                    total_decoys=num_decoys)
    return ef_calc


################################################################################
# Classes
################################################################################
#ev81790 Removed the deprecated Enrichment class.


[docs]class PoseLibrary:
    """
    A helper class to sort a pose library.  By default, multiple poses for
    one ligand-title are 'block sorted'.  i.e. grouped and sorted by Title
    and sort_property_2, and each group is sorted by its best member's
    sort_property_1.  The pose library may be resorted by sort_property_1,
    in a title agnostic manner, by calling sortByProperties().

    API Example::
        pl = glideanalysis.PoseLibrary(file_name='input_lib.mae')
        pl.writeTopPoses('output_lib.mae')

    :vartype debug: bool
    :cvar debug:
            Increases the verbosity of the various bookkeeping operations.

    :vartype dock_results: dict
    :ivar dock_results:
            A dictionary of pose properties, pose file index keys for property
            dict.

    :vartype pose_lib_order: tuple
    :ivar pose_lib_order:
            A sorted tuple sequence of all pose file indexes.

    :vartype best_pose_lib_order: tuple
    :ivar best_pose_lib_order:
            A sorted tuple sequence of poses file indexes, from just the best
            pose of each title-group.

    :vartype title_tuples: list
    :ivar title_tuples:
            A list of (pose indexes) tuples, one tuple per title.

    :vartype title_blocks: dict
    :ivar title_blocks:
            A dictionary of pose index lists, 'title' keys for list of sorted
            (intra title) indexes.

    """

    debug = False

[docs]    def __init__(
            self,
            file_name="",
            file_type=None,  # 'pv' or 'lib'
            sort_property_1="r_i_glide_gscore",
            sort_property_2="r_i_glide_emodel",
            block_sort=True):
        """
        :type file_name: string
        :param file_name:
                Path to the file upon with to operate.  The 'file_name' pose
                file may contain multiple poses for each ligand. Default is
                "".

        :type file_type: string
        :param file_type:
                Should be 'pv' or 'lib'; used to determine the position of the
                first ligand pose. If not specified, determined from ext.

        :type sort_property_1: string
        :param sort_property_1:
                An m2io dataname or a space-separated list of m2io datanames
                that identify the values for single best pose per ligand-title
                sorting. The input string is broken into fields by split().
                Default is "r_i_glide_gscore".

        :type sort_property_2: string
        :param sort_property_2:
                An m2io dataname or a space-separated list of m2io datanames
                that identify the values for ligand-title group sorting.
                The input string is broken into fields by split(). Default is
                "r_i_glide_emodel".

            SP and HTVS poses should use 'r_i_glide_emodel' to pick the
            best pose of the same ligand-title.  XP poses should use
            'i_glide_XP_PoseRank' to determine the best pose order within
            a ligand-title group.

        :type block_sort: bool
        :param block_sort:
                Boolean for the initial sorting operation.  The pose
                library can always be resorted at a later point, this
                attribute just sets the default sorting operation for the
                instance.  If True, poses are organized into ligand-title
                grouping as normal.  If False, the poses are organized by a
                straight multi-key sort of sort_property_1. If there are
                multiple poses per ligand title this option should be used with
                care. Default is True.

        """

        self.file_name = file_name  # Maestro file path
        self.file_type = file_type  # lib or pv format, determined below
        self.sort_property_1 = sort_property_1.strip()
        self.sort_property_2 = sort_property_2.strip()
        self.file_index = 1  # Start file position index pv=2/lib=1
        self.block_sort = block_sort

        # Store pose file properties and initial rank by title
        self.pose_lib_order = []
        self.best_pose_lib_order = []  # multiple poses per ligand
        self.dock_results = {}
        self.statistic = {}

        # Assign the starting index, pv_files have a receptor, libs don't
        # if the type was not specified guess by the file ext
        if self.file_type:
            self.file_type = self.file_type.lower()
            if self.file_type == 'pv':
                self.file_index = 2
            elif self.file_type == 'lib':
                self.file_index = 1
            else:
                raise ValueError("Invalid file type: %s" % self.file_type)
        else:
            if fileutils.is_poseviewer_file(self.file_name):
                self.file_index = 2
                self.file_type = 'pv'
            else:
                self.file_index = 1
                self.file_type = 'lib'

        # Store structure file properties by initial file position index
        index = self.file_index
        for st in structure.StructureReader(self.file_name, self.file_index):
            self.dock_results[index] = {}
            for p in list(st.property):
                self.dock_results[index][p] = st.property[p]
            index += 1

        if self.block_sort:
            # Legacy mode of sorting puts the poses into ligand-title blocks.
            self.sortPoses()

        else:
            # Simple multi-key sorting, no ligand-title blocks
            self.sortByProperties()

        # Undecorate and sort the pose library by imposing this index order
        # on file.
        self.pose_lib_order = tuple(self.pose_lib_order)
        self.best_pose_lib_order = tuple(self.best_pose_lib_order)
        self.title_tuples = tuple(self.title_tuples)

        # Collect simple statistics on the library
        self.calculate_stats()

        return

    def _show(self):
        """
        Helper/debugging method to print out values.  Prints table of
        sorted poses.

        """
        print("PoseLibrary: Read %s, with type %s, at file index %d" %
              (self.file_name, self.file_type, self.file_index))
        print("PoseLibrary: Found %d total poses" % len(self.dock_results))
        print("Final Poses file index sorted order (All poses)")
        print(self.pose_lib_order)
        print("Final Poses file index sorted order (Best pose per ligand)")
        print(self.best_pose_lib_order)
        print("Table of Final Poses file index sorted order (All poses)")
        fields = []
        fields.append('Title')
        fields.extend(self.sort_property_1.split())
        fields.extend(self.sort_property_2.split())
        fields.append('file_index')
        print(" ".join(fields))
        for i in self.pose_lib_order:
            row = [self.dock_results[i]['s_m_title']]
            for p in self.sort_property_1.split():
                row.append(str(self.dock_results[i][p]))
            for p in self.sort_property_2.split():
                row.append(str(self.dock_results[i][p]))

            row.append(str(i))
            print("\t".join(row))

        return

[docs]    def sortPoses(self):
        """
        Helper method that calls sortIntraTitle(), then sortInterTitle().
        No return value.

        """
        self.sortIntraTitle()
        self.sortInterTitle()
        self.pose_lib_order = tuple(self.pose_lib_order)
        self.best_pose_lib_order = tuple(self.best_pose_lib_order)
        self.title_tuples = tuple(self.title_tuples)
        return

[docs]    def sortIntraTitle(self):
        """
        Creates tuples of poses with the same title, then sorts within
        each title-tuple by self.sort_property_2.  No return value.

        Ligands are not 'ranked' between titles by this function.

        Raises an Exception if a pose without a title is encountered.

        """

        self.title_tuples = []  # A list of index tuples
        self.title_blocks = {}  # Helper dict
        props = self.sort_property_2.split()
        sort_p_2 = []  # List of lists
        indexes = []  # List of file positions with the same title
        titles = []

        # Seed list of lists
        sort_p_2.append(titles)  # slice 0
        for p in props:
            sort_p_2.append([])
        sort_p_2.append(indexes)  # slice -1

        # Create ligand-title blocks
        for file_index in self.dock_results:

            title = self.dock_results[file_index].get('s_m_title')
            if not title:
                msg = 'No title for %d' % file_index
                raise Exception(msg)

            # Populate the list of lists
            sort_p_2[0].append(title)
            for prop_index, prop in enumerate(props):
                prop_index += 1  # title is 0th slice so bump the index
                lst = sort_p_2[prop_index]
                lst.append(self.dock_results[file_index].get(prop))

            sort_p_2[-1].append(file_index)

        # Decorate, sort, undecorate index
        title_family = list(zip(*sort_p_2))
        title_family.sort()
        for item in title_family:
            title = item[0]
            index = item[-1]
            # Check for existing hash key
            if title in self.title_blocks:
                self.title_blocks[title].append(index)
            else:
                self.title_blocks[title] = []
                self.title_blocks[title].append(index)

        # Title_blocks dict was useful for debugging/development, but
        # a list of tuples is nicer for sorting Inter-Title
        for title in self.title_blocks:
            self.title_tuples.append(tuple(self.title_blocks[title]))

        return

[docs]    def sortInterTitle(self):
        """
        Orders the title_tuple families (see `sortIntraTitle`) by
        self.sort_property_2.  No return value.

        """

        # Reset as an appendable lists
        self.best_pose_lib_order = []
        props = self.sort_property_1.split()

        # Sort the tuples by their leading member's self.sort_property_1
        families = []
        sort_p_1 = []  # seed list of lists
        for p in props:
            sort_p_1.append([])
        sort_p_1.append(families)  # last slice, -1

        # Populate the list of lists
        for item in self.title_tuples:
            best_member_index = item[0]
            for prop_index, prop in enumerate(props):
                lst = sort_p_1[prop_index]
                lst.append(self.dock_results[best_member_index].get(prop))
            sort_p_1[-1].append(item[:])  # Copy the family index order

        # Decorate, sort, undecorate index tuples
        pose_list = list(zip(*sort_p_1))
        pose_list.sort()
        self.title_tuples = []  # Reset title_tuples as an appendable list
        for item in pose_list:
            family = item[-1]
            self.title_tuples.append(family[:])

        # Pick the first element to get the best ligand order
        for item in self.title_tuples:
            self.best_pose_lib_order.append(item[0])

        # Flatten the tuples into a list of all
        self.pose_lib_order = []
        for item in self.title_tuples:
            self.pose_lib_order.extend(list(item[:]))

        return

[docs]    def sortByProperties(self):
        """
        Orders the pose library in a Title agnostic manner, considering
        only sort_property_1; a simple (multi-key) sort of the docking
        results. The instance attribute 'pose_lib_order' is reassigned to
        the new ordered tuple, but since the block ordering is lost
        the attributes 'best_pose_lib_order' and 'title_tuples' are
        redefined as empty tuples, and 'title_blocks' is redefined to an
        empty dictionary.

        """

        # Reset irrelevant data structures
        self.best_pose_lib_order = ()
        self.title_tuples = ()
        self.title_blocks = {}

        # Reset as an appendable list
        self.pose_lib_order = []
        props = self.sort_property_1.split()
        sort_p_1 = []  # List of lists
        indexes = []  # List of file positions

        # Seed list of lists
        for p in props:
            sort_p_1.append([])
        sort_p_1.append(indexes)  # slice -1

        # Populate the list of lists
        for pose_index in self.dock_results:
            for prop_index, p in enumerate(props):
                sort_p_1[prop_index].append(
                    self.dock_results[pose_index].get(p))
            sort_p_1[-1].append(pose_index)

        # Decorate, sort, undecorate index
        pose_organizer = list(zip(*sort_p_1))
        pose_organizer.sort()
        for item in pose_organizer:
            self.pose_lib_order.append(item[-1])

        self.pose_lib_order = tuple(self.pose_lib_order)

        return

[docs]    def calculate_stats(self):
        """
        Calculate simple statistics on the glide scores.  Sets the
        self.statistic dictionary and adds the standard score (glide gscore)
        to the self.dock_results dict.

        Returns None.

        """

        # List comprehension of glide scores
        glide_score = [
            self.dock_results[i]['r_i_glide_gscore'] for i in self.dock_results
        ]

        glide_score_mean = old_div(sum(glide_score), len(glide_score))
        glide_score_stdev = math.sqrt(
            old_div(sum([(i - glide_score_mean)**2 for i in glide_score]),
                    len(glide_score)))
        glide_score_min = min(glide_score)
        glide_score_max = max(glide_score)
        # Median score.  // is a floor division operator
        glide_score_median = glide_score[len(glide_score) // 2]

        self.statistic['glide_gscore_min'] = glide_score_min
        self.statistic['glide_gscore_max'] = glide_score_max
        self.statistic['glide_gscore_mean'] = glide_score_mean
        self.statistic['glide_gscore_stdev'] = glide_score_stdev
        self.statistic['glide_gscore_median'] = glide_score_median

        for i in self.dock_results:
            score = self.dock_results[i]['r_i_glide_gscore']
            std_score = None
            if glide_score_stdev > 0:
                std_score = float(
                    old_div((score - glide_score_mean), glide_score_stdev))
            self.dock_results[i]['r_user_std_glide_gscore'] = std_score

        return

[docs]    def write(self, output_lib_name="", indexes=(), block_size=4000):
        """
        Outputs an ordered pose lib file.  The pose order
        is the same as indexes' sequence of original file positions.

        This method loads all pose structure.Structure objects into memory
        only if there are fewer than 'block_size' structures.  Otherwise,
        it reads the input file multiple times, sorting structures in
        'block_size' chunks until the entire library is sorted.

        Returns None.

        """

        pose_lib_indexes = list(indexes)
        pose_indexes = []
        sts = []
        (self.tmp_lib_desc, self.tmp_lib) = tempfile.mkstemp(
            #"pose_library__tmp_UNIQUEGIBBERISH.mae"
            suffix='.mae',
            prefix='pose_library_tmp_',
            dir=os.getcwd(),
            text=True)

        # See if the file size is bigger than the block size, we don't
        # have to count them all, just determine if we have more than the
        # block size.  enum is 0 based, so we add 1.
        for count, st in enumerate(structure.StructureReader(self.file_name)):
            if count + 1 > block_size:
                break

        # 'Normal' sized file, should be sortable within memory
        if count < block_size:
            if self.debug:
                print("Sort all structures at once, read from file index: %d" \
                    % self.file_index)

            file_index = self.file_index
            for st in structure.StructureReader(self.file_name,
                                                self.file_index):
                # Skip any index not in the list
                if file_index in pose_lib_indexes:
                    sts.append(st)
                    pose_indexes.append(pose_lib_indexes.index(file_index))
                file_index += 1

            sorted_sts = list(zip(pose_indexes, sts))
            sorted_sts.sort()
            for (i, st) in sorted_sts:
                st.append(self.tmp_lib)

        # 'Big' files can't be sorted in memory at once, so sort in batches
        else:
            if self.debug:
                print("Sorting structures in batches")

            while pose_lib_indexes:
                islice = []
                for i in range(1, block_size):
                    try:
                        islice.append(pose_lib_indexes.pop(0))
                    except IndexError:
                        break  # exit for loop

                if self.debug:
                    print("Sorting slice:")
                    print(islice)

                sts = []
                pose_indexes = []
                sorted_sts = []
                file_index = self.file_index
                for st in structure.StructureReader(self.file_name,
                                                    self.file_index):
                    if file_index in islice:
                        sts.append(st)
                        pose_indexes.append(islice.index(file_index))

                    file_index += 1

                sorted_sts = list(zip(pose_indexes, sts))
                sorted_sts.sort()

                for (i, st) in sorted_sts:
                    st.append(self.tmp_lib)

        os.close(self.tmp_lib_desc)
        if sys.platform == 'win32' and os.path.exists(output_lib_name):
            os.remove(output_lib_name)
        os.rename(self.tmp_lib, output_lib_name)

        return

[docs]    def writeValidPoses(
            self,
            output_lib_name="",
            actives=[],  # noqa: M511
            max_pose_per_active=10,
            max_pose_per_decoy=1,
            max_number_decoys=1000,
            block_size=2000):
        """
        Writes a pose library with multiple poses per active but a single
        pose per 'decoy'.  Returns None.

        """

        # Container for pose indexes
        valid_poses = []

        # Container for pose accounting
        valid_count = {}
        valid_count['__decoys'] = 0

        for i in self.pose_lib_order:
            title = self.dock_results[i]['s_m_title']
            if title not in valid_count:
                valid_count[title] = 0

            if title in actives and valid_count[title] < max_pose_per_active:
                valid_poses.append(i)
                valid_count[title] += 1

            elif valid_count[title] < max_pose_per_decoy and \
                valid_count['__decoys'] < max_number_decoys:
                valid_poses.append(i)
                valid_count[title] += 1
                valid_count['__decoys'] += 1

        self.write(output_lib_name, indexes=valid_poses, block_size=block_size)

        return

[docs]    def writeTopPoses(self,
                      output_lib_name="",
                      max_pose_per_title=1,
                      block_size=2000):
        """
        Writes a pose library with the top N poses per ligand.
        Returns None.

        Assumes self.title_tuples is sorted in the desired order.

        """

        # Set the number of poses to grab
        N = max_pose_per_title

        # Container for pose indexes
        indexes = []

        # Get the pose indexes
        for item in self.title_tuples:
            indexes.extend(item[:N])

        # Verbose
        if self.debug:
            print("writeTopPoses... max_pose_per_title: ", N)
            print("writeTopPoses... indexes:")
            print(indexes)

        # Write them out
        self.write(output_lib_name, indexes=indexes, block_size=block_size)

        return


# EOF