"""
Functions and classes for calculating Glide Docking Enrichment Factors and
creating sorted pose libraries.
Copyright Schrodinger, LLC. All rights reserved.
"""
#Contributors: K Shawn Watts, Jeff Saunders
################################################################################
# Packages
################################################################################
import math
import os
import sys
import tempfile
from past.utils import old_div
import schrodinger.analysis.enrichment as enrichment
import schrodinger.structure as structure
import schrodinger.structutils.sort as sort
import schrodinger.utils.fileutils as fileutils
################################################################################
# Globals
################################################################################
_version = '$Revision: 1.21 $'
################################################################################
# Functions
################################################################################
[docs]def get_enrichment_calculator(
file_name="",
file_type=None, # 'pv' or 'lib'
actives=[], # noqa: M511 # titles of known actives found in file_name
report_header="",
total_ligands=0,
sort_property_1="r_i_glide_gscore",
sort_property_2="r_i_glide_emodel",
):
"""
Returns an schrodinger.analysis.enrichment.Calculator instance.
:type file_name: string
:param file_name:
The path to file upon which to operate. The 'file_name'
pose file may contain multiple poses for each ligand, but
the analysis requires culling to the best pose per ligand.
Default is ''.
:type file_type: string
:param file_type:
Should be 'pv' or 'lib'; used to determine the position of the
first ligand pose. Default is 'pv'.
:type actives: list of strings
:param actives:
Each string corresponds to the title of a known active used in
the docking experiment. Default is [].
:type total_ligands: int
:param total_ligands:
Integer number of the total number of ligands used in the
experiment. Default is 0.
:type report_header: string
:param report_header:
Not implemented. This parameter is ignored.
:type sort_property_1: string
:param sort_property_1:
An m2io dataname that identifies the values for single pose per
ligand sorting. Default is "r_i_glide_gscore".
:type sort_property_2: string
:param sort_property_2:
An m2io dataname that identifies the values of intra-ligand
group sorting. SP and HTVS poses should use 'r_i_glide_emodel'
to pick the best pose of the same ligand-title. XP poses should
use 'i_glide_XP_PoseRank' to determine the best pose within a
ligand-title group. Default is "r_i_glide_emodel".
This is a convenience interface for
schrodinger.analysis.enrichment.Calculator,
which is the preferred Enrichment metric generator.
schrodinger.analysis.enrichment.Calculator has more metrics
(e.g. BEDROC, ROC, AUAC), support for interactive plotting, and
can generate plot png format files. This function also implements
sort.py for better sorting performance.
Two files are created as a result of running this function;
a list of active compound titles in <root>_actives.txt and
a sorted structure file in <root>_efcalc<ext>, where <root> is the
basename and <ext> the extension of the filename provided.
"""
(root, ext) = fileutils.splitext(file_name)
# Sort the structure file and write out just the 'best' pose per block.
# Assign the starting index, pv_files have a receptor, libs don't
# if the type was not specified guess by the file ext
file_index = None
if file_type:
file_type = file_type.lower()
if file_type == 'pv':
file_index = 2
elif file_type == 'lib':
file_index = 1
else:
raise ValueError("Invalid file type: %s" % file_type)
else:
# Note, this is a weak test based on file extention.
if fileutils.is_poseviewer_file(file_name):
file_index = 2
else:
file_index = 1
sort_criteria = [(prop, sort.ASCENDING) for prop in sort_property_1.split()]
intra_block_sort_criteria = [('s_m_title', sort.ASCENDING)]
intra_block_sort_criteria.extend([
(prop, sort.ASCENDING) for prop in sort_property_2.split()
])
sfs = sort.StructureFileSorter(
file_name,
file_index,
sort_criteria,
intra_block_sort_criteria,
keep_structures=sort._in_memory_sort_ok(file_name))
sfs.sort()
sorted_structure_file = '%s_efcalc%s' % (root, ext)
sfs.writeTopNFromBlock(sorted_structure_file, max_per_block=1)
# Write the active compound titles to disk.
actives_file = '%s_actives.txt' % (root)
actives_fh = open(actives_file, 'w')
for title in actives:
actives_fh.write("%s\n" % title)
actives_fh.close()
# Determine the number decoys used in the experiment.
num_decoys = total_ligands - len(actives)
ef_calc = enrichment.Calculator(actives=actives_file,
results=sorted_structure_file,
total_decoys=num_decoys)
return ef_calc
################################################################################
# Classes
################################################################################
#ev81790 Removed the deprecated Enrichment class.
[docs]class PoseLibrary:
"""
A helper class to sort a pose library. By default, multiple poses for
one ligand-title are 'block sorted'. i.e. grouped and sorted by Title
and sort_property_2, and each group is sorted by its best member's
sort_property_1. The pose library may be resorted by sort_property_1,
in a title agnostic manner, by calling sortByProperties().
API Example::
pl = glideanalysis.PoseLibrary(file_name='input_lib.mae')
pl.writeTopPoses('output_lib.mae')
:vartype debug: bool
:cvar debug:
Increases the verbosity of the various bookkeeping operations.
:vartype dock_results: dict
:ivar dock_results:
A dictionary of pose properties, pose file index keys for property
dict.
:vartype pose_lib_order: tuple
:ivar pose_lib_order:
A sorted tuple sequence of all pose file indexes.
:vartype best_pose_lib_order: tuple
:ivar best_pose_lib_order:
A sorted tuple sequence of poses file indexes, from just the best
pose of each title-group.
:vartype title_tuples: list
:ivar title_tuples:
A list of (pose indexes) tuples, one tuple per title.
:vartype title_blocks: dict
:ivar title_blocks:
A dictionary of pose index lists, 'title' keys for list of sorted
(intra title) indexes.
"""
debug = False
[docs] def __init__(
self,
file_name="",
file_type=None, # 'pv' or 'lib'
sort_property_1="r_i_glide_gscore",
sort_property_2="r_i_glide_emodel",
block_sort=True):
"""
:type file_name: string
:param file_name:
Path to the file upon with to operate. The 'file_name' pose
file may contain multiple poses for each ligand. Default is
"".
:type file_type: string
:param file_type:
Should be 'pv' or 'lib'; used to determine the position of the
first ligand pose. If not specified, determined from ext.
:type sort_property_1: string
:param sort_property_1:
An m2io dataname or a space-separated list of m2io datanames
that identify the values for single best pose per ligand-title
sorting. The input string is broken into fields by split().
Default is "r_i_glide_gscore".
:type sort_property_2: string
:param sort_property_2:
An m2io dataname or a space-separated list of m2io datanames
that identify the values for ligand-title group sorting.
The input string is broken into fields by split(). Default is
"r_i_glide_emodel".
SP and HTVS poses should use 'r_i_glide_emodel' to pick the
best pose of the same ligand-title. XP poses should use
'i_glide_XP_PoseRank' to determine the best pose order within
a ligand-title group.
:type block_sort: bool
:param block_sort:
Boolean for the initial sorting operation. The pose
library can always be resorted at a later point, this
attribute just sets the default sorting operation for the
instance. If True, poses are organized into ligand-title
grouping as normal. If False, the poses are organized by a
straight multi-key sort of sort_property_1. If there are
multiple poses per ligand title this option should be used with
care. Default is True.
"""
self.file_name = file_name # Maestro file path
self.file_type = file_type # lib or pv format, determined below
self.sort_property_1 = sort_property_1.strip()
self.sort_property_2 = sort_property_2.strip()
self.file_index = 1 # Start file position index pv=2/lib=1
self.block_sort = block_sort
# Store pose file properties and initial rank by title
self.pose_lib_order = []
self.best_pose_lib_order = [] # multiple poses per ligand
self.dock_results = {}
self.statistic = {}
# Assign the starting index, pv_files have a receptor, libs don't
# if the type was not specified guess by the file ext
if self.file_type:
self.file_type = self.file_type.lower()
if self.file_type == 'pv':
self.file_index = 2
elif self.file_type == 'lib':
self.file_index = 1
else:
raise ValueError("Invalid file type: %s" % self.file_type)
else:
if fileutils.is_poseviewer_file(self.file_name):
self.file_index = 2
self.file_type = 'pv'
else:
self.file_index = 1
self.file_type = 'lib'
# Store structure file properties by initial file position index
index = self.file_index
for st in structure.StructureReader(self.file_name, self.file_index):
self.dock_results[index] = {}
for p in list(st.property):
self.dock_results[index][p] = st.property[p]
index += 1
if self.block_sort:
# Legacy mode of sorting puts the poses into ligand-title blocks.
self.sortPoses()
else:
# Simple multi-key sorting, no ligand-title blocks
self.sortByProperties()
# Undecorate and sort the pose library by imposing this index order
# on file.
self.pose_lib_order = tuple(self.pose_lib_order)
self.best_pose_lib_order = tuple(self.best_pose_lib_order)
self.title_tuples = tuple(self.title_tuples)
# Collect simple statistics on the library
self.calculate_stats()
return
def _show(self):
"""
Helper/debugging method to print out values. Prints table of
sorted poses.
"""
print("PoseLibrary: Read %s, with type %s, at file index %d" %
(self.file_name, self.file_type, self.file_index))
print("PoseLibrary: Found %d total poses" % len(self.dock_results))
print("Final Poses file index sorted order (All poses)")
print(self.pose_lib_order)
print("Final Poses file index sorted order (Best pose per ligand)")
print(self.best_pose_lib_order)
print("Table of Final Poses file index sorted order (All poses)")
fields = []
fields.append('Title')
fields.extend(self.sort_property_1.split())
fields.extend(self.sort_property_2.split())
fields.append('file_index')
print(" ".join(fields))
for i in self.pose_lib_order:
row = [self.dock_results[i]['s_m_title']]
for p in self.sort_property_1.split():
row.append(str(self.dock_results[i][p]))
for p in self.sort_property_2.split():
row.append(str(self.dock_results[i][p]))
row.append(str(i))
print("\t".join(row))
return
[docs] def sortPoses(self):
"""
Helper method that calls sortIntraTitle(), then sortInterTitle().
No return value.
"""
self.sortIntraTitle()
self.sortInterTitle()
self.pose_lib_order = tuple(self.pose_lib_order)
self.best_pose_lib_order = tuple(self.best_pose_lib_order)
self.title_tuples = tuple(self.title_tuples)
return
[docs] def sortIntraTitle(self):
"""
Creates tuples of poses with the same title, then sorts within
each title-tuple by self.sort_property_2. No return value.
Ligands are not 'ranked' between titles by this function.
Raises an Exception if a pose without a title is encountered.
"""
self.title_tuples = [] # A list of index tuples
self.title_blocks = {} # Helper dict
props = self.sort_property_2.split()
sort_p_2 = [] # List of lists
indexes = [] # List of file positions with the same title
titles = []
# Seed list of lists
sort_p_2.append(titles) # slice 0
for p in props:
sort_p_2.append([])
sort_p_2.append(indexes) # slice -1
# Create ligand-title blocks
for file_index in self.dock_results:
title = self.dock_results[file_index].get('s_m_title')
if not title:
msg = 'No title for %d' % file_index
raise Exception(msg)
# Populate the list of lists
sort_p_2[0].append(title)
for prop_index, prop in enumerate(props):
prop_index += 1 # title is 0th slice so bump the index
lst = sort_p_2[prop_index]
lst.append(self.dock_results[file_index].get(prop))
sort_p_2[-1].append(file_index)
# Decorate, sort, undecorate index
title_family = list(zip(*sort_p_2))
title_family.sort()
for item in title_family:
title = item[0]
index = item[-1]
# Check for existing hash key
if title in self.title_blocks:
self.title_blocks[title].append(index)
else:
self.title_blocks[title] = []
self.title_blocks[title].append(index)
# Title_blocks dict was useful for debugging/development, but
# a list of tuples is nicer for sorting Inter-Title
for title in self.title_blocks:
self.title_tuples.append(tuple(self.title_blocks[title]))
return
[docs] def sortInterTitle(self):
"""
Orders the title_tuple families (see `sortIntraTitle`) by
self.sort_property_2. No return value.
"""
# Reset as an appendable lists
self.best_pose_lib_order = []
props = self.sort_property_1.split()
# Sort the tuples by their leading member's self.sort_property_1
families = []
sort_p_1 = [] # seed list of lists
for p in props:
sort_p_1.append([])
sort_p_1.append(families) # last slice, -1
# Populate the list of lists
for item in self.title_tuples:
best_member_index = item[0]
for prop_index, prop in enumerate(props):
lst = sort_p_1[prop_index]
lst.append(self.dock_results[best_member_index].get(prop))
sort_p_1[-1].append(item[:]) # Copy the family index order
# Decorate, sort, undecorate index tuples
pose_list = list(zip(*sort_p_1))
pose_list.sort()
self.title_tuples = [] # Reset title_tuples as an appendable list
for item in pose_list:
family = item[-1]
self.title_tuples.append(family[:])
# Pick the first element to get the best ligand order
for item in self.title_tuples:
self.best_pose_lib_order.append(item[0])
# Flatten the tuples into a list of all
self.pose_lib_order = []
for item in self.title_tuples:
self.pose_lib_order.extend(list(item[:]))
return
[docs] def sortByProperties(self):
"""
Orders the pose library in a Title agnostic manner, considering
only sort_property_1; a simple (multi-key) sort of the docking
results. The instance attribute 'pose_lib_order' is reassigned to
the new ordered tuple, but since the block ordering is lost
the attributes 'best_pose_lib_order' and 'title_tuples' are
redefined as empty tuples, and 'title_blocks' is redefined to an
empty dictionary.
"""
# Reset irrelevant data structures
self.best_pose_lib_order = ()
self.title_tuples = ()
self.title_blocks = {}
# Reset as an appendable list
self.pose_lib_order = []
props = self.sort_property_1.split()
sort_p_1 = [] # List of lists
indexes = [] # List of file positions
# Seed list of lists
for p in props:
sort_p_1.append([])
sort_p_1.append(indexes) # slice -1
# Populate the list of lists
for pose_index in self.dock_results:
for prop_index, p in enumerate(props):
sort_p_1[prop_index].append(
self.dock_results[pose_index].get(p))
sort_p_1[-1].append(pose_index)
# Decorate, sort, undecorate index
pose_organizer = list(zip(*sort_p_1))
pose_organizer.sort()
for item in pose_organizer:
self.pose_lib_order.append(item[-1])
self.pose_lib_order = tuple(self.pose_lib_order)
return
[docs] def calculate_stats(self):
"""
Calculate simple statistics on the glide scores. Sets the
self.statistic dictionary and adds the standard score (glide gscore)
to the self.dock_results dict.
Returns None.
"""
# List comprehension of glide scores
glide_score = [
self.dock_results[i]['r_i_glide_gscore'] for i in self.dock_results
]
glide_score_mean = old_div(sum(glide_score), len(glide_score))
glide_score_stdev = math.sqrt(
old_div(sum([(i - glide_score_mean)**2 for i in glide_score]),
len(glide_score)))
glide_score_min = min(glide_score)
glide_score_max = max(glide_score)
# Median score. // is a floor division operator
glide_score_median = glide_score[len(glide_score) // 2]
self.statistic['glide_gscore_min'] = glide_score_min
self.statistic['glide_gscore_max'] = glide_score_max
self.statistic['glide_gscore_mean'] = glide_score_mean
self.statistic['glide_gscore_stdev'] = glide_score_stdev
self.statistic['glide_gscore_median'] = glide_score_median
for i in self.dock_results:
score = self.dock_results[i]['r_i_glide_gscore']
std_score = None
if glide_score_stdev > 0:
std_score = float(
old_div((score - glide_score_mean), glide_score_stdev))
self.dock_results[i]['r_user_std_glide_gscore'] = std_score
return
[docs] def write(self, output_lib_name="", indexes=(), block_size=4000):
"""
Outputs an ordered pose lib file. The pose order
is the same as indexes' sequence of original file positions.
This method loads all pose structure.Structure objects into memory
only if there are fewer than 'block_size' structures. Otherwise,
it reads the input file multiple times, sorting structures in
'block_size' chunks until the entire library is sorted.
Returns None.
"""
pose_lib_indexes = list(indexes)
pose_indexes = []
sts = []
(self.tmp_lib_desc, self.tmp_lib) = tempfile.mkstemp(
#"pose_library__tmp_UNIQUEGIBBERISH.mae"
suffix='.mae',
prefix='pose_library_tmp_',
dir=os.getcwd(),
text=True)
# See if the file size is bigger than the block size, we don't
# have to count them all, just determine if we have more than the
# block size. enum is 0 based, so we add 1.
for count, st in enumerate(structure.StructureReader(self.file_name)):
if count + 1 > block_size:
break
# 'Normal' sized file, should be sortable within memory
if count < block_size:
if self.debug:
print("Sort all structures at once, read from file index: %d" \
% self.file_index)
file_index = self.file_index
for st in structure.StructureReader(self.file_name,
self.file_index):
# Skip any index not in the list
if file_index in pose_lib_indexes:
sts.append(st)
pose_indexes.append(pose_lib_indexes.index(file_index))
file_index += 1
sorted_sts = list(zip(pose_indexes, sts))
sorted_sts.sort()
for (i, st) in sorted_sts:
st.append(self.tmp_lib)
# 'Big' files can't be sorted in memory at once, so sort in batches
else:
if self.debug:
print("Sorting structures in batches")
while pose_lib_indexes:
islice = []
for i in range(1, block_size):
try:
islice.append(pose_lib_indexes.pop(0))
except IndexError:
break # exit for loop
if self.debug:
print("Sorting slice:")
print(islice)
sts = []
pose_indexes = []
sorted_sts = []
file_index = self.file_index
for st in structure.StructureReader(self.file_name,
self.file_index):
if file_index in islice:
sts.append(st)
pose_indexes.append(islice.index(file_index))
file_index += 1
sorted_sts = list(zip(pose_indexes, sts))
sorted_sts.sort()
for (i, st) in sorted_sts:
st.append(self.tmp_lib)
os.close(self.tmp_lib_desc)
if sys.platform == 'win32' and os.path.exists(output_lib_name):
os.remove(output_lib_name)
os.rename(self.tmp_lib, output_lib_name)
return
[docs] def writeValidPoses(
self,
output_lib_name="",
actives=[], # noqa: M511
max_pose_per_active=10,
max_pose_per_decoy=1,
max_number_decoys=1000,
block_size=2000):
"""
Writes a pose library with multiple poses per active but a single
pose per 'decoy'. Returns None.
"""
# Container for pose indexes
valid_poses = []
# Container for pose accounting
valid_count = {}
valid_count['__decoys'] = 0
for i in self.pose_lib_order:
title = self.dock_results[i]['s_m_title']
if title not in valid_count:
valid_count[title] = 0
if title in actives and valid_count[title] < max_pose_per_active:
valid_poses.append(i)
valid_count[title] += 1
elif valid_count[title] < max_pose_per_decoy and \
valid_count['__decoys'] < max_number_decoys:
valid_poses.append(i)
valid_count[title] += 1
valid_count['__decoys'] += 1
self.write(output_lib_name, indexes=valid_poses, block_size=block_size)
return
[docs] def writeTopPoses(self,
output_lib_name="",
max_pose_per_title=1,
block_size=2000):
"""
Writes a pose library with the top N poses per ligand.
Returns None.
Assumes self.title_tuples is sorted in the desired order.
"""
# Set the number of poses to grab
N = max_pose_per_title
# Container for pose indexes
indexes = []
# Get the pose indexes
for item in self.title_tuples:
indexes.extend(item[:N])
# Verbose
if self.debug:
print("writeTopPoses... max_pose_per_title: ", N)
print("writeTopPoses... indexes:")
print(indexes)
# Write them out
self.write(output_lib_name, indexes=indexes, block_size=block_size)
return
# EOF