"""
This module contains the class for generating the default enrichment report.
Example metrics from two different screens:
The enrichment metrics from example_A are generally more favorable than
those from example_B.
Enrichment Report
-----------------
Actives file: example_A_actives.txt
Results: example_A_dock_pv.rept
Total actives: 117
Total ligands(actives+decoys): 1117
Number of ranked actives: 117
BEDROC(alpha=160.9, alpha*Ra=16.8534): 1.000
BEDROC(alpha=20.0, alpha*Ra=2.0949): 0.914
BEDROC(alpha=8.0, alpha*Ra=0.8380): 0.868
ROC: 0.92
RIE: 7.65
Area under accumulation curve: 0.87
Ave. Number of outranking decoys: 82
Count and percentage of actives in top N% of decoy results.
# Actives (1%|2%|5%|10%|20%): 90| 90| 92| 94| 97
% Actives (1%|2%|5%|10%|20%): 76.9| 76.9| 78.6| 80.3| 82.9
Enrichment Factors with respect to N% sample size.
EF (1%|2%|5%|10%|20%): 9.5| 9.5| 9.4| 7.7| 4.1
EF*(1%|2%|5%|10%|20%): 77| 38| 16| 8| 4.1
EF'(1%|2%|5%|10%|20%): 2.9e+02|1.7e+02| 54| 23| 9.9
Eff(1%|2%|5%|10%|20%): 0.974| 0.949| 0.88| 0.779| 0.611
Enrichment Factors with respect to N% actives recovered.
EF (40%|50%|60%|70%|80%|90%|100%): 9.3| 9.4| 9.4| 9.2| 5.7| 2| 1.3
EF*(40%|50%|60%|70%|80%|90%|100%): 4e+02|5e+02|6e+02|2.3e+02| 13| 2.2| 1.4
EF'(40%|50%|60%|70%|80%|90%|100%): 3.8e+02|4.3e+02|4.7e+02|4.3e+02| 38| 4.7| 2.7
FOD(40%|50%|60%|70%|80%|90%|100%): 9e-05|0.0003|0.0004|0.0006|0.003| 0.03| 0.08
Enrichment Report
-----------------
Actives file: example_B_actives.txt
Results: example_B_dock_pv.rept
Total actives: 62
Total ligands(actives+decoys): 1062
Number of ranked actives: 62
BEDROC(alpha=160.9, alpha*Ra=9.3934): 0.703
BEDROC(alpha=20.0, alpha*Ra=1.1676): 0.256
BEDROC(alpha=8.0, alpha*Ra=0.4670): 0.323
ROC: 0.72
RIE: 3.02
Area under accumulation curve: 0.71
Ave. Number of outranking decoys: 281
Count and percentage of actives in top N% of decoy results.
# Actives (1%|2%|5%|10%|20%): 8| 8| 9| 13| 23
% Actives (1%|2%|5%|10%|20%): 12.9| 12.9| 14.5| 21.0| 37.1
Enrichment Factors with respect to N% sample size.
EF (1%|2%|5%|10%|20%): 12| 6.5| 2.9| 2.1| 1.6
EF*(1%|2%|5%|10%|20%): 13| 6.5| 2.9| 2.1| 1.9
EF'(1%|2%|5%|10%|20%): 23| 12| 5.3| 3.4| 2.3
Eff(1%|2%|5%|10%|20%): 0.856| 0.732| 0.488| 0.354| 0.299
Enrichment Factors with respect to N% actives recovered.
EF (40%|50%|60%|70%|80%|90%|100%): 1.8| 2| 1.9| 2| 1.6| 1.6| 1
EF*(40%|50%|60%|70%|80%|90%|100%): 1.9| 2.1| 2| 2.1| 1.6| 1.6| 1.1
EF'(40%|50%|60%|70%|80%|90%|100%): 2.3| 2.2| 2.2| 2.2| 2.1| 2| 1.6
FOD(40%|50%|60%|70%|80%|90%|100%): 0.1| 0.1| 0.1| 0.2| 0.2| 0.2| 0.3
Copyright Schrodinger, LLC. All rights reserved.
"""
import os
import sys
from collections import defaultdict
from decorator import decorator
import schrodinger.application.canvas.fingerprint as canvas_fp
import schrodinger.application.canvas.similarity as canvas_sim
import schrodinger.structure as structure
import schrodinger.utils.fileutils as fileutils
import schrodinger.utils.log as log
from schrodinger.analysis.enrichment import enrichment_input
from schrodinger.analysis.enrichment import metrics
from schrodinger.analysis.enrichment import plotter
from schrodinger.application.glide.utils import is_valid_pv_file
CSV_FILE_EXT = ('.csv', '.CSV')
ST_TITLE_HEADER = "s_m_title"
logger = log.get_output_logger(__file__)
@decorator
def _exception_handler(func, *args, **kwargs):
"""
A decorator to handle exceptions bubbling from functions in metrics.py.
"""
# special_functions stores a list of Calculator class function names as
# keys. These functions return multiple values, thus need to be
# handled individually to ensure the conformity of return values.
special_functions = {
"calcBEDROC": (None, None),
}
try:
return func(*args, **kwargs)
except (ValueError, ZeroDivisionError, OverflowError, AttributeError) as e:
func_name = func.__name__
logger.debug(f"{func_name} caught {type(e).__name__}: {e}")
logger.debug("Setting output as None.")
return special_functions.get(func_name, None)
###############################################################################
# Classes
###############################################################################
[docs]class Calculator:
"""
A class to report default set of enrichment terms for a screen.
By default, a report containing a suite of metrics is directed to
standard out.
:note: This is not the preferred way to obtain enrichment metrics.
Please consider using parser and metric functions directly in
enrichment_input.py and metrics.py if possible.
:cvar ef_precision: Number of decimals when reporting EF values.
Default = 2
:vartype ef_precision: int
:cvar efp_precision: Number of decimals when reporting EF' values.
Default = 2
:vartype efp_precision: int
:cvar efs_precision: Number if decimals when reporting EF* values.
Default = 2
:vartype efs_precision: int
:cvar eff_precision: Number of decimals when reporting Eff values.
Default = 3
:vartype eff_precision: int
:cvar fod_precision: Number of decimals when reporting FOD values.
Default = 1
:vartype fod_precision: int
"""
# Default reporting formats with %g idiom.
ef_precision = 2
efs_precision = 2
efp_precision = 2
eff_precision = 3
fod_precision = 1
[docs] def __init__(self, actives, results, total_decoys=0):
"""
:param actives: File name or a list of strings containing all active
titles. If a file name is provided, the input should be
a valid csv or structure file, a raw text file
containing one line per title is also acceptable.
Duplicate titles are discarded, only the first
occurrence is recorded.
:type actives: str or list(str)
:param results: File name, a list of strings, or a list of
structure.Structure containing the virtual screening
result ordered by the scoring metric. If a file name is
provided, the input should be a valid csv file or
structure file. Duplicate titles are discarded, only the
first occurrence is recorded.
:type results: str or list(str) or list(structure.Structure)
:param total_decoys: The total number of decoys. If specified, the total
number of ligands will be distinct active titles
from actives file + num_decoy. This will enable the
calculation of the correction term in calc_AUAC,
should the total number of ligands not equal to the
total number of ranked titles in results_file.
:type total_decoys: int
"""
# Used for default enrichment report
self.percentage_active = (0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
self.percentage_ligand = (0.01, 0.02, 0.05, 0.10, 0.20)
# Values used in fingerprint components, uses in report()
self.min_Tc_total_actives = None
self.missing_active_titles = None
# Parse the actives file/list
self.active_titles = self._parseActives(actives)
# Parse the results file/list
self._parseResults(results, total_decoys)
# Shortcut for self.report()
if isinstance(actives, str):
self.actives = os.path.basename(actives)
else:
self.actives = "List of active titles"
if isinstance(results, str):
self.results = os.path.basename(results)
else:
self.results = "List of ranked ligand titles"
self.total_decoys = self.total_ligands - self.total_actives
# Calculate all of the default metrics
self._calculateMetrics()
def __repr__(self):
rpr = "EnrichmentReporter(%s, %s, %d)" % (
self.actives,
self.results,
int(self.total_decoys) # Cast back to input integer.
)
return rpr
def _parseActives(self, actives_file):
"""
parse actives_file, return distinct active titles from the actives file.
Repeated active titles are ignored.
:param actives_file: File name or a list of strings containing all
active titles. If a file name is provided, the
input should be a valid csv or structure file,
a raw text file containing one line per title
is also acceptable. Duplicate titles are
discarded, only the first occurrence is
recorded.
:type actives_file: str or list(str)
:return: distinct active titles from the actives file.
:rtype: set(str)
"""
if isinstance(actives_file, str):
_, ext = fileutils.splitext(actives_file)
# csv file
if ext in CSV_FILE_EXT:
active_titles = \
enrichment_input.extract_active_titles_from_csv(
actives_file)
# structure file
elif fileutils.get_structure_file_format(actives_file):
active_titles = \
enrichment_input.extract_active_titles_from_mae(
actives_file)
# assume input is a plain text file otherwise
else:
active_titles = \
enrichment_input.extract_active_titles_from_txt(
actives_file)
# Actives input is a sequence of structure titles.
elif hasattr(actives_file, '__iter__'):
if not actives_file:
raise TypeError("Actives list must not be empty.")
if isinstance(actives_file[0], str):
active_titles = \
enrichment_input.extract_active_titles_from_list(
actives_file)
else:
raise TypeError("Actives list must be a list of strings.")
else:
raise TypeError("Actives input type must be a string or a list.")
return active_titles
def _parseResults(self, results_file, num_decoy):
"""
Identify the file type of results_file and call the correct parser
function to set rank and count related class variables.
:param results_file: File name, a list of strings, or a list of
structure.Structure containing the virtual
screening result ordered by the scoring metric.
If a file name is provided, the input should be a
valid csv file or structure file. Duplicate titles
are discarded, only the first occurrence is
recorded.
:type results_file: str or list(str) or list(structure.Structure)
:param num_decoy: The total number of decoys. If specified, the total
number of ligands will be distinct active titles
from actives file + num_decoy. This will enable the
calculation of the correction term in calc_AUAC,
should the total number of ligands not equal to the
total number of ranked titles in results_file.
:type num_decoy: int
"""
if isinstance(results_file, str):
_, ext = fileutils.splitext(results_file)
# csv file
if ext in CSV_FILE_EXT:
self.total_actives, self.total_ligands, self.active_ranks, \
self.adjusted_active_ranks, self.total_ranked, \
self.title_ranks = \
enrichment_input.extract_ranks_from_csv(
csv_file_name=results_file,
active_titles=self.active_titles,
num_decoy=num_decoy)
# structure file
elif fileutils.get_structure_file_format(results_file):
self.total_actives, self.total_ligands, self.active_ranks, \
self.adjusted_active_ranks, self.total_ranked, \
self.title_ranks = \
enrichment_input.extract_ranks_from_mae(
mae_file_name=results_file,
active_titles=self.active_titles,
num_decoy=num_decoy)
self._getFingerprintComponent(results_file)
else:
msg = "Results must be a csv file or a maestro file."
raise TypeError(msg)
# Actives input is a sequence of structure titles.
elif hasattr(results_file, '__iter__'):
if not results_file:
raise TypeError("Results list must not be empty.")
# list of strings
if isinstance(results_file[0], str):
self.total_actives, self.total_ligands, self.active_ranks, \
self.adjusted_active_ranks, self.total_ranked, \
self.title_ranks = \
enrichment_input.extract_ranks_from_list(
titles_iter=results_file,
active_titles=self.active_titles,
num_decoy=num_decoy)
# list of structure.Structure
elif isinstance(results_file[0], structure.Structure):
self.total_actives, self.total_ligands, self.active_ranks, \
self.adjusted_active_ranks, self.total_ranked, \
self.title_ranks = \
enrichment_input.extract_ranks_from_structures(
structure_iter=results_file,
active_titles=self.active_titles,
num_decoy=num_decoy)
self._getFingerprintComponent(results_file)
else:
msg = "Results list must be a list of strings " \
"or structure.Structure."
raise TypeError(msg)
else:
raise TypeError("Results input type must be a string or a list.")
def _getFingerprintComponent(self, results_file):
"""
Initialize and return a data class object needed for fingerprint-related
calculations. This is basically enrichment_input.getFingerprintComponent
but we'll also need to find the missing active titles for the reporter.
:param results_file: File name, a list of strings, or a list of
structure.Structure containing the virtual
screening result ordered by the scoring metric.
If a file name is provided, the input should be a
valid csv file or structure file. Duplicate titles
are discarded, only the first occurrence is
recorded.
:type results_file: str or list(str) or list(structure.Structure)
"""
fp_gen = canvas_fp.CanvasFingerprintGenerator(logger=logger)
fp_gen.setType('Linear')
fp_gen.setAtomBondTyping(fp_gen.getDefaultAtomTypingScheme())
fp_sim = canvas_sim.CanvasFingerprintSimilarity(logger=logger)
fp_sim.setMetric('Tanimoto')
# Bind initial values
title_ranks = {}
hits_titles = set()
active_fingerprint = {}
if isinstance(results_file, str):
# Assume Glide results are typical, skip the receptor in pv files
file_index = 2 if is_valid_pv_file(results_file) else 1
for st in structure.StructureReader(results_file, file_index):
title = st.property[ST_TITLE_HEADER]
enrichment_input._fingerprint_gen(st, title, self.active_titles,
fp_gen, hits_titles,
title_ranks,
active_fingerprint)
elif hasattr(results_file, '__iter__'):
for st in results_file:
title = st.property[ST_TITLE_HEADER]
enrichment_input._fingerprint_gen(st, title, self.active_titles,
fp_gen, hits_titles,
title_ranks,
active_fingerprint)
min_tc = enrichment_input._calc_min_tc_total_actives(
fp_sim, active_fingerprint)
self.min_Tc_total_actives = min_tc
self.fingerprint_object = enrichment_input.FingerprintComponent(
fp_gen, fp_sim, active_fingerprint, min_tc)
self.missing_active_titles = \
[title for title in self.active_titles if title not in hits_titles]
###################
# Metrics functions
###################
def _getActiveSampleSizeStar(self, fraction_of_actives):
return metrics.get_active_sample_size_star(
total_actives=self.total_actives,
total_ligands=self.total_ligands,
adjusted_active_ranks=self.adjusted_active_ranks,
fraction_of_actives=fraction_of_actives)
def _getActiveSampleSize(self, fraction_of_actives):
return metrics.get_active_sample_size(
total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
fraction_of_actives=fraction_of_actives)
def _getDecoySampleSize(self, fraction_of_decoys):
return metrics.get_decoy_sample_size(
total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
fraction_of_decoys=fraction_of_decoys)
[docs] @_exception_handler
def calcEF(self, n_sampled_set, min_actives=None):
return metrics.calc_EF(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
n_sampled_set=n_sampled_set,
min_actives=min_actives)
[docs] @_exception_handler
def calcEFStar(self, n_sampled_decoy_set, min_actives=None):
return metrics.calc_EFStar(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
n_sampled_decoy_set=n_sampled_decoy_set,
min_actives=min_actives)
[docs] @_exception_handler
def calcEFP(self, n_sampled_decoy_set, min_actives=None):
return metrics.calc_EFP(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
n_sampled_decoy_set=n_sampled_decoy_set,
min_actives=min_actives)
[docs] @_exception_handler
def calcFOD(self, fraction_of_actives):
return metrics.calc_FOD(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
fraction_of_actives=fraction_of_actives)
[docs] @_exception_handler
def calcEFF(self, fraction_of_decoys):
return metrics.calc_EFF(
total_actives=self.total_actives,
total_ligands=self.total_ligands,
adjusted_active_ranks=self.adjusted_active_ranks,
fraction_of_decoys=fraction_of_decoys)
[docs] def calcActivesInN(self, n_sampled_set):
return metrics.calc_ActivesInN(active_ranks=self.active_ranks,
n_sampled_set=n_sampled_set)
[docs] def calcActivesInNStar(self, n_sampled_set):
return metrics.calc_ActivesInNStar(
adjusted_active_ranks=self.adjusted_active_ranks,
n_sampled_set=n_sampled_set)
[docs] def calcAveNumberOutrankingDecoys(self):
return metrics.calc_AveNumberOutrankingDecoys(
active_ranks=self.active_ranks)
[docs] @_exception_handler
def calcBEDROC(self, alpha=20.0):
return metrics.calc_BEDROC(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
alpha=alpha)
[docs] @_exception_handler
def calcRIE(self, alpha=20.0):
return metrics.calc_RIE(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
alpha=alpha)
[docs] @_exception_handler
def calcAUAC(self):
return metrics.calc_AUAC(total_actives=self.total_actives,
total_ligands=self.total_ligands,
total_ranked=self.total_ranked,
active_ranks=self.active_ranks)
[docs] @_exception_handler
def calcROC(self):
return metrics.calc_ROC(
total_actives=self.total_actives,
total_ligands=self.total_ligands,
adjusted_active_ranks=self.adjusted_active_ranks)
[docs] @_exception_handler
def calcDEF(self, n_sampled_set, min_actives=None):
return metrics.calc_DEF(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
title_ranks=self.title_ranks,
fingerprint_comp=self.fingerprint_object,
n_sampled_set=n_sampled_set,
min_actives=min_actives)
[docs] @_exception_handler
def calcDEFStar(self, n_sampled_decoy_set, min_actives=None):
return metrics.calc_DEFStar(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
title_ranks=self.title_ranks,
fingerprint_comp=self.fingerprint_object,
n_sampled_decoy_set=n_sampled_decoy_set,
min_actives=min_actives)
[docs] @_exception_handler
def calcDEFP(self, n_sampled_decoy_set, min_actives=None):
return metrics.calc_DEFP(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
title_ranks=self.title_ranks,
fingerprint_comp=self.fingerprint_object,
n_sampled_decoy_set=n_sampled_decoy_set,
min_actives=min_actives)
####################
# Plotter functions
####################
[docs] def calculateSpecificity(self, rank):
return plotter.calc_specificity_with_rank(
total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
rank=rank)
[docs] def getPercentScreenCurvePoints(self):
return plotter.get_percent_screen_curve_points(
total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks)
[docs] def getActiveRankCsvRows(self):
return plotter.get_plot_data(total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
title_ranks=self.title_ranks)
[docs] def savePlot(self,
png_file="plot.png",
title='Screen Results',
xlabel='1-Specificity',
ylabel='Sensitivity'):
return plotter.save_plot(
total_actives=self.total_actives,
total_ligands=self.total_ligands,
active_ranks=self.active_ranks,
adjusted_active_ranks=self.adjusted_active_ranks,
total_ranked=self.total_ranked,
png_file=png_file,
title=title,
xlabel=xlabel,
ylabel=ylabel)
####################
# Reporter functions
####################
def _calculateMetrics(self):
"""
Sets a suite of enrichment factor terms as instance data members.
"""
active_sample_size_list = [
self._getActiveSampleSize(e) for e in self.percentage_active
]
active_sample_size_list_star = [
self._getActiveSampleSizeStar(e) for e in self.percentage_active
]
m_active_size_list = [
round(self.total_actives * e) for e in self.percentage_active
]
ligand_sample_size_list = [
self._getDecoySampleSize(e) for e in self.percentage_ligand
]
m_ligand_size_list = [
round(self.total_ligands * e) for e in self.percentage_ligand
]
self.ave_num_outranking_decoys = self.calcAveNumberOutrankingDecoys()
self.bedroc20, self.alphaRa20 = self.calcBEDROC(alpha=20.0)
self.bedroc160_9, self.alphaRa160_9 = self.calcBEDROC(alpha=160.9)
self.bedroc8_0, self.alphaRa8_0 = self.calcBEDROC(alpha=8.0)
self.roc = self.calcROC()
self.rie = self.calcRIE()
self.auac = self.calcAUAC()
for pct, n_sampled_set, n_sampled_decoy_set, min_actives in zip(
self.percentage_active, active_sample_size_list,
active_sample_size_list_star, m_active_size_list):
ef_name = f'ef_{int(pct * 100)}'
ef_value = self.calcEF(n_sampled_set, min_actives)
setattr(self, ef_name, ef_value)
efs_name = f'efs_{int(pct * 100)}'
efs_value = self.calcEFStar(n_sampled_decoy_set, min_actives)
setattr(self, efs_name, efs_value)
efp_name = f'efp_{int(pct * 100)}'
efp_value = self.calcEFP(n_sampled_decoy_set, min_actives)
setattr(self, efp_name, efp_value)
fod_stat_name = f"fod_{int(pct * 100)}"
fod_stat_value = self.calcFOD(pct)
setattr(self, fod_stat_name, fod_stat_value)
for pct, n_sampled_set, n_sampled_decoy_set in zip(
self.percentage_ligand, m_ligand_size_list,
ligand_sample_size_list):
stat_name = f"actives_in_top_{int(pct * 100)}_pct"
stat_value = self.calcActivesInN(self.total_ligands * pct)
setattr(self, stat_name, stat_value)
pct_stat_name = f"pct_actives_in_top_{int(pct * 100)}_pct"
pct_stat_value = (100.0 * stat_value / self.total_actives)
setattr(self, pct_stat_name, pct_stat_value)
stat_name_star = f"actives_in_top_{int(pct * 100)}_pct_star"
stat_value_star = self.calcActivesInNStar(self.total_decoys * pct)
setattr(self, stat_name_star, stat_value_star)
pct_stat_name_star = f"pct_actives_in_top_{int(pct * 100)}_pct_star"
pct_stat_value_star = (100.0 * stat_value_star / self.total_actives)
setattr(self, pct_stat_name_star, pct_stat_value_star)
ef_pct_name = f"ef_{int(pct * 100)}pct"
ef_pct_value = self.calcEF(n_sampled_set)
setattr(self, ef_pct_name, ef_pct_value)
efs_pct_name = f"efs_{int(pct * 100)}pct"
efs_pct_value = self.calcEFStar(n_sampled_decoy_set)
setattr(self, efs_pct_name, efs_pct_value)
efp_pct_name = f"efp_{int(pct * 100)}pct"
efp_pct_value = self.calcEFP(n_sampled_decoy_set)
setattr(self, efp_pct_name, efp_pct_value)
eff_stat_name = f"eff_{int(pct * 100)}pct"
eff_stat_value = self.calcEFF(pct)
setattr(self, eff_stat_name, eff_stat_value)
def_name = f"def_{int(pct * 100)}pct"
def_value = self.calcDEF(n_sampled_set)
setattr(self, def_name, def_value)
defs_name = f"defs_{int(pct * 100)}pct"
defs_value = self.calcDEFStar(n_sampled_decoy_set)
setattr(self, defs_name, defs_value)
defp_name = f"defp_{int(pct * 100)}pct"
defp_value = self.calcDEFP(n_sampled_decoy_set)
setattr(self, defp_name, defp_value)
# Assign synomym attributes so clients of the deprecated
# glideanalysis.Enrichment class are less affected by the change.
old_ef_name = f"ef_{int(pct * 100)}pct"
new_ef_name = f"efd_{int(pct * 100)}"
new_ef_value = getattr(self, old_ef_name)
setattr(self, new_ef_name, new_ef_value)
old_ait_name = f"actives_in_top_{int(pct * 100)}_pct"
new_ait_name = f"actives_in_top_{int(pct * 100)}_prcnt"
new_ait_value = getattr(self, old_ait_name)
setattr(self, new_ait_name, new_ait_value)
[docs] def report(self, file_handle=sys.stdout, header="", footer=""):
"""
Prints text summary of results to the file_handle.
:param header: Header for the report.
:type header: str
:param footer: Footer for the report.
:type footer: str
:param file_handle: File handle-like object, default is sys.stdout.
:type file_handle: file
"""
out_fh = file_handle
results = self.results
if not isinstance(results, str):
results = 'Structure list (%d items)' % len(self.results)
line = '=' * 80
out_fh.write(line)
out_fh.write('\n')
line = """
Enrichment Report
-----------------
%s
Actives file: %s
Results: %s
Total actives: %d
Total ligands(actives+decoys): %d
Number of ranked actives: %d
BEDROC(alpha=160.9, alpha*Ra=%.4f): %.3f
BEDROC(alpha=20.0, alpha*Ra=%.4f): %.3f
BEDROC(alpha=8.0, alpha*Ra=%.4f): %.3f
ROC: %.2f
RIE: %.2f
Area under accumulation curve: %.2f
Ave. Number of outranking decoys: %d
Minimum Tc over all active pairs: %s
""" % (header, self.actives, results, self.total_actives,
self.total_ligands, len(
self.active_ranks), self.alphaRa160_9, self.bedroc160_9,
self.alphaRa20, self.bedroc20, self.alphaRa8_0, self.bedroc8_0,
self.roc, self.rie, self.auac, self.ave_num_outranking_decoys,
self.format(self.min_Tc_total_actives, 2))
out_fh.write(line)
line = """
Count and percentage of actives in top N%% of decoy results.
%% Decoys |%s|%s|%s|%s|%s|
# Actives |%s|%s|%s|%s|%s|
%% Actives |%s|%s|%s|%s|%s|
""" % ("%5s" % "1%", "%5s" % "2%", "%5s" % "5%", "%5s" % "10%",
"%5s" % "20%", "%5d" % self.actives_in_top_1_pct_star,
"%5d" % self.actives_in_top_2_pct_star,
"%5d" % self.actives_in_top_5_pct_star,
"%5d" % self.actives_in_top_10_pct_star,
"%5d" % self.actives_in_top_20_pct_star,
"%5.1f" % self.pct_actives_in_top_1_pct_star,
"%5.1f" % self.pct_actives_in_top_2_pct_star,
"%5.1f" % self.pct_actives_in_top_5_pct_star,
"%5.1f" % self.pct_actives_in_top_10_pct_star,
"%5.1f" % self.pct_actives_in_top_20_pct_star)
out_fh.write(line)
line = """
Count and percentage of actives in top N%% of results.
%% Results |%s|%s|%s|%s|%s|
# Actives |%s|%s|%s|%s|%s|
%% Actives |%s|%s|%s|%s|%s|
""" % ("%5s" % "1%", "%5s" % "2%", "%5s" % "5%", "%5s" % "10%",
"%5s" % "20%", "%5d" % self.actives_in_top_1_pct, "%5d" %
self.actives_in_top_2_pct, "%5d" % self.actives_in_top_5_pct,
"%5d" % self.actives_in_top_10_pct,
"%5d" % self.actives_in_top_20_pct,
"%5.1f" % self.pct_actives_in_top_1_pct,
"%5.1f" % self.pct_actives_in_top_2_pct,
"%5.1f" % self.pct_actives_in_top_5_pct,
"%5.1f" % self.pct_actives_in_top_10_pct,
"%5.1f" % self.pct_actives_in_top_20_pct)
out_fh.write(line)
line = """
Enrichment Factors with respect to N%% sample size.
%% Sample |%s|%s|%s|%s|%s|
EF |%s|%s|%s|%s|%s|
EF* |%s|%s|%s|%s|%s|
EF' |%s|%s|%s|%s|%s|
DEF |%s|%s|%s|%s|%s|
DEF* |%s|%s|%s|%s|%s|
DEF' |%s|%s|%s|%s|%s|
Eff |%s|%s|%s|%s|%s|
""" % (
"%7s" % "1%",
"%7s" % "2%",
"%7s" % "5%",
"%7s" % "10%",
"%7s" % "20%",
"%7s" % self.format(self.ef_1pct, self.ef_precision),
"%7s" % self.format(self.ef_2pct, self.ef_precision),
"%7s" % self.format(self.ef_5pct, self.ef_precision),
"%7s" % self.format(self.ef_10pct, self.ef_precision),
"%7s" % self.format(self.ef_20pct, self.ef_precision),
"%7s" % self.format(self.efs_1pct, self.efs_precision),
"%7s" % self.format(self.efs_2pct, self.efs_precision),
"%7s" % self.format(self.efs_5pct, self.efs_precision),
"%7s" % self.format(self.efs_10pct, self.efs_precision),
"%7s" % self.format(self.efs_20pct, self.efs_precision),
"%7s" % self.format(self.efp_1pct, self.efp_precision),
"%7s" % self.format(self.efp_2pct, self.efp_precision),
"%7s" % self.format(self.efp_5pct, self.efp_precision),
"%7s" % self.format(self.efp_10pct, self.efp_precision),
"%7s" % self.format(self.efp_20pct, self.efp_precision),
"%7s" % self.format(self.def_1pct, self.ef_precision),
"%7s" % self.format(self.def_2pct, self.ef_precision),
"%7s" % self.format(self.def_5pct, self.ef_precision),
"%7s" % self.format(self.def_10pct, self.ef_precision),
"%7s" % self.format(self.def_20pct, self.ef_precision),
"%7s" % self.format(self.defs_1pct, self.efs_precision),
"%7s" % self.format(self.defs_2pct, self.efs_precision),
"%7s" % self.format(self.defs_5pct, self.efs_precision),
"%7s" % self.format(self.defs_10pct, self.efs_precision),
"%7s" % self.format(self.defs_20pct, self.efs_precision),
"%7s" % self.format(self.defp_1pct, self.efp_precision),
"%7s" % self.format(self.defp_2pct, self.efp_precision),
"%7s" % self.format(self.defp_5pct, self.efp_precision),
"%7s" % self.format(self.defp_10pct, self.efp_precision),
"%7s" % self.format(self.defp_20pct, self.efp_precision),
"%7s" % self.format(self.eff_1pct, self.eff_precision),
"%7s" % self.format(self.eff_2pct, self.eff_precision),
"%7s" % self.format(self.eff_5pct, self.eff_precision),
"%7s" % self.format(self.eff_10pct, self.eff_precision),
"%7s" % self.format(self.eff_20pct, self.eff_precision),
)
out_fh.write(line)
line = """
Enrichment Factors with respect to N%% actives recovered.
%% Actives |%s|%s|%s|%s|%s|%s|%s|
EF |%s|%s|%s|%s|%s|%s|%s|
EF* |%s|%s|%s|%s|%s|%s|%s|
EF' |%s|%s|%s|%s|%s|%s|%s|
FOD |%s|%s|%s|%s|%s|%s|%s|
""" % (
"%7s" % "40%",
"%7s" % "50%",
"%7s" % "60%",
"%7s" % "70%",
"%7s" % "80%",
"%7s" % "90%",
"%7s" % "100%",
"%7s" % self.format(self.ef_40, self.ef_precision),
"%7s" % self.format(self.ef_50, self.ef_precision),
"%7s" % self.format(self.ef_60, self.ef_precision),
"%7s" % self.format(self.ef_70, self.ef_precision),
"%7s" % self.format(self.ef_80, self.ef_precision),
"%7s" % self.format(self.ef_90, self.ef_precision),
"%7s" % self.format(self.ef_100, self.ef_precision),
"%7s" % self.format(self.efs_40, self.efs_precision),
"%7s" % self.format(self.efs_50, self.efs_precision),
"%7s" % self.format(self.efs_60, self.efs_precision),
"%7s" % self.format(self.efs_70, self.efs_precision),
"%7s" % self.format(self.efs_80, self.efs_precision),
"%7s" % self.format(self.efs_90, self.efs_precision),
"%7s" % self.format(self.efs_100, self.efs_precision),
"%7s" % self.format(self.efp_40, self.efp_precision),
"%7s" % self.format(self.efp_50, self.efp_precision),
"%7s" % self.format(self.efp_60, self.efp_precision),
"%7s" % self.format(self.efp_70, self.efp_precision),
"%7s" % self.format(self.efp_80, self.efp_precision),
"%7s" % self.format(self.efp_90, self.efp_precision),
"%7s" % self.format(self.efp_100, self.efp_precision),
"%7s" % self.format(self.fod_40, self.fod_precision),
"%7s" % self.format(self.fod_50, self.fod_precision),
"%7s" % self.format(self.fod_60, self.fod_precision),
"%7s" % self.format(self.fod_70, self.fod_precision),
"%7s" % self.format(self.fod_80, self.fod_precision),
"%7s" % self.format(self.fod_90, self.fod_precision),
"%7s" % self.format(self.fod_100, self.fod_precision),
)
out_fh.write(line)
out_fh.write('\n')
out_fh.write('%s %s\n' % ('Rank'.ljust(6), 'Title'.ljust(73)))
out_fh.write('%s %s\n' % ('-' * 6, '-' * 73))
for rank in sorted(self.title_ranks):
title = self.title_ranks[rank]
out_fh.write('%s %s\n' % (str(rank).ljust(6), title.ljust(73)))
out_fh.write('\n')
if self.missing_active_titles:
out_fh.write("Missing ranks for:\n")
out_fh.write("\n".join(self.missing_active_titles))
out_fh.write('\n')
out_fh.write(footer)
out_fh.write('\n')
line = '=' * 80
out_fh.write(line)
out_fh.write('\n')
[docs] def getCsvRows(self):
"""
Return a list of two lists, the first inner list contains all metric
names, the other contains all corresponding metric values.
:return: a list of header and enrichment value tuples.
:rtype: list
"""
percentage_active = [int(pct * 100) for pct in self.percentage_active]
percentage_ligand = [int(pct * 100) for pct in self.percentage_ligand]
dict_keys = ("actives_in_top_pct", "pct_actives_in_top_pct",
"actives_in_top_pct_star", "pct_actives_in_top_pct_star",
"ef_pct", "efs_pct", "efp_pct", "def_pct", "defs_pct",
"defp_pct", "eff_pct", "ef_actives", "efs_actives",
"efp_actives", "fod_actives")
value_storage = defaultdict(list)
for pct in percentage_ligand:
value_storage[dict_keys[0]].append(
(f"# Actives {pct}%", getattr(self,
f"actives_in_top_{pct}_pct")))
value_storage[dict_keys[1]].append(
(f"% Actives {pct}%",
getattr(self, f"pct_actives_in_top_{pct}_pct")))
value_storage[dict_keys[2]].append(
(f"# Actives {pct}%*",
getattr(self, f"actives_in_top_{pct}_pct_star")))
value_storage[dict_keys[3]].append(
(f"% Actives {pct}%*",
getattr(self, f"pct_actives_in_top_{pct}_pct_star")))
value_storage[dict_keys[4]].append(
(f"EF {pct}%",
self.format(getattr(self, f"ef_{pct}pct"), self.ef_precision)))
value_storage[dict_keys[5]].append(
(f"EF* {pct}%",
self.format(getattr(self, f"efs_{pct}pct"),
self.efs_precision)))
value_storage[dict_keys[6]].append(
(f"EF' {pct}%",
self.format(getattr(self, f"efp_{pct}pct"),
self.efp_precision)))
value_storage[dict_keys[7]].append(
(f"DEF {pct}%",
self.format(getattr(self, f"def_{pct}pct"),
self.ef_precision)))
value_storage[dict_keys[8]].append(
(f"DEF* {pct}%",
self.format(getattr(self, f"defs_{pct}pct"),
self.efs_precision)))
value_storage[dict_keys[9]].append(
(f"DEF' {pct}%",
self.format(getattr(self, f"defp_{pct}pct"),
self.efp_precision)))
value_storage[dict_keys[10]].append(
(f"Eff {pct}%",
self.format(getattr(self, f"eff_{pct}pct"),
self.eff_precision)))
for pct in percentage_active:
value_storage[dict_keys[11]].append(
(f"EF {pct}% actives",
self.format(getattr(self, f"ef_{pct}"), self.ef_precision)))
value_storage[dict_keys[12]].append(
(f"EF* {pct}% actives",
self.format(getattr(self, f"efs_{pct}"), self.efs_precision)))
value_storage[dict_keys[13]].append(
(f"EF' {pct}% actives",
self.format(getattr(self, f"efp_{pct}"), self.efp_precision)))
value_storage[dict_keys[14]].append(
(f"FOD {pct}% actives",
self.format(getattr(self, f"fod_{pct}"), self.fod_precision)))
# yapf: disable
enrich_data = [('Actives file', self.actives),
('Results file', self.results),
('Total ligands(actives+decoys)', self.total_ligands),
('Total actives', self.total_actives),
('Number of ranked actives', len(self.active_ranks)),
('BEDROC(alpha=160.9)', self.bedroc160_9),
('BEDROC(alpha=160.9) alpha*Ra', self.alphaRa160_9),
('BEDROC(alpha=20.0)', self.bedroc20),
('BEDROC(alpha=20.0) alpha*Ra', self.alphaRa20),
('BEDROC(alpha=8.0)', self.bedroc8_0),
('BEDROC(alpha=8.0) alpha*Ra', self.alphaRa8_0),
('ROC', self.roc),
('RIE', self.rie),
('Area under accumulation curve', self.auac),
('Ave. Num. of outranking decoys',
self.ave_num_outranking_decoys)]
# yapf: enable
for k in dict_keys:
enrich_data.extend(value_storage[k])
rows = []
rows.append([item[0] for item in enrich_data])
rows.append([item[1] for item in enrich_data])
return rows