"""
Stand-alone functions for calculating metrics. The metrics include terms
such as Receiver Operator Characteristic area under the curve (ROC),
Enrichment Factors (EF), and Robust Initial Enhancement (RIE).
Copyright Schrodinger, LLC. All rights reserved.
"""
import itertools
import math
import textwrap
import schrodinger.utils.log as log
logger = log.get_output_logger(__file__)
def _check_positive_int(value, variable_name):
"""
Raise a ValueError if value is not a positive integer.
"""
if not isinstance(value, int) or (value <= 0):
raise ValueError(f"{variable_name} must be a positive integer.")
def _check_none_or_positive_int(value, variable_name):
"""
Raise a ValueError if value is not None or a positive integer.
"""
if not isinstance(value, int) or (value <= 0):
if value is not None:
msg = f"{variable_name} must be None or a positive integer."
raise ValueError(msg)
def _check_non_negative_int(value, variable_name):
"""
Raise a ValueError if value is not a non-negative integer.
"""
if not isinstance(value, int) or (value < 0):
raise ValueError(f"{variable_name} must be a non-negative integer.")
def _check_non_empty_list(lst, lst_name):
"""
Raise a ValueError if lst is an empty list.
"""
if (not isinstance(lst, list)) or (len(lst) == 0):
raise ValueError(f"{lst_name} must be a non-empty list.")
def _get_diversity_factor(fingerprint_comp, active_ranks, title_ranks,
n_actives_sampled_set):
"""
df = (1- min_Tc_actives)/(1 - min_Tc_total_actives)
O(N^2) complexity, N = total number of actives.
:return: a float representation of the diversity factor.
:rtype: float
"""
active_ranks = active_ranks[:n_actives_sampled_set]
active_fingerprint = fingerprint_comp.active_fingerprint
fp_sim = fingerprint_comp.fp_sim
min_Tc_total_actives = fingerprint_comp.min_Tc_total_actives
# The highest possible value for minTc is 1
min_Tc_actives = 1.0
if not active_fingerprint:
msg = "_get_diversity_factor: missing fingerprints. minTc set to 1."
logger.debug(msg)
return min_Tc_actives
for i_rank, j_rank in itertools.combinations(active_ranks, 2):
sim = fp_sim.calculateSimilarity(
active_fingerprint[title_ranks[i_rank]],
active_fingerprint[title_ranks[j_rank]])
if sim < min_Tc_actives:
min_Tc_actives = sim
diverse_factor = (1 - min_Tc_actives) / (1 - min_Tc_total_actives)
return diverse_factor
[docs]def get_active_sample_size_star(total_actives, total_ligands,
adjusted_active_ranks, fraction_of_actives):
"""
The size of the decoy sample set required to recover the specified fraction
of actives. If there are fewer ranked actives than the requested fraction
of all actives then the number of total_ligands is returned.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param adjusted_active_ranks: Modified active ranks; each rank is improved
by the number of preceding actives. For
example, a screen result that placed three
actives as the first three ranks, [1, 2, 3],
has adjusted ranks of [1, 1, 1]. In this way,
actives are not penalized by being outranked
by other actives.
:type adjusted_active_ranks: list(int)
:param fraction_of_actives: Decimal notation for the fraction of
sampled actives, used to determine
the sample set size.
:type fraction_of_actives: float
:return: The size of the decoy sample set required to recover the
specified fraction of actives.
:rtype: int
"""
n_actives = int(round(total_actives * fraction_of_actives))
n_sampled_set = None
try:
# The sampled set size is the rank of the active at the index
# needed to recover n_actives -1, because it is zero-based,
# plus the number of previously ranked decoys.
n_sampled_set = \
adjusted_active_ranks[n_actives - 1] + \
len(adjusted_active_ranks[:n_actives - 2])
except IndexError:
n_sampled_set = total_ligands
return n_sampled_set
[docs]def get_active_sample_size(total_actives, total_ligands, active_ranks,
fraction_of_actives):
"""
The size of the sample set required to recover the specified fraction
of actives. If there are fewer ranked actives than the requested fraction
of all actives then the number of total_ligands is returned.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param fraction_of_actives: Decimal notation for the fraction of
sampled actives, used to determine
the sample set size.
:type fraction_of_actives: float
:return: the size of the sample set required to recover the specified
fraction of actives.
:rtype: int
"""
n_actives = math.ceil(total_actives * fraction_of_actives)
n_sampled_set = None
if n_actives > len(active_ranks):
n_sampled_set = total_ligands
else:
for index, rank in enumerate(active_ranks):
if index + 1 <= n_actives:
n_sampled_set = rank
return n_sampled_set
[docs]def get_decoy_sample_size(total_actives, total_ligands, active_ranks,
fraction_of_decoys):
"""
Returns the size of the sample set required to recover the specified
fraction of decoys.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param fraction_of_decoys: Decimal notation for the fraction of
sampled decoys, used to determine
the sample set size.
:type fraction_of_decoys: float
:return: Size of the sample set required to recover the specified
fraction of decoys.
:rtype: int
"""
total_decoys = total_ligands - total_actives
n_decoys = int(round(fraction_of_decoys * total_decoys))
sampled_decoys = []
size_max = int(round(total_ligands + 1))
for rank in range(1, size_max):
if rank not in active_ranks:
sampled_decoys.append(rank)
if len(sampled_decoys) == n_decoys:
break
n_sampled_set = 0
if sampled_decoys:
n_sampled_set = sampled_decoys[-1] - 1
return n_sampled_set
[docs]def calc_ActivesInN(active_ranks, n_sampled_set):
"""
Return the number of the known active ligands found in a given sample size.
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param n_sampled_set: The number of rank results for which to calculate
the metric. Every active with a rank less than or
equal to this value will be counted as found in
the set.
:type n_sampled_set: int
:return: the number of the known active ligands found in a given
sample size.
:rtype: int
"""
return len([r for r in active_ranks if r <= n_sampled_set])
[docs]def calc_ActivesInNStar(adjusted_active_ranks, n_sampled_set):
"""
Return the number of the known active ligands found in a given sample size.
:param adjusted_active_ranks: Modified active ranks; each rank is improved
by the number of preceding actives. For
example, a screen result that placed three
actives as the first three ranks, [1, 2, 3],
has adjusted ranks of [1, 1, 1]. In this way,
actives are not penalized by being outranked
by other actives.
:type adjusted_active_ranks: list(int)
:param n_sampled_set: The number of rank results for which to calculate
the metric. Every active with a rank less than or
equal to this value will be counted as found in
the set.
:type n_sampled_set: int
:return: the number of the known active ligands found in a given
sample size.
:rtype: int
"""
return len([r for r in adjusted_active_ranks if r <= n_sampled_set])
[docs]def calc_AveNumberOutrankingDecoys(active_ranks):
"""
The rank of each active is adjusted by the number of outranking
actives. The number of outranking decoys is then defined as the
adjusted rank of that active minus one. The number of outranking
decoys is calculated for each docked active and averaged.
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:return: the average number of decoys that outranked the actives.
:rtype: float
"""
# Note:
# Only known actives that dock contribute to this number.
# Experiments that dock one active with a high rank appear
# dramatically better than experiments that dock all known actives
# with so-so ranks.
# Bind default value
ave_num_outranking_decoys = None
num_outranking_decoys = []
for rank in active_ranks:
# The number of decoy ranked higher is the actives rank minus 1,
# and less the number of preceeding actives.
num_outranking_decoys.append((rank - 1) - active_ranks.index(rank))
if len(active_ranks) > 0:
ave_num_outranking_decoys = \
sum(num_outranking_decoys) / len(active_ranks)
return ave_num_outranking_decoys
[docs]def calc_DEF(total_actives,
total_ligands,
active_ranks,
title_ranks,
fingerprint_comp,
n_sampled_set,
min_actives=None):
"""
Diverse Enrichment Factor, calculated with respect to the number of
total ligands.
DEF is defined as::
1 - (min_similarity_among_actives_in_sampled_set)
DEF = EF * --------------------------------------------------
1 - (min_similarity_among_all_actives)
where 'n_sampled_set' is the number of *all* ranks in which
to search for actives.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param title_ranks: *Unadjusted* integer rank keys for title. Not available
for table inputs, or other screen results that don't
list the title.
:type title_ranks: dict(str, int)
:param fingerprint_comp: Fingerprint component data class object
:type fingerprint_comp: enrichment_input.FingerprintComponent
:param n_sampled_set: The number of ranked decoy results for which to
calculate the enrichment factor.
:type n_sampled_set: int
:param min_actives: The number of actives that must be within the
n_sampled_set, otherwise the returned EF value
is None.
:type min_actives: int
:return: Diverse Enrichment Factor (DEF) for the given sample size of
the screen results. If fewer than min_actives are found
in the set, or the calculation raises a ZeroDivisionError,
the returned value is None.
:rtype: float
"""
fp_gen = fingerprint_comp.fp_gen
fp_sim = fingerprint_comp.fp_sim
if not (fp_gen and fp_sim):
logger.debug("calcDEF: fingerprint tools unavailable.")
return None
# Bind initial value.
d_ef = calc_EF(total_actives, total_ligands, active_ranks, n_sampled_set,
min_actives)
diverse_factor = None
# Number of actives in sampled set
n_actives_sampled_set = calc_ActivesInN(active_ranks, n_sampled_set)
# Determine the diversity modifier. Return early if the value
# can't be calculated.
if n_actives_sampled_set <= 1:
diverse_factor = 0 # No ij sim calculation possible.
logger.debug(
"calcDEF %d actives in sample, no similarity to calculate." %
n_actives_sampled_set)
return None
elif n_actives_sampled_set == 1 and total_actives == 1:
diverse_factor = 1
else:
diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks,
title_ranks,
n_actives_sampled_set)
# Finally, do d_ef calc.
d_ef *= diverse_factor
return d_ef
[docs]def calc_DEFStar(total_actives,
total_ligands,
active_ranks,
title_ranks,
fingerprint_comp,
n_sampled_decoy_set,
min_actives=None):
"""
Here, Diverse EF* (DEF*) is defined as::
1 - (min_similarity_among_actives_in_sampled_set)
DEF = EF_star * --------------------------------------------------
1 - (min_similarity_among_all_actives)
where 'n_sampled_decoy_set' is the number of *decoy* ranks in
which to search for actives.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param title_ranks: *Unadjusted* integer rank keys for title. Not available
for table inputs, or other screen results that don't
list the title.
:type title_ranks: dict(str, int)
:param fingerprint_comp: Fingerprint component data class object
:type fingerprint_comp: enrichment_input.FingerprintComponent
:param n_sampled_decoy_set: The number of ranked decoys for which to
calculate the enrichment factor.
:type n_sampled_decoy_set: int
:param min_actives: The number of actives that must be within the
n_sampled_decoy_set, otherwise the returned
EF value is None.
:type min_actives: int
:return: Diverse Enrichment Factor (DEF*) for the given sample size
of the screen results, calculated with respect to the total
decoys instead of the more traditional total ligands.
If fewer than min_actives are found in the set the returned
value is None.
:rtype: float
"""
fp_gen = fingerprint_comp.fp_gen
fp_sim = fingerprint_comp.fp_sim
if not (fp_gen and fp_sim):
logger.debug("calcDEFStar: fingerprint tools unavailable.")
return None
# Bind initial value.
defs = calc_EFStar(total_actives, total_ligands, active_ranks,
n_sampled_decoy_set, min_actives)
n_actives_sampled_set = 0
for rank in range(1, int(n_sampled_decoy_set + 2)):
if rank in active_ranks:
n_actives_sampled_set += 1
# Determine the diversity modifier. Return early if the value
# can't be calculated.
if n_actives_sampled_set <= 1:
diverse_factor = 0 # No ij sim calculation possible.
logger.debug(
"calcDEFStar %d actives in sample, no similarity to calculate." %
n_actives_sampled_set)
return None
elif n_actives_sampled_set == 1 and total_actives == 1:
diverse_factor = 1
else:
diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks,
title_ranks,
n_actives_sampled_set)
# Finally, do d_ef calc.
defs *= diverse_factor
return defs
[docs]def calc_DEFP(total_actives,
total_ligands,
active_ranks,
title_ranks,
fingerprint_comp,
n_sampled_decoy_set,
min_actives=None):
"""
Diverse EF' (DEF') is defined as::
1 - (min_similarity_among_actives_in_sampled_set)
DEF' = EF' * --------------------------------------------------
1 - (min_similarity_among_all_actives)
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param title_ranks: *Unadjusted* integer rank keys for title. Not available
for table inputs, or other screen results that don't
list the title.
:type title_ranks: dict(str, int)
:param fingerprint_comp: Fingerprint component data class object
:type fingerprint_comp: enrichment_input.FingerprintComponent
:param n_sampled_decoy_set: The number of ranked decoy results for
which to calculate the enrichment factor.
:type n_sampled_decoy_set: int
:param min_actives: The number of actives that must be within the
n_sampled_decoy_set, otherwise the returned
EF' value is None.
:type min_actives: int
:return: Diverse Enrichment Factor prime (DEF') for a given sample
size. If fewer than min_actives are found in the set the
returned value is None.
:rtype: float
"""
fp_gen = fingerprint_comp.fp_gen
fp_sim = fingerprint_comp.fp_sim
if not (fp_gen and fp_sim):
logger.debug("calcDEFStar: fingerprint tools unavailable.")
return None
# Bind initial value.
defp = calc_EFP(total_actives, total_ligands, active_ranks,
n_sampled_decoy_set, min_actives)
n_actives_sampled_set = 0
for rank in range(1, int(total_ligands) + 1):
if rank in active_ranks:
n_actives_sampled_set += 1
if rank >= n_sampled_decoy_set + 1:
break
# Determine the diversity modifier. Return early if the value
# can't be calculated.
if n_actives_sampled_set <= 1:
diverse_factor = 0 # No ij sim calculation possible.
logger.debug(
"calcDEFStar %d actives in sample, no similarity to calculate." %
n_actives_sampled_set)
return None
elif n_actives_sampled_set == 1 and total_actives == 1:
diverse_factor = 1
else:
diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks,
title_ranks,
n_actives_sampled_set)
defp *= diverse_factor
return defp
[docs]def calc_EF(total_actives,
total_ligands,
active_ranks,
n_sampled_set,
min_actives=None):
"""
Calculates the Enrichment factor (EF) for the given sample size of the
screen results. If fewer than min_actives are found in the set,
or the calculation raises a ZeroDivisionError, the returned value is None.
EF is defined as::
n_actives_in_sampled_set / n_sampled_set
EF = ----------------------------------------
total_actives / total_ligands
where 'n_sampled_set' is the number of *all* ranks in which
to search for actives.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param n_sampled_set: The number of ranked results for which to calculate
the enrichment factor.
:type n_sampled_set: int
:param min_actives: The number of actives that must be within the
n_sampled_set, otherwise the returned EF value is None.
:type min_actives: int
:return: enrichment factor
:rtype: float
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_positive_int(n_sampled_set, "n_sampled_set")
_check_non_empty_list(active_ranks, "active_ranks")
_check_none_or_positive_int(min_actives, "min_actives")
# Bind default value
ef = None
# Number of actives in sampled set.
n_actives_sampled_set = calc_ActivesInN(active_ranks, n_sampled_set)
# Sanity check this threshold. Return early if the value can't
# be calculated.
if min_actives and (min_actives > n_actives_sampled_set):
logger.warning(
"calc_EF: %d minimum actives, but only %d in the sample." %
(min_actives, n_actives_sampled_set))
return ef
# Finally, do ef calc.
ef = (n_actives_sampled_set / n_sampled_set) / \
(total_actives / total_ligands)
return ef
[docs]def calc_EFStar(total_actives,
total_ligands,
active_ranks,
n_sampled_decoy_set,
min_actives=None):
"""
Calculate the Enrichment factor* (EF*) for the given sample size of
the screen results, calculated with respect to the total decoys instead of
the more traditional total ligands. If fewer than min_actives are found
in the set the returned value is None.
Here, EF* is defined as::
n_actives_in_sampled_set / n_sampled_decoy_set
EF* = ----------------------------------------------
total_actives / total_decoys
where 'n_sampled_decoy_set' is the number of *decoy* ranks in
which to search for actives.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param n_sampled_decoy_set: The number of ranked decoys
for which to calculate the enrichment factor.
:type n_sampled_decoy_set: int
:param min_actives: The number of actives that must be within the
n_sampled_decoy_set, otherwise the returned EF value
is None.
:type min_actives: int
:return: enrichment factor*
:rtype: float
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_non_empty_list(active_ranks, "active_ranks")
_check_non_negative_int(n_sampled_decoy_set, "n_sampled_decoy_set")
_check_none_or_positive_int(min_actives, "min_actives")
# Bind default value.
ef = None
total_decoys = total_ligands - total_actives
if total_decoys <= 0:
raise ValueError("total_ligands must be larger than total_actives.")
n_decoys = 0.0
n_actives = 0.0
n_actives_sampled_set = 0.0
for rank in range(1, int(n_sampled_decoy_set + 2)):
if rank in active_ranks:
n_actives += 1.0
else:
n_decoys += 1.0
frac_active_found = n_actives / total_actives
frac_decoy_found = n_decoys / total_decoys
n_actives_sampled_set = n_actives
if min_actives and (n_actives_sampled_set < min_actives):
logger.warning(
"calc_EFStar: %d minimum actives, but only %d in the sample." %
(min_actives, n_actives_sampled_set))
ef = None
else:
try:
ef = frac_active_found / frac_decoy_found
except ZeroDivisionError:
logger.debug("calc_EFStar: caught ZeroDivisionError.")
if frac_active_found:
logger.debug("Actives found in the sampled set, return inf.")
ef = float('inf')
else:
logger.debug("No actives in the sampled set, return None.")
ef = None
return ef
[docs]def calc_EFP(total_actives,
total_ligands,
active_ranks,
n_sampled_decoy_set,
min_actives=None):
"""
Calculates modified enrichment factor defined using the average of the
reciprocals of the EF* enrichment factors for recovering the first
aa% of the known actives, Enrichment Factor prime (EF').
EF'(x) will be larger than EF*(x) if the actives in question come
relatively early in the sequence, and smaller if they come relatively
late. If fewer than min_actives are found in the set the returned
value is None.
EF' is defined as::
n_actives_sampled_set
EF' = --------------------------------------------
cumulative_sum(frac. decoys/frac. actives)
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param n_sampled_decoy_set: The number of ranked decoys
for which to calculate the enrichment factor.
:type n_sampled_decoy_set: int
:param min_actives: The number of actives that must be within the
n_sampled_decoy_set, otherwise the returned EF value
is None.
:type min_actives: int
:return: enrichment factor prime
:rtype: float
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_non_empty_list(active_ranks, "active_ranks")
_check_non_negative_int(n_sampled_decoy_set, "n_sampled_decoy_set")
_check_none_or_positive_int(min_actives, "min_actives")
# Bind default value
efp = None
total_decoys = total_ligands - total_actives
if total_decoys <= 0:
raise ValueError("total_ligands must be larger than total_actives.")
nkount = 0.0
frac_actives_found = 0.0
frac_decoys_found = 0.0
ensum = 0.0
n_actives = 0.0
n_decoys = 0.0
active_ranks_set = set(active_ranks)
for rank in range(1, int(total_ligands) + 1):
if rank in active_ranks_set:
n_actives += 1
frac_actives_found = n_actives / total_actives
else:
n_decoys += 1
frac_decoys_found = n_decoys / total_decoys
if frac_actives_found > 0 and frac_decoys_found > 0:
nkount += 1
ensum += frac_decoys_found / frac_actives_found
if rank >= n_sampled_decoy_set + 1:
break
if n_actives == 0:
logger.warning("calc_EFP: No actives found in the sample.")
efp = None
elif min_actives and (n_actives < min_actives):
logger.warning(
"calc_EFP: %d minimum actives, but only %d in the sample." %
(min_actives, n_sampled_decoy_set))
efp = None
else:
try:
efp = nkount / ensum
except ZeroDivisionError:
logger.debug("calc_EFP: caught ZeroDivisionError.")
logger.debug("Actives found in the sampled set, return inf.")
efp = float('inf')
return efp
[docs]def calc_FOD(total_actives, total_ligands, active_ranks, fraction_of_actives):
r"""
Calculates the average fraction of decoys outranking the given
fraction, provided as a float, of known active ligands.
The returned value is None if a) the calculation raises a
ZeroDivisionError, or b) fraction_of_actives generates
more actives than are ranked, or c) the fraction_of_actives
is greater than 1.0
FOD is defined as::
__
1 \ number_outranking_decoys_in_sampled_set
FOD = ------------- / ---------------------------------------
num_actives -- total_decoys
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param fraction_of_actives: Decimal notation of the fraction of sampled
actives, used to set the sampled set size.
:type fraction_of_actives: float
:return: Average fraction of outranking decoys.
:rtype: float
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_non_empty_list(active_ranks, "active_ranks")
# Bind default value
fod = None
total_decoys = total_ligands - total_actives
n_actives = int(round(total_actives * fraction_of_actives))
if total_decoys <= 0:
raise ValueError("total_ligands must be larger than total_actives.")
if (fraction_of_actives > 1.0) or (fraction_of_actives < 0.0):
msg = textwrap.dedent("""
calc_FOD: fraction_of_actives must be within [0, 1],
setting FOD as None.
""")
logger.warning(msg)
return fod
if (n_actives == 0):
logger.warning("calc_FOD: n_actives is 0, setting FOD as None.")
return fod
if (n_actives > len(active_ranks)):
msg = textwrap.dedent("""
calc_FOD: fraction_of_actives is generating more actives than
ranked, setting FOD as None.
""")
logger.warning(msg)
return fod
# Create a list of the frac. of outranking decoys for each active rank.
fractions = []
for rank in active_ranks[0:n_actives]:
# Number of decoys ranked higher is the rank of the active,
# not counting itself (-1), and not count any preceeding actives
# (index.(rank))
num_outranking_decoys = float((rank - 1) - active_ranks.index(rank))
fractions.append(num_outranking_decoys / total_decoys)
# Finally, do FOD calc.
fod = sum(fractions) / n_actives
return fod
[docs]def calc_EFF(total_actives, total_ligands, adjusted_active_ranks,
fraction_of_decoys):
"""
Calculate efficiency in distinguishing actives from decoys (EFF)
on an absolute scale of 1 (perfect; all actives come before any decoys)
to -1 (all decoys come before any actives); a value of 0 means that
actives and decoys were recovered at equal proportionate rates.
The returned value is None if the calculation raises a ZeroDivisionError.
EFF is defined as::
frac. actives in sample
EFF = (2* -----------------------------------------------) - 1
frac actives in sample + frac. decoys in sample
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param adjusted_active_ranks: Modified active ranks; each rank is improved
by the number of preceding actives. For
example, a screen result that placed three
actives as the first three ranks, [1, 2, 3],
has adjusted ranks of [1, 1, 1]. In this way,
actives are not penalized by being outranked
by other actives.
:type adjusted_active_ranks: list(int)
:param fraction_of_decoys: The size of the set is in terms of the number of
decoys in the screen. For example, given 1000
decoys and fraction_of_decoys = 0.20, actives
that appear within the first 200 ranks are
counted.
:type fraction_of_decoys: float
:return: Active recovery efficiency at a particular sample set size
:rtype: float
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_non_empty_list(adjusted_active_ranks, "adjusted_active_ranks")
# Bind default value
eff = None
total_decoys = total_ligands - total_actives
if total_decoys <= 0:
raise ValueError("total_ligands must be larger than total_actives.")
# Determine fraction of database to assay.
n_sampled_set = total_decoys * fraction_of_decoys
# Number of actives in sampled set.
n_actives_sampled_set = calc_ActivesInNStar(adjusted_active_ranks,
n_sampled_set)
frac_actives = n_actives_sampled_set / total_actives
frac_decoys = n_sampled_set / total_decoys
# Finally, do eff calc.
try:
eff = 2.0 * (frac_actives / (frac_actives + frac_decoys)) - 1
except ZeroDivisionError:
logger.warning("calc_EFF: caught ZeroDivisionError.")
return eff
[docs]def calc_BEDROC(total_actives, total_ligands, active_ranks, alpha=20.0):
"""
Boltzmann-enhanced Discrimination Receiver Operator Characteristic
area under the curve. The value is bounded between 1 and 0,
with 1 being ideal screen performance. The default alpha=20
weights the first ~8% of screen results. When alpha*Ra << 1,
where Ra is the radio of total actives to total ligands, and
alpha is the exponential prefactor, the BEDROC metric takes on
a probabilistic meaning. Calculated as described by Trunchon
and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq 36.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param alpha: Exponential prefactor for adjusting early enrichment
emphasis. Larger values more heavily weight the early ranks.
alpha = 20 weights the first ~8% of the screen, alpha = 10
weights the first ~10% of the screen, alpha = 50 weights
the first ~3% of the screen results.
:type alpha: float
:return: a tuple of two floats, the first represents the area under
the curve for the Boltzmann-enhanced discrimination of ROC
(BEDROC) analysis, the second is the alpha*Ra term.
:rtype: (float, float)
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_non_empty_list(active_ranks, "active_ranks")
if alpha <= 0:
raise ValueError("alpha must be greater than zero.")
bedroc = None
Ra = total_actives / total_ligands
alphaRa = alpha * Ra
rie = calc_RIE(total_actives, total_ligands, active_ranks, alpha)
term_1 = None
term_2 = None
try:
term_1 = Ra * math.sinh(alpha / 2.0) /\
(math.cosh(alpha / 2.0) - math.cosh(alpha / 2.0 - Ra * alpha))
term_2 = 1.0 / (1.0 - math.exp(alpha * (1.0 - Ra)))
bedroc = rie * term_1 + term_2
except ZeroDivisionError:
raise ZeroDivisionError("unexpected ZeroDivisionError.")
except OverflowError:
raise OverflowError("unexpected OverflowError.")
return (bedroc, alphaRa)
[docs]def calc_RIE(total_actives, total_ligands, active_ranks, alpha=20.0):
"""
Robust Initial Enhancement (RIE). Active ranks are weighted with
an continuously decreasing exponential term. Large positive
RIE values indicate better screen performance. Calculated as
described by Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47,
488-508 Eq 18.
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:param alpha: Exponential prefactor for adjusting early enrichment
emphasis. Larger values more heavily weight the early ranks.
alpha = 20 weights the first ~8% of the screen, alpha = 10
weights the first ~10% of the screen, alpha = 50 weights
the first ~3% of the screen results.
:type alpha: float
:return: a float for the Robust Initial Enhancement (RIE).
:rtype: float
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_non_empty_list(active_ranks, "active_ranks")
if alpha <= 0:
raise ValueError("alpha must be greater than zero.")
Ra = total_actives / total_ligands
wght_sum = sum(
[math.exp(-1 * alpha * r_i / total_ligands) for r_i in active_ranks])
try:
rie = wght_sum / (Ra * (1.0 - math.exp(-alpha)) /
(math.exp(alpha / total_ligands) - 1.0))
except ZeroDivisionError:
msg = textwrap.dedent("""
calc_RIE: There were zero total actives for RIE calculation,
setting RIE to zero.
""")
logger.warning(msg)
rie = 0.0
return rie
[docs]def calc_AUAC(total_actives, total_ligands, total_ranked, active_ranks):
"""
Area Under the Accumulation Curve (AUAC). The value is bounded between
1 and 0, with 1 being ideal screen performance. Calculated as described
by Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq 8.
(execept adjusted to a trapezoidal integration, to decrease errors
for small data sets).
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param total_ranked: The total number of ligands ranked by the virtual
screen scoring metric.
:type total_ranked: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:return: A float representation of the Area Under the Accumulation Curve.
:rtype: float
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_positive_int(total_ranked, "total_ranked")
_check_non_empty_list(active_ranks, "active_ranks")
# Integrate stepwise over the ranked actives with curve terminating
# at (Ntotal,Nranked_actives)
area = sum([total_ligands - rank for rank in active_ranks])
# Add trapezoidal correction
area += len(active_ranks) / 2.0
# Add correction for unranked actives, evenly distributed
# after last ranked ligand. Curve now drawn to (Ntotal,Nactives)
area += (total_actives - len(active_ranks)) * (total_ligands -
total_ranked) / 2.0
# Normalize by the Ntotal*Nactives area to get a ratio:
auac = area / (total_actives * total_ligands)
return auac
[docs]def calc_ROC(total_actives, total_ligands, adjusted_active_ranks):
"""
Calculates a representation of the Receiver Operator Characteristic
area underneath the curve. Typically interpreted as the probability
an active will appear before an inactive. A value of 1.0 reflects
ideal performance, a value of 0.5 reflects a performance on par with
random selection. Calculated as described by:
Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq A.8
Clasically ROC area is defined as::
AUAC Ra
ROC = ------ - -----
Ri 2Ri
Where AUAC is the area under the accumulation curve, Ri is the ratio
of inactives, Ra is the ratio of actives.
A different method is used here in order to account for unranked
actives - see PYTHON-3055 & PYTHON-3106
:param total_actives: The total number of active ligands in the screen,
ranked and unranked.
:type total_actives: int
:param total_ligands: The total number of ligands (actives and unknowns/
decoys) used in the screen.
:type total_ligands: int
:param active_ranks: List of *unadjusted* integer ranks for the actives
found in the screen. For example, a screen result that
placed three actives as the first three ranks has an
active_ranks list of = [1, 2, 3].
:type active_ranks: list(int)
:return: receiver operator characteristic area underneath the curve
:rtype: float
"""
_check_positive_int(total_actives, "total_actives")
_check_positive_int(total_ligands, "total_ligands")
_check_non_empty_list(adjusted_active_ranks, "adjusted_active_ranks")
total_decoys = total_ligands - total_actives
if total_decoys <= 0:
raise ValueError("total_ligands must be larger than total_actives.")
# Integrate stepwise over the ranked actives with curve terminating at
# (Ndecoys,Nranked_actives)
area = sum(
[total_decoys - adj_rank + 1 for adj_rank in adjusted_active_ranks])
# Normalize by the Ndecoys*Nactives area
roc = area / (total_decoys * total_actives)
return roc