Source code for schrodinger.analysis.enrichment.metrics

"""
Stand-alone functions for calculating metrics. The metrics include terms
such as Receiver Operator Characteristic area under the curve (ROC),
Enrichment Factors (EF), and Robust Initial Enhancement (RIE).

Copyright Schrodinger, LLC. All rights reserved.
"""

import itertools
import math
import textwrap

import scipy.stats

import schrodinger.utils.log as log

logger = log.get_output_logger(__file__)


def _check_positive_int(value, variable_name):
    """
    Raise a ValueError if value is not a positive integer.
    """
    if not isinstance(value, int) or (value <= 0):
        raise ValueError(f"{variable_name} must be a positive integer.")


def _check_none_or_positive_int(value, variable_name):
    """
    Raise a ValueError if value is not None or a positive integer.
    """
    if not isinstance(value, int) or (value <= 0):
        if value is not None:
            msg = f"{variable_name} must be None or a positive integer."
            raise ValueError(msg)


def _check_non_negative_int(value, variable_name):
    """
    Raise a ValueError if value is not a non-negative integer.
    """
    if not isinstance(value, int) or (value < 0):
        raise ValueError(f"{variable_name} must be a non-negative integer.")


def _check_non_empty_list(lst, lst_name):
    """
    Raise a ValueError if lst is an empty list.
    """
    if (not isinstance(lst, list)) or (len(lst) == 0):
        raise ValueError(f"{lst_name} must be a non-empty list.")


def _get_diversity_factor(fingerprint_comp, active_ranks, title_ranks,
                          n_actives_sampled_set):
    """
    df = (1- min_Tc_actives)/(1 - min_Tc_total_actives)
    O(N^2) complexity, N = total number of actives.

    :return: a float representation of the diversity factor.
    :rtype: float

    """
    active_ranks = active_ranks[:n_actives_sampled_set]

    active_fingerprint = fingerprint_comp.active_fingerprint
    fp_sim = fingerprint_comp.fp_sim
    min_Tc_total_actives = fingerprint_comp.min_Tc_total_actives

    # The highest possible value for minTc is 1
    min_Tc_actives = 1.0

    if not active_fingerprint:
        msg = "_get_diversity_factor: missing fingerprints. minTc set to 1."
        logger.debug(msg)
        return min_Tc_actives

    for i_rank, j_rank in itertools.combinations(active_ranks, 2):
        sim = fp_sim.calculateSimilarity(
            active_fingerprint[title_ranks[i_rank]],
            active_fingerprint[title_ranks[j_rank]])
        if sim < min_Tc_actives:
            min_Tc_actives = sim

    diverse_factor = (1 - min_Tc_actives) / (1 - min_Tc_total_actives)

    return diverse_factor


[docs]def get_active_sample_size_star(total_actives, total_ligands,
                                adjusted_active_ranks, fraction_of_actives):
    """
    The size of the decoy sample set required to recover the specified fraction
    of actives.  If there are fewer ranked actives than the requested fraction
    of all actives then the number of total_ligands is returned.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param adjusted_active_ranks: Modified active ranks; each rank is improved
                                  by the number of preceding actives. For
                                  example, a screen result that placed three
                                  actives as the first three ranks, [1, 2, 3],
                                  has adjusted ranks of [1, 1, 1]. In this way,
                                  actives are not penalized by being outranked
                                  by other actives.
    :type adjusted_active_ranks: list(int)

    :param fraction_of_actives: Decimal notation for the fraction of
                                sampled actives, used to determine
                                the sample set size.
    :type fraction_of_actives: float

    :return: The size of the decoy sample set required to recover the
             specified fraction of actives.
    :rtype: int

    """

    n_actives = int(round(total_actives * fraction_of_actives))

    n_sampled_set = None
    try:
        # The sampled set size is the rank of the active at the index
        # needed to recover n_actives -1, because it is zero-based,
        # plus the number of previously ranked decoys.
        n_sampled_set = \
            adjusted_active_ranks[n_actives - 1] + \
            len(adjusted_active_ranks[:n_actives - 2])
    except IndexError:
        n_sampled_set = total_ligands

    return n_sampled_set


[docs]def get_active_sample_size(total_actives, total_ligands, active_ranks,
                           fraction_of_actives):
    """
    The size of the sample set required to recover the specified fraction
    of actives.  If there are fewer ranked actives than the requested fraction
    of all actives then the number of total_ligands is returned.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param fraction_of_actives: Decimal notation for the fraction of
                                sampled actives, used to determine
                                the sample set size.
    :type fraction_of_actives: float

    :return: the size of the sample set required to recover the specified
             fraction of actives.
    :rtype: int

    """

    n_actives = math.ceil(total_actives * fraction_of_actives)
    n_sampled_set = None
    if n_actives > len(active_ranks):
        n_sampled_set = total_ligands
    else:
        for index, rank in enumerate(active_ranks):
            if index + 1 <= n_actives:
                n_sampled_set = rank

    return n_sampled_set


[docs]def get_decoy_sample_size(total_actives, total_ligands, active_ranks,
                          fraction_of_decoys):
    """
    Returns the size of the sample set required to recover the specified
    fraction of decoys.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param fraction_of_decoys: Decimal notation for the fraction of
                               sampled decoys, used to determine
                               the sample set size.
    :type fraction_of_decoys: float

    :return: Size of the sample set required to recover the specified
             fraction of decoys.
    :rtype: int
    """

    total_decoys = total_ligands - total_actives
    n_decoys = int(round(fraction_of_decoys * total_decoys))
    sampled_decoys = []
    size_max = int(round(total_ligands + 1))
    for rank in range(1, size_max):
        if rank not in active_ranks:
            sampled_decoys.append(rank)
        if len(sampled_decoys) == n_decoys:
            break

    n_sampled_set = 0
    if sampled_decoys:
        n_sampled_set = sampled_decoys[-1] - 1

    return n_sampled_set


[docs]def calc_ActivesInN(active_ranks, n_sampled_set):
    """
    Return the number of the known active ligands found in a given sample size.

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param n_sampled_set: The number of rank results for which to calculate
                          the metric. Every active with a rank less than or
                          equal to this value will be counted as found in
                          the set.
    :type n_sampled_set: int

    :return: the number of the known active ligands found in a given
             sample size.
    :rtype: int

    """

    return len([r for r in active_ranks if r <= n_sampled_set])


[docs]def calc_ActivesInNStar(adjusted_active_ranks, n_sampled_set):
    """
    Return the number of the known active ligands found in a given sample size.

    :param adjusted_active_ranks: Modified active ranks; each rank is improved
                                  by the number of preceding actives. For
                                  example, a screen result that placed three
                                  actives as the first three ranks, [1, 2, 3],
                                  has adjusted ranks of [1, 1, 1]. In this way,
                                  actives are not penalized by being outranked
                                  by other actives.
    :type adjusted_active_ranks: list(int)

    :param n_sampled_set: The number of rank results for which to calculate
                          the metric. Every active with a rank less than or
                          equal to this value will be counted as found in
                          the set.
    :type n_sampled_set: int

    :return: the number of the known active ligands found in a given
             sample size.
    :rtype: int

    """

    return len([r for r in adjusted_active_ranks if r <= n_sampled_set])


[docs]def calc_AveNumberOutrankingDecoys(active_ranks):
    """
    The rank of each active is adjusted by the number of outranking
    actives.  The number of outranking decoys is then defined as the
    adjusted rank of that active minus one.  The number of outranking
    decoys is calculated for each docked active and averaged.

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :return: the average number of decoys that outranked the actives.
    :rtype: float

    """

    # Note:
    # Only known actives that dock contribute to this number.
    # Experiments that dock one active with a high rank appear
    # dramatically better than experiments that dock all known actives
    # with so-so ranks.

    # Bind default value
    ave_num_outranking_decoys = None
    num_outranking_decoys = []
    for rank in active_ranks:
        # The number of decoy ranked higher is the actives rank minus 1,
        # and less the number of preceeding actives.
        num_outranking_decoys.append((rank - 1) - active_ranks.index(rank))

    if len(active_ranks) > 0:
        ave_num_outranking_decoys = \
            sum(num_outranking_decoys) / len(active_ranks)

    return ave_num_outranking_decoys


[docs]def calc_DEF(total_actives,
             total_ligands,
             active_ranks,
             title_ranks,
             fingerprint_comp,
             n_sampled_set,
             min_actives=None):
    """
    Diverse Enrichment Factor, calculated with respect to the number of
    total ligands.

    DEF is defined as::

                  1 - (min_similarity_among_actives_in_sampled_set)
      DEF = EF * --------------------------------------------------
                  1 - (min_similarity_among_all_actives)

    where 'n_sampled_set' is the number of *all* ranks in which
    to search for actives.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param title_ranks: *Unadjusted* integer rank keys for title. Not available
                       for table inputs, or other screen results that don't
                       list the title.
    :type title_ranks: dict(str, int)

    :param fingerprint_comp: Fingerprint component data class object
    :type fingerprint_comp: enrichment_input.FingerprintComponent

    :param n_sampled_set: The number of ranked decoy results for which to
                          calculate the enrichment factor.
    :type n_sampled_set: int

    :param min_actives: The number of actives that must be within the
                        n_sampled_set, otherwise the returned EF value
                        is None.
    :type min_actives: int

    :return: Diverse Enrichment Factor (DEF) for the given sample size of
             the screen results.  If fewer than min_actives are found
             in the set, or the calculation raises a ZeroDivisionError,
             the returned value is None.
    :rtype: float

    """

    fp_gen = fingerprint_comp.fp_gen
    fp_sim = fingerprint_comp.fp_sim

    if not (fp_gen and fp_sim):
        logger.debug("calcDEF: fingerprint tools unavailable.")
        return None

    # Bind initial value.
    d_ef = calc_EF(total_actives, total_ligands, active_ranks, n_sampled_set,
                   min_actives)
    diverse_factor = None

    # Number of actives in sampled set
    n_actives_sampled_set = calc_ActivesInN(active_ranks, n_sampled_set)

    # Determine the diversity modifier.  Return early if the value
    # can't be calculated.
    if n_actives_sampled_set <= 1:
        diverse_factor = 0  # No ij sim calculation possible.
        logger.debug(
            "calcDEF %d actives in sample, no similarity to calculate." %
            n_actives_sampled_set)
        return None
    elif n_actives_sampled_set == 1 and total_actives == 1:
        diverse_factor = 1
    else:
        diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks,
                                               title_ranks,
                                               n_actives_sampled_set)

    # Finally, do d_ef calc.
    d_ef *= diverse_factor

    return d_ef


[docs]def calc_DEFStar(total_actives,
                 total_ligands,
                 active_ranks,
                 title_ranks,
                 fingerprint_comp,
                 n_sampled_decoy_set,
                 min_actives=None):
    """
    Here, Diverse EF* (DEF*) is defined as::

                       1 - (min_similarity_among_actives_in_sampled_set)
      DEF = EF_star * --------------------------------------------------
                            1 - (min_similarity_among_all_actives)

    where 'n_sampled_decoy_set' is the number of *decoy* ranks in
    which to search for actives.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param title_ranks: *Unadjusted* integer rank keys for title. Not available
                       for table inputs, or other screen results that don't
                       list the title.
    :type title_ranks: dict(str, int)

    :param fingerprint_comp: Fingerprint component data class object
    :type fingerprint_comp: enrichment_input.FingerprintComponent

    :param n_sampled_decoy_set: The number of ranked decoys for which to
                                calculate the enrichment factor.
    :type n_sampled_decoy_set: int

    :param min_actives: The number of actives that must be within the
                        n_sampled_decoy_set, otherwise the returned
                        EF value is None.
    :type min_actives: int

    :return: Diverse Enrichment Factor (DEF*) for the given sample size
             of the screen results, calculated with respect to the total
             decoys instead of the more traditional total ligands.
             If fewer than min_actives are found in the set the returned
             value is None.
    :rtype: float
    """

    fp_gen = fingerprint_comp.fp_gen
    fp_sim = fingerprint_comp.fp_sim

    if not (fp_gen and fp_sim):
        logger.debug("calcDEFStar: fingerprint tools unavailable.")
        return None

    # Bind initial value.
    defs = calc_EFStar(total_actives, total_ligands, active_ranks,
                       n_sampled_decoy_set, min_actives)

    n_actives_sampled_set = 0
    for rank in range(1, int(n_sampled_decoy_set + 2)):
        if rank in active_ranks:
            n_actives_sampled_set += 1

    # Determine the diversity modifier.  Return early if the value
    # can't be calculated.
    if n_actives_sampled_set <= 1:
        diverse_factor = 0  # No ij sim calculation possible.
        logger.debug(
            "calcDEFStar %d actives in sample, no similarity to calculate." %
            n_actives_sampled_set)
        return None
    elif n_actives_sampled_set == 1 and total_actives == 1:
        diverse_factor = 1
    else:
        diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks,
                                               title_ranks,
                                               n_actives_sampled_set)

    # Finally, do d_ef calc.
    defs *= diverse_factor

    return defs


[docs]def calc_DEFP(total_actives,
              total_ligands,
              active_ranks,
              title_ranks,
              fingerprint_comp,
              n_sampled_decoy_set,
              min_actives=None):
    """
    Diverse EF' (DEF') is defined as::

                   1 - (min_similarity_among_actives_in_sampled_set)
      DEF' = EF' * --------------------------------------------------
                        1 - (min_similarity_among_all_actives)

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param title_ranks: *Unadjusted* integer rank keys for title. Not available
                       for table inputs, or other screen results that don't
                       list the title.
    :type title_ranks: dict(str, int)

    :param fingerprint_comp: Fingerprint component data class object
    :type fingerprint_comp: enrichment_input.FingerprintComponent

    :param n_sampled_decoy_set: The number of ranked decoy results for
                                which to calculate the enrichment factor.
    :type n_sampled_decoy_set: int

    :param min_actives: The number of actives that must be within the
                        n_sampled_decoy_set, otherwise the returned
                        EF' value is None.
    :type min_actives: int

    :return: Diverse Enrichment Factor prime (DEF') for a given sample
             size. If fewer than min_actives are found in the set the
             returned value is None.
    :rtype: float

    """

    fp_gen = fingerprint_comp.fp_gen
    fp_sim = fingerprint_comp.fp_sim

    if not (fp_gen and fp_sim):
        logger.debug("calcDEFStar: fingerprint tools unavailable.")
        return None

    # Bind initial value.
    defp = calc_EFP(total_actives, total_ligands, active_ranks,
                    n_sampled_decoy_set, min_actives)

    n_actives_sampled_set = 0
    for rank in range(1, int(total_ligands) + 1):
        if rank in active_ranks:
            n_actives_sampled_set += 1
        if rank >= n_sampled_decoy_set + 1:
            break

    # Determine the diversity modifier.  Return early if the value
    # can't be calculated.
    if n_actives_sampled_set <= 1:
        diverse_factor = 0  # No ij sim calculation possible.
        logger.debug(
            "calcDEFStar %d actives in sample, no similarity to calculate." %
            n_actives_sampled_set)
        return None
    elif n_actives_sampled_set == 1 and total_actives == 1:
        diverse_factor = 1
    else:
        diverse_factor = _get_diversity_factor(fingerprint_comp, active_ranks,
                                               title_ranks,
                                               n_actives_sampled_set)

    defp *= diverse_factor

    return defp


[docs]def calc_EF(total_actives,
            total_ligands,
            active_ranks,
            n_sampled_set,
            min_actives=None):
    """
    Calculates the Enrichment factor (EF) for the given sample size of the
    screen results. If fewer than min_actives are found in the set,
    or the calculation raises a ZeroDivisionError, the returned value is None.

    EF is defined as::

            n_actives_in_sampled_set / n_sampled_set
      EF =  ----------------------------------------
                 total_actives / total_ligands

    where 'n_sampled_set' is the number of *all* ranks in which
    to search for actives.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param n_sampled_set: The number of ranked results for which to calculate
                          the enrichment factor.
    :type n_sampled_set: int

    :param min_actives: The number of actives that must be within the
                        n_sampled_set, otherwise the returned EF value is None.
    :type min_actives: int

    :return: enrichment factor
    :rtype: float

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_positive_int(n_sampled_set, "n_sampled_set")
    _check_non_empty_list(active_ranks, "active_ranks")
    _check_none_or_positive_int(min_actives, "min_actives")

    # Bind default value
    ef = None

    # Number of actives in sampled set.
    n_actives_sampled_set = calc_ActivesInN(active_ranks, n_sampled_set)

    # Sanity check this threshold. Return early if the value can't
    # be calculated.
    if min_actives and (min_actives > n_actives_sampled_set):
        logger.warning(
            "calc_EF: %d minimum actives, but only %d in the sample." %
            (min_actives, n_actives_sampled_set))
        return ef

    # Finally, do ef calc.
    ef = (n_actives_sampled_set / n_sampled_set) / \
         (total_actives / total_ligands)

    return ef


[docs]def calc_EFStar(total_actives,
                total_ligands,
                active_ranks,
                n_sampled_decoy_set,
                min_actives=None):
    """
    Calculate the Enrichment factor* (EF*) for the given sample size of
    the screen results, calculated with respect to the total decoys instead of
    the more traditional total ligands. If fewer than min_actives are found
    in the set the returned value is None.

    Here, EF* is defined as::

             n_actives_in_sampled_set / n_sampled_decoy_set
      EF* =  ----------------------------------------------
                  total_actives / total_decoys

    where 'n_sampled_decoy_set' is the number of *decoy* ranks in
    which to search for actives.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param n_sampled_decoy_set: The number of ranked decoys
                                for which to calculate the enrichment factor.
    :type n_sampled_decoy_set: int

    :param min_actives: The number of actives that must be within the
                        n_sampled_decoy_set, otherwise the returned EF value
                        is None.
    :type min_actives: int

    :return: enrichment factor*
    :rtype: float

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_non_empty_list(active_ranks, "active_ranks")
    _check_non_negative_int(n_sampled_decoy_set, "n_sampled_decoy_set")
    _check_none_or_positive_int(min_actives, "min_actives")

    # Bind default value.
    ef = None
    total_decoys = total_ligands - total_actives
    if total_decoys <= 0:
        raise ValueError("total_ligands must be larger than total_actives.")

    n_decoys = 0.0
    n_actives = 0.0
    n_actives_sampled_set = 0.0
    for rank in range(1, int(n_sampled_decoy_set + 2)):
        if rank in active_ranks:
            n_actives += 1.0
        else:
            n_decoys += 1.0
    frac_active_found = n_actives / total_actives
    frac_decoy_found = n_decoys / total_decoys
    n_actives_sampled_set = n_actives

    if min_actives and (n_actives_sampled_set < min_actives):
        logger.warning(
            "calc_EFStar: %d minimum actives, but only %d in the sample." %
            (min_actives, n_actives_sampled_set))
        ef = None
    else:
        try:
            ef = frac_active_found / frac_decoy_found
        except ZeroDivisionError:
            logger.debug("calc_EFStar: caught ZeroDivisionError.")
            if frac_active_found:
                logger.debug("Actives found in the sampled set, return inf.")
                ef = float('inf')
            else:
                logger.debug("No actives in the sampled set, return None.")
                ef = None

    return ef


[docs]def calc_EFP(total_actives,
             total_ligands,
             active_ranks,
             n_sampled_decoy_set,
             min_actives=None):
    """
    Calculates modified enrichment factor defined using the average of the
    reciprocals of the EF* enrichment factors for recovering the first
    aa% of the known actives, Enrichment Factor prime (EF').

    EF'(x) will be larger than EF*(x) if the actives in question come
    relatively early in the sequence, and smaller if they come relatively
    late. If fewer than min_actives are found in the set the returned
    value is None.

    EF' is defined as::

                       n_actives_sampled_set
       EF' = --------------------------------------------
              cumulative_sum(frac. decoys/frac. actives)

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param n_sampled_decoy_set: The number of ranked decoys
                                for which to calculate the enrichment factor.
    :type n_sampled_decoy_set: int

    :param min_actives: The number of actives that must be within the
                        n_sampled_decoy_set, otherwise the returned EF value
                        is None.
    :type min_actives: int

    :return: enrichment factor prime
    :rtype: float

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_non_empty_list(active_ranks, "active_ranks")
    _check_non_negative_int(n_sampled_decoy_set, "n_sampled_decoy_set")
    _check_none_or_positive_int(min_actives, "min_actives")

    # Bind default value
    efp = None
    total_decoys = total_ligands - total_actives
    if total_decoys <= 0:
        raise ValueError("total_ligands must be larger than total_actives.")

    nkount = 0.0
    frac_actives_found = 0.0
    frac_decoys_found = 0.0
    ensum = 0.0
    n_actives = 0.0
    n_decoys = 0.0
    active_ranks_set = set(active_ranks)
    for rank in range(1, int(total_ligands) + 1):
        if rank in active_ranks_set:
            n_actives += 1
            frac_actives_found = n_actives / total_actives
        else:
            n_decoys += 1
            frac_decoys_found = n_decoys / total_decoys
        if frac_actives_found > 0 and frac_decoys_found > 0:
            nkount += 1
            ensum += frac_decoys_found / frac_actives_found
        if rank >= n_sampled_decoy_set + 1:
            break

    if n_actives == 0:
        logger.warning("calc_EFP: No actives found in the sample.")
        efp = None
    elif min_actives and (n_actives < min_actives):
        logger.warning(
            "calc_EFP: %d minimum actives, but only %d in the sample." %
            (min_actives, n_sampled_decoy_set))
        efp = None
    else:
        try:
            efp = nkount / ensum
        except ZeroDivisionError:
            logger.debug("calc_EFP: caught ZeroDivisionError.")
            logger.debug("Actives found in the sampled set, return inf.")
            efp = float('inf')

    return efp


[docs]def calc_FOD(total_actives, total_ligands, active_ranks, fraction_of_actives):
    r"""
    Calculates the average fraction of decoys outranking the given
    fraction, provided as a float, of known active ligands.
    The returned value is None if a) the calculation raises a
    ZeroDivisionError, or b) fraction_of_actives generates
    more actives than are ranked, or c) the fraction_of_actives
    is greater than 1.0

    FOD is defined as::

                           __
                 1         \    number_outranking_decoys_in_sampled_set
      FOD = -------------  /   ---------------------------------------
             num_actives   --         total_decoys

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param fraction_of_actives: Decimal notation of the fraction of sampled
                                actives, used to set the sampled set size.
    :type fraction_of_actives: float

    :return: Average fraction of outranking decoys.
    :rtype: float

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_non_empty_list(active_ranks, "active_ranks")

    # Bind default value
    fod = None
    total_decoys = total_ligands - total_actives
    n_actives = int(round(total_actives * fraction_of_actives))
    if total_decoys <= 0:
        raise ValueError("total_ligands must be larger than total_actives.")

    if (fraction_of_actives > 1.0) or (fraction_of_actives < 0.0):
        msg = textwrap.dedent("""
            calc_FOD: fraction_of_actives must be within [0, 1],
            setting FOD as None.
            """)
        logger.warning(msg)
        return fod
    if (n_actives == 0):
        logger.warning("calc_FOD: n_actives is 0, setting FOD as None.")
        return fod
    if (n_actives > len(active_ranks)):
        msg = textwrap.dedent("""
            calc_FOD: fraction_of_actives is generating more actives than
            ranked, setting FOD as None.
            """)
        logger.warning(msg)
        return fod

    # Create a list of the frac. of outranking decoys for each active rank.
    fractions = []
    for rank in active_ranks[0:n_actives]:
        # Number of decoys ranked higher is the rank of the active,
        # not counting itself (-1), and not count any preceeding actives
        # (index.(rank))
        num_outranking_decoys = float((rank - 1) - active_ranks.index(rank))
        fractions.append(num_outranking_decoys / total_decoys)

    # Finally, do FOD calc.
    fod = sum(fractions) / n_actives

    return fod


[docs]def calc_EFF(total_actives, total_ligands, adjusted_active_ranks,
             fraction_of_decoys):
    """
    Calculate efficiency in distinguishing actives from decoys (EFF)
    on an absolute scale of 1 (perfect; all actives come before any decoys)
    to -1 (all decoys come before any actives); a value of 0 means that
    actives and decoys were recovered at equal proportionate rates.
    The returned value is None if the calculation raises a ZeroDivisionError.

    EFF is defined as::

                         frac. actives in sample
      EFF = (2* -----------------------------------------------) - 1
                frac actives in sample + frac. decoys in sample

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param adjusted_active_ranks: Modified active ranks; each rank is improved
                                  by the number of preceding actives. For
                                  example, a screen result that placed three
                                  actives as the first three ranks, [1, 2, 3],
                                  has adjusted ranks of [1, 1, 1]. In this way,
                                  actives are not penalized by being outranked
                                  by other actives.
    :type adjusted_active_ranks: list(int)

    :param fraction_of_decoys: The size of the set is in terms of the number of
                               decoys in the screen. For example, given 1000
                               decoys and fraction_of_decoys = 0.20, actives
                               that appear within the first 200 ranks are
                               counted.
    :type fraction_of_decoys: float

    :return: Active recovery efficiency at a particular sample set size
    :rtype: float

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_non_empty_list(adjusted_active_ranks, "adjusted_active_ranks")

    # Bind default value
    eff = None
    total_decoys = total_ligands - total_actives
    if total_decoys <= 0:
        raise ValueError("total_ligands must be larger than total_actives.")

    # Determine fraction of database to assay.
    n_sampled_set = total_decoys * fraction_of_decoys

    # Number of actives in sampled set.
    n_actives_sampled_set = calc_ActivesInNStar(adjusted_active_ranks,
                                                n_sampled_set)

    frac_actives = n_actives_sampled_set / total_actives
    frac_decoys = n_sampled_set / total_decoys

    # Finally, do eff calc.
    try:
        eff = 2.0 * (frac_actives / (frac_actives + frac_decoys)) - 1
    except ZeroDivisionError:
        logger.warning("calc_EFF: caught ZeroDivisionError.")

    return eff


[docs]def calc_BEDROC(total_actives, total_ligands, active_ranks, alpha=20.0):
    """
    Boltzmann-enhanced Discrimination Receiver Operator Characteristic
    area under the curve.  The value is bounded between 1 and 0,
    with 1 being ideal screen performance.  The default alpha=20
    weights the first ~8% of screen results.  When alpha*Ra << 1,
    where Ra is the radio of total actives to total ligands, and
    alpha is the exponential prefactor, the BEDROC metric takes on
    a probabilistic meaning.  Calculated as described by Trunchon
    and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq 36.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param alpha: Exponential prefactor for adjusting early enrichment
                  emphasis. Larger values more heavily weight the early ranks.
                  alpha = 20 weights the first ~8% of the screen, alpha = 10
                  weights the first ~10% of the screen, alpha = 50 weights
                  the first ~3% of the screen results.
    :type alpha: float

    :return: a tuple of two floats, the first represents the area under
             the curve for the Boltzmann-enhanced discrimination of ROC
             (BEDROC) analysis, the second is the alpha*Ra term.
    :rtype: (float, float)

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_non_empty_list(active_ranks, "active_ranks")
    if alpha <= 0:
        raise ValueError("alpha must be greater than zero.")

    bedroc = None
    Ra = total_actives / total_ligands
    alphaRa = alpha * Ra

    rie = calc_RIE(total_actives, total_ligands, active_ranks, alpha)

    term_1 = None
    term_2 = None
    try:
        term_1 = Ra * math.sinh(alpha / 2.0) /\
        (math.cosh(alpha / 2.0) - math.cosh(alpha / 2.0 - Ra * alpha))
        term_2 = 1.0 / (1.0 - math.exp(alpha * (1.0 - Ra)))
        bedroc = rie * term_1 + term_2
    except ZeroDivisionError:
        raise ZeroDivisionError("unexpected ZeroDivisionError.")
    except OverflowError:
        raise OverflowError("unexpected OverflowError.")

    return (bedroc, alphaRa)


[docs]def calc_RIE(total_actives, total_ligands, active_ranks, alpha=20.0):
    """
    Robust Initial Enhancement (RIE). Active ranks are weighted with
    an continuously decreasing exponential term. Large positive
    RIE values indicate better screen performance. Calculated as
    described by Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47,
    488-508 Eq 18.

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param alpha: Exponential prefactor for adjusting early enrichment
                  emphasis. Larger values more heavily weight the early ranks.
                  alpha = 20 weights the first ~8% of the screen, alpha = 10
                  weights the first ~10% of the screen, alpha = 50 weights
                  the first ~3% of the screen results.
    :type alpha: float

    :return: a float for the Robust Initial Enhancement (RIE).
    :rtype: float

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_non_empty_list(active_ranks, "active_ranks")
    if alpha <= 0:
        raise ValueError("alpha must be greater than zero.")

    Ra = total_actives / total_ligands
    wght_sum = sum(
        [math.exp(-1 * alpha * r_i / total_ligands) for r_i in active_ranks])
    try:
        rie = wght_sum / (Ra * (1.0 - math.exp(-alpha)) /
                          (math.exp(alpha / total_ligands) - 1.0))
    except ZeroDivisionError:
        msg = textwrap.dedent("""
            calc_RIE: There were zero total actives for RIE calculation,
            setting RIE to zero.
            """)
        logger.warning(msg)
        rie = 0.0

    return rie


[docs]def calc_AUAC(total_actives, total_ligands, total_ranked, active_ranks):
    """
    Area Under the Accumulation Curve (AUAC). The value is bounded between
    1 and 0, with 1 being ideal screen performance. Calculated as described
    by Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq 8.
    (execept adjusted to a trapezoidal integration, to decrease errors
    for small data sets).

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param total_ranked: The total number of ligands ranked by the virtual
                         screen scoring metric.
    :type total_ranked: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :return: A float representation of the Area Under the Accumulation Curve.
    :rtype: float

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_positive_int(total_ranked, "total_ranked")
    _check_non_empty_list(active_ranks, "active_ranks")

    # Integrate stepwise over the ranked actives with curve terminating
    # at (Ntotal,Nranked_actives)
    area = sum([total_ligands - rank for rank in active_ranks])

    # Add trapezoidal correction
    area += len(active_ranks) / 2.0

    # Add correction for unranked actives, evenly distributed
    # after last ranked ligand. Curve now drawn to (Ntotal,Nactives)
    area += (total_actives - len(active_ranks)) * (total_ligands -
                                                   total_ranked) / 2.0

    # Normalize by the Ntotal*Nactives area to get a ratio:
    auac = area / (total_actives * total_ligands)

    return auac


[docs]def calc_ROC(total_actives, total_ligands, adjusted_active_ranks):
    """
    Calculates a representation of the Receiver Operator Characteristic
    area underneath the curve. Typically interpreted as the probability
    an active will appear before an inactive. A value of 1.0 reflects
    ideal performance, a value of 0.5 reflects a performance on par with
    random selection. Calculated as described by:
    Trunchon and Bayly, J. Chem. Inf. Model. 2007, 47, 488-508 Eq A.8

    Clasically ROC area is defined as::

                 AUAC     Ra
          ROC = ------ - -----
                  Ri      2Ri

    Where AUAC is the area under the accumulation curve, Ri is the ratio
    of inactives, Ra is the ratio of actives.

    A different method is used here in order to account for unranked
    actives - see PYTHON-3055 & PYTHON-3106

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :return: receiver operator characteristic area underneath the curve
    :rtype: float

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_non_empty_list(adjusted_active_ranks, "adjusted_active_ranks")

    total_decoys = total_ligands - total_actives
    if total_decoys <= 0:
        raise ValueError("total_ligands must be larger than total_actives.")

    # Integrate stepwise over the ranked actives with curve terminating at
    # (Ndecoys,Nranked_actives)
    area = sum(
        [total_decoys - adj_rank + 1 for adj_rank in adjusted_active_ranks])

    # Normalize by the Ndecoys*Nactives area
    roc = area / (total_decoys * total_actives)

    return roc


[docs]def calc_MWUROC(total_actives, total_ligands, active_ranks, alpha=0.05):
    """
    Here, the ROC AUC is based on the Mann-Whitney-Wilcoxon U. The U value
    is calculated directly::

        U = R - ((n_a(n_a+1))/2)
        # where n_a is the number of actives and R is the sum of their ranks.

        ROC AUC = ((n_a*n_i) - U)/(n_a*n_i)
        # n_a is the number of actives, n_i is the number of decoys.

        SE = sqrt((A(1-A) + (n_a-1)(Q - A^2) + (n_i -1)(q - A^2))/(n_a*n_i))

        CI = SE * scipy.stats.t.ppf((1+(1-alpha))/2.0, ((n_a+n_i)-1))

    :param total_actives: The total number of active ligands in the screen,
                          ranked and unranked.
    :type total_actives: int

    :param total_ligands: The total number of ligands (actives and unknowns/
                          decoys) used in the screen.
    :type total_ligands: int

    :param active_ranks: List of *unadjusted* integer ranks for the actives
                         found in the screen. For example, a screen result that
                         placed three actives as the first three ranks has an
                         active_ranks list of = [1, 2, 3].
    :type active_ranks: list(int)

    :param alpha: the signficance level.  Default is 0.05
                  (95% confidence interval)
    :type alpha: float

    :return: tuple of ROC AUC, the standard error, and estimated confidence
             interval (lower and upper bounds).
    :rtype: tuple(float)

    """

    _check_positive_int(total_actives, "total_actives")
    _check_positive_int(total_ligands, "total_ligands")
    _check_non_empty_list(active_ranks, "active_ranks")
    if (alpha <= 0) or (alpha >= 1):
        raise ValueError("alpha must be in (0, 1).")

    total_decoys = total_ligands - total_actives
    if total_decoys <= 0:
        raise ValueError("total_ligands must be larger than total_actives.")

    # see Greiner et al. Preventative Veterinary Medicine vol45,
    # 2000 pp23-41 eq C.1
    sum_active_ranks = sum(active_ranks)
    term_u = sum_active_ranks - (((total_actives) * (total_actives + 1)) / 2)
    mwu_roc_auc = 1 - (term_u / (total_actives * total_decoys))

    # Standard Error estimation as Hanley and McNeil Radiology
    # vol143, 1982, pp29-36.
    term_big_q = mwu_roc_auc / (2 - mwu_roc_auc)
    term_small_q = (2 * mwu_roc_auc**2) / (1 + mwu_roc_auc)
    variance = ((mwu_roc_auc * (1 - mwu_roc_auc)) +
                (total_actives - 1) * (term_big_q - mwu_roc_auc**2) +
                (total_decoys - 1) * (term_small_q - mwu_roc_auc**2)) /\
               (total_actives * total_decoys)
    mwu_roc_auc_se = math.sqrt(variance)
    try:
        # Confidence interval estimation.
        interval = mwu_roc_auc_se * scipy.stats.t.ppf(
            (1 + (1 - alpha)) / 2.0, ((total_actives + total_decoys) - 1))
        mwu_roc_auc_lower = mwu_roc_auc - interval
        mwu_roc_auc_upper = mwu_roc_auc + interval
    except ZeroDivisionError:
        msg = "ZeroDivisionError caught in scipy.stats.t.ppf"
        raise ZeroDivisionError(msg)
    except ValueError:
        msg = "ValueError caught in scipy.stats.t.ppf"
        raise ValueError(msg)

    return (mwu_roc_auc, mwu_roc_auc_se, mwu_roc_auc_lower, mwu_roc_auc_upper)