Source code for schrodinger.application.desmond.correlation_tau

"""
This program computes the average correlation coefficient and
the kendall tau rank coefficient between experiment and prediction samples.
Samples are randomly drawn from gaussian distribution centered on each
experimental data point with given error.

The algorithm used in this program is based on the work by
Scott P. Brown, Steven W. Muchmore, Philip J. Hajduk
Drug Discovery Today, Vol. 14, No. 7-8., pp. 420-427

Copyright Schrodinger, LLC. All rights reserved.

"""
# Contributors: Byungchan Kim, Robert Abel, Lingle Wang, Teng Lin

import numpy
from scipy import stats
from dataclasses import dataclass


[docs]@dataclass
class ConfidenceInterval:
    val: float
    lower_bound: float
    upper_bound: float

    def __str__(self):
        return f"{self.val:.2f}_{self.lower_bound:.2f}^{self.upper_bound:.2f}"


[docs]def predict_kendall_tau(experiment,
                        experiment_sigma=0.3,
                        prediction_sigma=0.3,
                        num_sample=1000):
    """
    Computes the average Kendall tau rank correlation coefficient between
    experiment and prediction samples. num_sample independent data for
    each experiment and prediction are sampled from gaussian distribution
    with experiment_sigma and prediction_sigma error.

    :param experiment: sequence of experiment data
    :param experiment_sigma: experimental error
    :param prediction_sigma: prediction error
    :param num_sample: number of samples
    :return: average_tau, sigma_tau
    """

    experiment_data, prediction_data = _prepare_exp_pre_sample(
        experiment, experiment_sigma, prediction_sigma, num_sample)
    # kendall tau test
    kendall_tau = [
        stats.kendalltau(exp, pre)[0]
        for exp, pre in zip(experiment_data, prediction_data)
    ]
    return numpy.average(kendall_tau), numpy.std(kendall_tau)


def _get_ci(data):
    avg = numpy.average(data)
    ci_25 = numpy.percentile(data, 2.5)
    ci_975 = numpy.percentile(data, 97.5)
    return ConfidenceInterval(avg, ci_25, ci_975)


[docs]def predict_expected_slope(experiment,
                           experiment_sigma=0.3,
                           prediction_sigma=0.3,
                           num_sample=1000):
    # add random noise to experiment and prediction data
    experiment_data, prediction_data = _prepare_exp_pre_sample(
        experiment, experiment_sigma, prediction_sigma, num_sample)

    slopes = [
        numpy.polyfit(e, p, 1)[0]
        for e, p in zip(experiment_data, prediction_data)
    ]
    return _get_ci(slopes)


[docs]def predict_correlation(
    experiment,
    experiment_sigma=0.3,
    prediction_sigma=0.3,
    num_sample=1000,
    return_ci=False,
):
    """
    Computes the average correlation coefficient between experiment and
    prediction samples. num_sample independent data for each experiment and
    prediction are sampled from gaussian distribution with experiment_sigma
    and prediction_sigma error.

    :param experiment: sequence of experiment data
    :param experiment_sigma: experimental error
    :param prediction_sigma: prediction error
    :param num_sample: number of samples
    :param return_ci: If True, return the data in ConfidenceInterval format: <R>, <R^2>, <R^2_signed>
                      If False, return: <R>, sigma_R, <R^2>, sigma_R^2, <R^2_signed>, sigma_R^2_signed
    """

    experiment_data, prediction_data = _prepare_exp_pre_sample(
        experiment, experiment_sigma, prediction_sigma, num_sample)

    # compute correlation matrix among experiment and prediction samples
    correlation_matrix = numpy.corrcoef(experiment_data, prediction_data)

    # extract correlation matrix only between experiment and prediction samples
    exp_pre_correlation = correlation_matrix[:num_sample, num_sample:]
    exp_pre_correlation_square = numpy.square(exp_pre_correlation)

    exp_pre_abs_correlation = numpy.absolute(exp_pre_correlation)
    exp_pre_signed_correlation = numpy.multiply(exp_pre_correlation,
                                                exp_pre_abs_correlation)

    if return_ci:
        return (_get_ci(exp_pre_correlation),
               _get_ci(exp_pre_correlation_square),
               _get_ci(exp_pre_signed_correlation)) # yapf: disable

    else:
        return (numpy.average(exp_pre_correlation),
                numpy.std(exp_pre_correlation),
                numpy.average(exp_pre_correlation_square),
                numpy.std(exp_pre_correlation_square),
                numpy.average(exp_pre_signed_correlation),
                numpy.std(exp_pre_signed_correlation))


def _prepare_exp_pre_sample(experiment,
                            experiment_sigma=0.3,
                            prediction_sigma=0.3,
                            num_sample=1000):
    """
    :param experiment: sequence of experiment data
    :param experiment_sigma: experimental error
    :param prediction_sigma: prediction error
    :param num_sample: number of samples
    :return: experiment_sample, prediction_sample
    """

    experiment = numpy.array(experiment)

    # random numbers from normal distribution
    numpy.random.seed(2017)
    experiment_random = numpy.random.normal(0.0, experiment_sigma,
                                            (num_sample, len(experiment)))
    prediction_random = numpy.random.normal(0.0, prediction_sigma,
                                            (num_sample, len(experiment)))

    # add random noise to experiment data
    experiment_sample = numpy.add(experiment, experiment_random)
    prediction_sample = numpy.add(experiment, prediction_random)

    return experiment_sample, prediction_sample


[docs]def compute_rmse(experiment, prediction):
    """
    Computes root mean square error between experiment and prediction.
    Averages of experiment and prediction are aligned before RMSE computation.

    :param experiment: sequence of experiment data
    :param prediction: sequence of prediction data
    :return: root_mean_square_error between experiment and prediction
    """
    experiment = numpy.array(experiment)
    prediction = numpy.array(prediction)
    experiment_average = numpy.average(experiment)
    prediction_average = numpy.average(prediction)
    offset = experiment_average - prediction_average
    return numpy.std(prediction - experiment + offset)