"""
This program computes the average correlation coefficient and
the kendall tau rank coefficient between experiment and prediction samples.
Samples are randomly drawn from gaussian distribution centered on each
experimental data point with given error.
The algorithm used in this program is based on the work by
Scott P. Brown, Steven W. Muchmore, Philip J. Hajduk
Drug Discovery Today, Vol. 14, No. 7-8., pp. 420-427
Copyright Schrodinger, LLC. All rights reserved.
"""
# Contributors: Byungchan Kim, Robert Abel, Lingle Wang, Teng Lin
import numpy
from scipy import stats
from dataclasses import dataclass
[docs]@dataclass
class ConfidenceInterval:
val: float
lower_bound: float
upper_bound: float
def __str__(self):
return f"{self.val:.2f}_{self.lower_bound:.2f}^{self.upper_bound:.2f}"
[docs]def predict_kendall_tau(experiment,
experiment_sigma=0.3,
prediction_sigma=0.3,
num_sample=1000):
"""
Computes the average Kendall tau rank correlation coefficient between
experiment and prediction samples. num_sample independent data for
each experiment and prediction are sampled from gaussian distribution
with experiment_sigma and prediction_sigma error.
:param experiment: sequence of experiment data
:param experiment_sigma: experimental error
:param prediction_sigma: prediction error
:param num_sample: number of samples
:return: average_tau, sigma_tau
"""
experiment_data, prediction_data = _prepare_exp_pre_sample(
experiment, experiment_sigma, prediction_sigma, num_sample)
# kendall tau test
kendall_tau = [
stats.kendalltau(exp, pre)[0]
for exp, pre in zip(experiment_data, prediction_data)
]
return numpy.average(kendall_tau), numpy.std(kendall_tau)
def _get_ci(data):
avg = numpy.average(data)
ci_25 = numpy.percentile(data, 2.5)
ci_975 = numpy.percentile(data, 97.5)
return ConfidenceInterval(avg, ci_25, ci_975)
[docs]def predict_expected_slope(experiment,
experiment_sigma=0.3,
prediction_sigma=0.3,
num_sample=1000):
# add random noise to experiment and prediction data
experiment_data, prediction_data = _prepare_exp_pre_sample(
experiment, experiment_sigma, prediction_sigma, num_sample)
slopes = [
numpy.polyfit(e, p, 1)[0]
for e, p in zip(experiment_data, prediction_data)
]
return _get_ci(slopes)
[docs]def predict_correlation(
experiment,
experiment_sigma=0.3,
prediction_sigma=0.3,
num_sample=1000,
return_ci=False,
):
"""
Computes the average correlation coefficient between experiment and
prediction samples. num_sample independent data for each experiment and
prediction are sampled from gaussian distribution with experiment_sigma
and prediction_sigma error.
:param experiment: sequence of experiment data
:param experiment_sigma: experimental error
:param prediction_sigma: prediction error
:param num_sample: number of samples
:param return_ci: If True, return the data in ConfidenceInterval format: <R>, <R^2>, <R^2_signed>
If False, return: <R>, sigma_R, <R^2>, sigma_R^2, <R^2_signed>, sigma_R^2_signed
"""
experiment_data, prediction_data = _prepare_exp_pre_sample(
experiment, experiment_sigma, prediction_sigma, num_sample)
# compute correlation matrix among experiment and prediction samples
correlation_matrix = numpy.corrcoef(experiment_data, prediction_data)
# extract correlation matrix only between experiment and prediction samples
exp_pre_correlation = correlation_matrix[:num_sample, num_sample:]
exp_pre_correlation_square = numpy.square(exp_pre_correlation)
exp_pre_abs_correlation = numpy.absolute(exp_pre_correlation)
exp_pre_signed_correlation = numpy.multiply(exp_pre_correlation,
exp_pre_abs_correlation)
if return_ci:
return (_get_ci(exp_pre_correlation),
_get_ci(exp_pre_correlation_square),
_get_ci(exp_pre_signed_correlation)) # yapf: disable
else:
return (numpy.average(exp_pre_correlation),
numpy.std(exp_pre_correlation),
numpy.average(exp_pre_correlation_square),
numpy.std(exp_pre_correlation_square),
numpy.average(exp_pre_signed_correlation),
numpy.std(exp_pre_signed_correlation))
def _prepare_exp_pre_sample(experiment,
experiment_sigma=0.3,
prediction_sigma=0.3,
num_sample=1000):
"""
:param experiment: sequence of experiment data
:param experiment_sigma: experimental error
:param prediction_sigma: prediction error
:param num_sample: number of samples
:return: experiment_sample, prediction_sample
"""
experiment = numpy.array(experiment)
# random numbers from normal distribution
numpy.random.seed(2017)
experiment_random = numpy.random.normal(0.0, experiment_sigma,
(num_sample, len(experiment)))
prediction_random = numpy.random.normal(0.0, prediction_sigma,
(num_sample, len(experiment)))
# add random noise to experiment data
experiment_sample = numpy.add(experiment, experiment_random)
prediction_sample = numpy.add(experiment, prediction_random)
return experiment_sample, prediction_sample
[docs]def compute_rmse(experiment, prediction):
"""
Computes root mean square error between experiment and prediction.
Averages of experiment and prediction are aligned before RMSE computation.
:param experiment: sequence of experiment data
:param prediction: sequence of prediction data
:return: root_mean_square_error between experiment and prediction
"""
experiment = numpy.array(experiment)
prediction = numpy.array(prediction)
experiment_average = numpy.average(experiment)
prediction_average = numpy.average(prediction)
offset = experiment_average - prediction_average
return numpy.std(prediction - experiment + offset)