import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import copy
import csv
from collections import defaultdict
from io import StringIO
from typing import Dict
from typing import List
from typing import Optional
import numpy as np
from schrodinger.application.desmond import cms
from schrodinger.application.desmond import constants
from schrodinger.application.desmond.measurement import Measurement
from schrodinger.application.desmond.correlation_tau import ConfidenceInterval
from schrodinger.structure import Structure
from schrodinger.structutils import analyze
# Default number of sublimation jobs
NUM_SUBLIMATION_JOBS = 5
NUM_POLYMER_SOLVATION_JOBS = 20
OBVIOUS_SASA_CUTOFF = 3.0
# Marker that indicates structure is soluble.
SOLUBLE_MARKER = "< 5kcal/mol"
ParsedReport = Dict[str, Dict]
COMPOUND_NAME = 'Compound Name'
[docs]def get_transfer_free_energy(cms_model: cms.Cms) -> Optional[Measurement]:
"""
Returns the transfer free energy as a Measurement or None if the property
could not be found. Depending on the leg that the cmd corresponds to,
this could be sublimation, solvation, or hydration dGs.
:param cms_model: Input cms object to extract the transfer free energy from.
"""
for ct in cms_model.comp_ct:
transfer_free_ene = ct.property.get(constants.FEP_TRANSFER_FREE_ENERGY)
if transfer_free_ene:
return Measurement.from_string(transfer_free_ene)
[docs]def calculate_solubility_free_energy(
hydration_dg: Measurement, sublimation_dg: Measurement) -> Measurement:
"""
Returns the dissolution free energy.
NOTE:
- "Sublimation" leg actually simulates deposition (gas -> solid)
- Solvation/hydration legs simulate solvation (solid -> solvated)
- Dissolution ("solubility") free energy = Hydration - "Sublimation"
:param hydration_dg: Hydration transfer free energy measurement.
:param sublimation_dg: Sublimation transfer free energy measurement.
"""
return hydration_dg - sublimation_dg
[docs]def set_structure_soluble(ct: Structure) -> None:
"""
Update the fep solubility property to mark the structure as being soluble.
The input ct is modified in place.
:param ct: Structure to mark as soluble.
"""
ct.property[constants.FEP_SOLUBILITY] = SOLUBLE_MARKER
[docs]def is_structure_soluble(ct: Structure) -> bool:
"""
Return True if the structure has been marked as soluble, False otherwise.
:param ct: Structure to check if marked as soluble.
"""
return ct.property.get(constants.FEP_SOLUBILITY) == SOLUBLE_MARKER
[docs]def is_soluble(exposures: List[float]) -> bool:
"""
If more than 10% of the solute molecules have a exposure greater than 90.0,
flag the molecule as soluble (< 5 kcal/mol transfer energy).
:param exposures: List of exposure for each molecule in the structure.
:return: True if this molecule is considered soluble, False otherwise.
"""
exposed_mols = len([1 for score in exposures if score > 90.0])
percent_exposed = float(exposed_mols) / float(len(exposures)) * 100.0
return percent_exposed > 10.0
[docs]def get_molecule_exposures(baseline_ct: Structure,
exposure_ct: Structure,
delete_h_atoms=True) -> List[float]:
"""
Calculates the exposure of each molecule in a structure
based on SASA and returns a list of exposure of each molecule
by index of each molecule in the structure - 1 (since structure.molecule is a
1-indexed list). By default this functiion deletes all H atoms from the
specified structure. To retain H atoms set delete_h_atoms to False.
:param baseline_ct: Structure of an individual molecule used as the reference.
:param exposure_ct: Structure used for the list of molecule exposures.
:param delete_h_atoms: Set to True to delete all H atoms from
the structure when calculating SASA.
:return: List of exposure for each molecule in the structure.
"""
exposures = []
exposure_ct = exposure_ct.copy()
baseline_ct = baseline_ct.copy()
if delete_h_atoms:
for ct in [exposure_ct, baseline_ct]:
ct.deleteAtoms(analyze.evaluate_asl(ct, "atom.ele H"))
for mol in exposure_ct.molecule:
score = 0.0
mol_sasa = analyze.calculate_sasa(exposure_ct, mol.atom)
rel_mol_sasa = mol_sasa / len(mol.atom)
for atom in mol.atom:
# Get SASA of original exposure_ct atom for comparison
baseline_atom_idx = atom.index % baseline_ct.atom_total or baseline_ct.atom_total
baseline_atom = baseline_ct.atom[baseline_atom_idx]
baseline_sasa = analyze.calculate_sasa(baseline_ct, [baseline_atom])
if rel_mol_sasa < OBVIOUS_SASA_CUTOFF:
atom_sasa = 0.0
else:
atom_sasa = analyze.calculate_sasa(exposure_ct, [atom])
if atom_sasa > baseline_sasa:
score += 1.0
score = 100.0 * (score / float(len(mol.atom)))
exposures.append(score)
return exposures
[docs]def get_report(title: str,
hydration_dg: Optional[Measurement] = None,
sublimation_dgs: Optional[List[Measurement]] = None,
solvation_dgs: Optional[List[Measurement]] = None,
is_soluble: bool = False) -> str:
"""
Returns the formatted results as csv file contents.
:param title: Title of the ligand.
:param hydration_dg: Hydration transfer free energy measurement.
Must be specified if is_soluble is False.
:param sublimation_dgs: List of sublimation transfer free energy
measurements.
Must be specified if is_soluble is False.
:param solvation_dgs: List of solvation transfer free energy
measurements.
:param is_soluble: Set to True if this ligand
was determined to be soluble in the MD stage.
"""
from itertools import chain
if is_soluble:
line = f"{title},{SOLUBLE_MARKER}\n"
elif solvation_dgs:
bs_solvation_dg = bootstrapped_solvation_free_energy(solvation_dgs)
bs_str = f'{bs_solvation_dg.val},{bs_solvation_dg.lower_bound},{bs_solvation_dg.upper_bound},'
results = [f'{dg.val},{dg.unc}' for dg in solvation_dgs]
line = (f"{title}," + bs_str + ",".join(results) + "\n")
else:
if sublimation_dgs:
median_sublimation_dg = median_sublimation_free_energy(
sublimation_dgs)
dissolution_dg = calculate_solubility_free_energy(
hydration_dg, median_sublimation_dg)
else:
median_sublimation_dg = 'N/A+-N/A'
dissolution_dg = 'N/A+-N/A'
line = (f"{title}," + ",".join(
f"{str(d).replace('+-', ',')}"
for d in chain((dissolution_dg, hydration_dg,
median_sublimation_dg), sublimation_dgs)) + "\n")
return line
[docs]def parse_report(csv_contents: str) -> ParsedReport:
"""
Parse a report csv and return a `ParsedReport` mapping the ligand to the result.
Each result is a dictionary where the keys are the columns from the csv.
"""
data = {}
f = StringIO(csv_contents)
for row in csv.DictReader(f):
parsed_row = {}
for k, v in row.items():
if k is not None:
k = k.strip()
if v is not None:
v = v.strip()
if k == COMPOUND_NAME:
continue
if v == 'N/A':
v = None
if v is not None and v != SOLUBLE_MARKER:
v = float(v)
parsed_row[k] = v
data[row[COMPOUND_NAME]] = parsed_row
return data
[docs]def calculate_report_rmse(ref_data: ParsedReport,
test_data: ParsedReport) -> Dict[str, float]:
"""
Calculate the rsme between the parsed reference data and the test data.
Missing data is treated as infinite difference
Used in STU and SCIVAL tests.
:return: Dictionary where the keys are the column names and the values
are the rmse for that column.
"""
from schrodinger.application.scisol.packages.fep import fep_stats
result = {}
for col in list(ref_data.values())[0].keys():
if col == COMPOUND_NAME:
continue
ref_array = []
test_array = []
for compound in ref_data.keys():
ref_val = ref_data[compound][col]
# Handle soluble compounds
if ref_val == SOLUBLE_MARKER or ref_val is None:
ref_array.append(0.0)
# If both marked soluble, set as 0 difference
# otherwise set as inf difference.
if test_data[compound][col] == ref_val:
test_array.append(0.0)
else:
test_array.append(float('inf'))
continue
# Not marked soluble
ref_array.append(float(ref_val))
if col in test_data[compound]:
val = test_data[compound][col]
# Should not be marked soluble, if it is
# then give inf difference.
if val == SOLUBLE_MARKER or val is None:
test_array.append(float('inf'))
else:
test_array.append(float(val))
else:
# Treat missing data as inf difference
test_array.append(float('inf'))
_, rmse = fep_stats.calc_stats(
np.abs(np.array(ref_array) - np.array(test_array)))
result[col] = rmse.val
return result
[docs]def parse_output_structure_properties(sts: List[Structure]) -> ParsedReport:
"""
Parse the ct level solubility properties from report structures and return the
dictionary of {compound: {property: value}}. Missing values are set to None.
Used in STU and SCIVAL tests.
"""
FEP_SOLUBILITY_ERR = f'{constants.FEP_SOLUBILITY} err'
result = defaultdict(dict)
for st in sts:
dg_dissolution = st.property.get(constants.FEP_SOLUBILITY)
if dg_dissolution == SOLUBLE_MARKER:
result[st.title][constants.FEP_SOLUBILITY] = SOLUBLE_MARKER
result[st.title][FEP_SOLUBILITY_ERR] = None
result[st.title][constants.FEP_SOLUBILITY_MICROMOLAR] = None
continue
elif dg_dissolution is not None:
dg_dissolution = Measurement.from_string(dg_dissolution)
result[st.title][constants.FEP_SOLUBILITY] = dg_dissolution.val
result[st.title][FEP_SOLUBILITY_ERR] = dg_dissolution.unc
else:
result[st.title][constants.FEP_SOLUBILITY] = None
result[st.title][FEP_SOLUBILITY_ERR] = None
result[st.title][constants.FEP_SOLUBILITY_MICROMOLAR] = st.property.get(
constants.FEP_SOLUBILITY_MICROMOLAR)
return result
[docs]def bootstrapped_solvation_free_energy(
solvation_dgs: List[Measurement]) -> ConfidenceInterval:
data = np.array([dg.val for dg in solvation_dgs])
result = bs.bootstrap(data, stat_func=bs_stats.mean, is_pivotal=False)
return ConfidenceInterval(result.value, result.lower_bound,
result.upper_bound)