import operator
import re
from past.utils import old_div
import numpy as np
from schrodinger import structure
from schrodinger.protein.analysis import Report as ProteinReport
from schrodinger.structutils import analyze
from schrodinger.structutils import measure
from schrodinger.structutils.interactions import hbond
from schrodinger.test import structurecheck
[docs]def partial_structure_reader(filename, frames_to_check=None):
"""
Iterator that returns the specified frames of the structures in
filename {structure} By default return all frames, but if frames_to_check
{ list, set or anything with an "__in__" method" of integers} is set
then the only frames for which their index (starting with 0) are in
frames to check are returned. Note that even if a list is passed with
integers out of order, the frames will always be yielded in the
order that they are in the file.
"""
if isinstance(frames_to_check, str):
frames_to_check = eval(frames_to_check)
if (type(frames_to_check) == int):
frames_to_check = [frames_to_check]
for ict, ct in enumerate(structure.StructureReader(filename)):
if (frames_to_check is not None and ict not in frames_to_check):
continue
yield ct
[docs]def convert_plop_name_to_asl(input_str):
"""
Convert a string specifying a residue or atom into an asl expresion
Input should be in the form <chain>:<resnum><inscode> or
<chain>:<resnum><inscore>:<pdb atom name>
<chain> is the chain id or a _ or no character if space
<resnum> is the residue number (possibly negative)
<inscode> is the pdb insertion code
<pdb atom name> The 4 letter pdb atom name. Either spaces or _ can be used
for padding to get to 4 characters. Examples A:12 :23A B:-1 _:12
The output is a ASL expression stored as a string that corresponds
to the same atom(s) in the input.
"""
atom_match = re.search(r"(.{0,1}):(-{0,1}\d+)([^:]{0,1}):(.+)", input_str)
res_match = re.search(r"(.{0,1}):(-{0,1}\d+)([^:]{0,1})", input_str)
if (atom_match):
chain = atom_match.group(1)
resnum = eval(atom_match.group(2))
inscode = atom_match.group(3)
pdbatom = atom_match.group(4)
pdbatom = pdbatom.replace("_", " ")
elif (res_match):
chain = res_match.group(1)
resnum = eval(res_match.group(2))
inscode = res_match.group(3)
pdbatom = None
else:
raise TypeError("Could not convert input: %s" % input_str)
if (chain == "_" or chain == ""):
chain = " "
if (inscode == ""):
inscode = " "
if (pdbatom is None):
return '(chain.name "%s" AND res.num %d AND res.inscode "%s")' % (
chain, resnum, inscode)
else:
return ('(chain.name "%s" AND res.num %d AND res.inscode "%s" AND ' +
'atom.ptype "%s")') % (chain, resnum, inscode, pdbatom)
[docs]def get_moved_residues(ct, ref_ct, tolerance):
"""
For a given ct {structure.Stucture} and ref_ct {structure.Structure}
with the same number of atoms in the same order, return a sorted list
with all the residue names (__str__() evaluation of their associated
structure._Residue class) that contains atoms that moved more than
a tolerance {float} in Angstroms.
"""
if (len(ct.atom) != len(ref_ct.atom)):
raise RuntimeError('Input structures must have the same ' +
'number of atoms')
residues_moved = set()
for atom1, atom2 in zip(ct.atom, ref_ct.atom):
if (measure.measure_distance(atom1, atom2) > tolerance):
residues_moved.add(str(atom1.getResidue()))
return sorted(list(residues_moved))
[docs]def get_phi_psi(ct):
"""
Return a dictionary with the key being the name of each residues and
the value being a 2 member list containing the PHI and PSI angles
of that residue
"""
output = {}
rep = ProteinReport(ct, ['BACKBONE DIHEDRALS']).data[0]
for point in rep.points:
if (point.descriptor == '-'):
continue
vec = [i for i in point.values]
resnum = point.descriptor.split()[1]
resid = point.descriptor.split(':')[0] + ':' + resnum
output[resid] = vec
return output
[docs]def get_modified_backbone_angles(ct, ref_ct, tolerance):
"""
For a given ct {structure.Stucture} and ref_ct {structure.Structure}
with the same number of atoms in the same order, return a sorted list
with all the residue names (__str__() evaluation of their associated
structure._Residue class) that contains atoms that moved either phi
or psi angles more than tolerance {float} in degrees
"""
phi_psi = get_phi_psi(ct)
ref_phi_psi = get_phi_psi(ref_ct)
changed = set()
for res in ref_phi_psi:
try:
delta_phi = min(abs(phi_psi[res][0] - ref_phi_psi[res][0]),
abs(phi_psi[res][0] - ref_phi_psi[res][0] - 360),
abs(phi_psi[res][0] - ref_phi_psi[res][0] + 360))
delta_psi = min(abs(phi_psi[res][1] - ref_phi_psi[res][1]),
abs(phi_psi[res][1] - ref_phi_psi[res][1] + 360),
abs(phi_psi[res][1] - ref_phi_psi[res][1] - 360))
except TypeError:
continue
if (delta_phi > tolerance or delta_psi > tolerance):
changed.add(res)
return list(changed)
[docs]def check_moved_residues(filename,
reference_fn,
moved_residues,
tolerance=0.01,
check_backbone_dihedrals=False,
frames_to_check=None):
"""
STU workup that will check to see if a list of moved residues { string
the residue names (__str__() evaluation of the structure._Residue class)
separated by spaces} corresponds to the list of all residue that have an
an atom that moved more than tolerance {float} between any of the
frames_to_check in filename (see partial_structure_reader) and the first
structure frame in reference_fn. Returns True if the lists match, False
if they dont
if check_bacbone_dihedrals{logical} is True then check use the residues
that either their phi or psi angle moved by more than tolerance degrees
are used instead of distance in cartesian space
"""
ref_ct = structure.Structure.read(reference_fn)
moved_residues_list = moved_residues.split(" ")
errors = []
for ct in partial_structure_reader(filename, frames_to_check):
if (check_backbone_dihedrals):
this_moved_residues_list = get_modified_backbone_angles(
ct, ref_ct, tolerance)
else:
this_moved_residues_list = get_moved_residues(ct, ref_ct, tolerance)
should_have_moved = set(moved_residues_list) - \
set(this_moved_residues_list)
should_not_have_moved = set(this_moved_residues_list) - \
set(moved_residues_list)
if (should_have_moved):
errors.append("ERROR: %s Residues did not moved %s" %
(ct.title, " ".join(list(should_have_moved))))
if (should_not_have_moved):
errors.append("ERROR: %s Residues moved %s" %
(ct.title, " ".join(list(should_not_have_moved))))
if errors:
raise AssertionError('\n'.join(errors))
return True
[docs]def check_protein_health(filename,
frames_to_check=None,
steric_delta=0.3,
bond_length_deviation=0.15,
protein_health_kwargs=None):
"""
Use the featurs in structurecheck to identify potential problems in
the structure. By default this will find any instances where
there is a steric clash of more than 30% of the vdW radius (set
by steric delta {float}) or any bond length deviations of more than
0.15 angstroms (set by bond_length_deviation {float})
The number of allowable clashes, bond length deviations and other
structual properties can be set by passing a dictionary into
protein_health_kwargs {dict} as decirbed in the help for
structurecheck.ProteinReportCheck.
frames_to_check is descirbed above
Note that protein_health_kwargs needs to have a dictionary passed to it
to be useful as a STU test. Currently STU does not allow this.
"""
if (protein_health_kwargs is None):
protein_health_kwargs = {'STERIC CLASHES': 0, 'BOND LENGTHS': 0}
for ct in partial_structure_reader(filename, frames_to_check):
prc = structurecheck.ProteinReportCheck(
ct,
steric_delta=steric_delta,
bond_length_deviation=bond_length_deviation)
prc.assertProteinHealth(ct.title, protein_health_kwargs)
return True
[docs]def measure_distance(ct1, name1, ct2, name2):
"""
For a given structure ct1 {structure.Structure} and the Plop residue or
atom name name1 {string, described convert_plop_name_to_asl}
and a second ct2 {structure.Structure}, name2 {string} pair following the
same convention, return the minimum distance between the two. If
either name matches no values in the structure then None is returned.
"""
# This could be either an atom name or a residue name
list1 = analyze.evaluate_asl(ct1, convert_plop_name_to_asl(name1))
list2 = analyze.evaluate_asl(ct2, convert_plop_name_to_asl(name2))
min_dist = None
for iatom1 in list1:
for iatom2 in list2:
one_dist = measure.measure_distance(ct1.atom[iatom1],
ct2.atom[iatom2])
if (min_dist is None or one_dist < min_dist):
min_dist = one_dist
return min_dist
[docs]def check_distance(filename,
name1,
name2,
distance,
tolerance,
frames_to_check=None):
"""
STU workup
For the specified structure frames in a filename {string} return True
if the minimum distance between the atom or residue name1 {string} is
distance {float} +- tolerance{float} Angstroms. Otherwise return false
(see partial_structrue_reader for description of frames_to check)
"""
errors = []
for ct in partial_structure_reader(filename, frames_to_check):
one_dist = measure_distance(ct, name1, ct, name2)
if (abs(one_dist - distance) > tolerance):
msg = "ERROR: %s Distance " % ct.title + \
"{}-{}({:f} A) != {:f} +/- {:f} A".format(name1, name2, one_dist,
distance, tolerance)
errors.append(msg)
if errors:
raise AssertionError('\n'.join(errors))
return True
[docs]def check_torsion(filename,
name1,
name2,
name3,
name4,
value,
tolerance=10,
frames_to_check=None):
"""
For the specified structure frames in a filename {string} return True if
the torsion between atoms with names name1-4 {string} is value {float} +-
tolerance{float} degrees. Otherwise return false (see
partial_structure_reader for description of frames_to check) The format for
name is <chain>:<residue number>:<pdb atom name> where a _ is replaced with
a space (note that the <pdb atom name> should be 4 characters and padded
with _ for spaces ie A:12:_CA_.
"""
errors = []
for ct in partial_structure_reader(filename, frames_to_check):
iatoms = []
for name in name1, name2, name3, name4:
alist = analyze.evaluate_asl(ct, convert_plop_name_to_asl(name))
if len(alist) != 1:
raise RuntimeError("%s in %s should match exactly 1 atom(%d)" %
(name, filename, len(alist)))
iatoms.append(alist[0])
one_tor = ct.measure(*iatoms)
diff = min(abs(one_tor - value), abs(one_tor - 360 - value),
abs(one_tor + 360 - value))
if diff > tolerance:
output = False
msg = "ERROR: %s Torsion " % ct.title + \
"%s(%d)-%s(%d)-%s(%d)-%s(%d)(%f deg) != %f +/- %f deg" % (
name1, iatoms[0], name2, iatoms[1], name3, iatoms[2],
name4, iatoms[3], one_tor, value, tolerance)
errors.append(msg)
if errors:
raise AssertionError('\n'.join(errors))
return True
[docs]def check_for_residue_list(filename, residues, frames_to_check=None):
"""
For the specified structure frames in a filename {string} return True if
the residues match the residues listed in residues {string, space seperated
list of residue names as given by the __str__ operator of
structure._Residue} then return True. (see partial_structrue_reader for
description of frames_to check)
"""
errors = []
input_residues = set(residues.split())
for ct in partial_structure_reader(filename, frames_to_check):
ct_residues = {str(res) for res in ct.residue}
for input_res in input_residues:
if (input_res not in ct_residues):
msg = f"{ct.title} has missing residue {input_res}"
errors.append(msg)
for ct_res in ct_residues:
if (ct_res not in input_residues):
msg = f"{ct.title} has extra residue {ct_res}"
errors.append(msg)
if errors:
raise AssertionError('\n'.join(errors))
return True
[docs]def check_structure_counts(filename,
atoms=None,
residues=None,
molecules=None,
frames_to_check=None):
"""
STU workup
For the specified structure frames in a filename {string} return True
if all contain the specified numbers of atoms{integer},
residues{integer}, and molecules{integer}, If any of these are not set
then that test is skipped. Otherwise return False.
(see partial_structrue_reader for description of frames_to check)
"""
errors = []
for ct in partial_structure_reader(filename, frames_to_check):
if (atoms is not None and len(ct.atom) != atoms):
msg = "%s # Atoms %d != %d" % (ct.title, len(ct.atom), atoms)
errors.append(msg)
if (residues is not None and len(ct.residue) != residues):
msg = "%s # Residues %d != %d" % (ct.title, len(
ct.residue), residues)
errors.append(msg)
if (molecules is not None and len(ct.molecule) != molecules):
msg = "%s # Molecules %d != %d" % (ct.title, len(
ct.molecule), molecules)
errors.append(msg)
if errors:
raise AssertionError('\n'.join(errors))
return True
[docs]def determine_truth_table(filename,
property,
reference_property,
cutoff=None,
reverse=False,
frames_to_check=None):
"""
Return a truth table as a dictionary with keys tp, tn, fp, fn & "no
prediction" with the number of frames where the prediction is a True
Positive, True Negative, False Positive, False Negative, or no prediction
was provided.
For an input filename{string} containing one or more structures readable by
StructureReader.
property{string} the property name of predicted value stored as ct-level
property filename
referecne property{string} the property name of true value stored as
ct-level property filename
By default the value in property will be converted to a boolean, however
the cutoff {variable that supports >=} is defined then a predicted value of
True will given whenever the value is greater than this cutoff.
If reverse is True then the predicted value will be reversed (this makes
the most sense when cutoff is also used)
frame_to_check is passed to partial_structure_reader
"""
truth_table = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0, 'no prediction': 0}
for ct in partial_structure_reader(filename, frames_to_check):
if (reference_property not in ct.property):
continue
reference = bool(ct.property[reference_property])
if (property not in ct.property):
truth_table['no prediction'] += 1
continue
if (cutoff is None):
pred = bool(ct.property[property])
else:
pred = (ct.property[property] >= cutoff)
if (reverse):
pred = not pred
if (pred and reference):
truth_table['tp'] += 1
elif (not pred and not reference):
truth_table['tn'] += 1
elif (pred and not reference):
truth_table['fp'] += 1
elif (not pred and reference):
truth_table['fn'] += 1
return truth_table
[docs]def check_prime_accuracy(min_accuracy,
filename,
property,
reference_property,
cutoff=None,
reverse=False,
frames_to_check=None):
"""
STU Workup that will check if the accuracy of a predictors is more than
some cutoff value and raise an assertion error otherwise usage:
:param min_accuracy: 0.0 - 1.0
:type min_accuracy: float
:param filename: path of file readable by StructureReader
:type filename: str
:param property: property name of predicted value stored as ct-level
property in the structures within filename
:type property: str
:param reference_property: property name of the reference value stored as
ct-level property filename
:type reference_property: str
:param cutoff: variable that supports >= on property
:type cutoff: int or float
:type reverse: bool
:param frame_to_check: indicies of frames within filename to process
:type frames_to_check: str
By default the value in property will be converted to a boolean, however
the cutoff {variable that supports >=} is defined then a predicted value of
True will given whenever the value is greater than this cutoff.
If reverse is True then the predicted value will be reversed (this makes
the most sense when cutoff is also used).
"""
truth_table = determine_truth_table(filename,
property,
reference_property,
cutoff=cutoff,
reverse=reverse,
frames_to_check=frames_to_check)
try:
accuracy = old_div(float(truth_table['tp'] + truth_table['tn']), \
(truth_table['tp'] + truth_table['tn'] +
truth_table['fp'] + truth_table['fn']))
except ZeroDivisionError:
raise AssertionError("No predictions to check")
if (accuracy < min_accuracy):
raise AssertionError(
"Accuracy %5.3f < %5.3f TP %d TN %d FP %d FN %d" %
(accuracy, min_accuracy, truth_table['tp'], truth_table['tn'],
truth_table['fp'], truth_table['fn']))
return True
[docs]def check_F1_score(min_F1,
filename,
property,
reference_property,
cutoff=None,
reverse=False,
frames_to_check=None):
"""
STU Workup that will check if the F1 Score of a predictors is more
than some cutoff value and raise an assertion error otherwise
:param min_f1: 0.0 -1.0
:type min_f1: float
:param filename: path of file readable by StructureReader
:type filename: str
:param property: property name of predicted value stored as ct-level
property in the structures within filename
:type property: str
:param reference_property: property name of the reference value stored as
ct-level property filename
:type reference_property: str
:param cufoff: cutoff variable that supports >= on property
:type cutoff: int or float
:param reverse: reverse
:type reverse: bool
:param frame_to_check: indicies of frames within filename to process
:type frames_to_check: str
By default the value in property will be converted to a boolean, however
the cutoff {variable that supports >=} is defined then a predicted value of
True will given whenever the value is greater than this cutoff.
If reverse is True then the predicted value will be reversed (this makes
the most sense when cutoff is also used)
"""
truth_table = determine_truth_table(filename,
property,
reference_property,
cutoff=cutoff,
reverse=reverse,
frames_to_check=frames_to_check)
try:
prec = 1.0 * truth_table['tp'] / (truth_table['tp'] + truth_table['fp'])
rec = 1.0 * truth_table['tp'] / (truth_table['tp'] + truth_table['fn'])
F1 = 2.0 * prec * rec / (prec + rec)
except ZeroDivisionError:
F1 = 0.0
if (F1 < min_F1):
raise AssertionError("F1Score %5.3f < %5.3f TP %d TN %d FP %d FN %d" %
(F1, min_F1, truth_table['tp'], truth_table['tn'],
truth_table['fp'], truth_table['fn']))
return True
[docs]def check_ct_property(filename,
property_name,
value,
tolerance="=",
frames_to_check=None):
"""
STU Workup
For a specified structure in a filename {string} return True
if the property named property_name{string} is equal to
value {any type}
If tolerance is ">", "<", ">=", "<=" or "in" than use that operation
instead of equals. Otherwise is tolerance is provided and not
in the above list than return True iff all frames are within
tolerance of value.
"""
errors = []
operator_map = {
'<': operator.lt,
'<=': operator.le,
'>': operator.gt,
'>=': operator.ge,
'=': operator.eq,
'==': operator.eq,
'!=': operator.ne,
'in': operator.contains
}
for ct in partial_structure_reader(filename, frames_to_check):
try:
prop_value = ct.property[property_name]
except KeyError:
msg = "Entry {} does not have property {}".format(
ct.title, property_name)
raise AssertionError(msg)
if tolerance in operator_map:
if (not operator_map[tolerance](prop_value, value)):
msg = "Entry {} property {} ({}) is NOT {} {}".format(
ct.title, property_name, prop_value, tolerance, value)
errors.append(msg)
else:
if (abs(prop_value - value) > tolerance):
msg = "Entry {} property {} ({}) is NOT == {} +/- {}".format(
ct.title, property_name, prop_value, value, tolerance)
errors.append(msg)
if errors:
raise AssertionError('\n'.join(errors))
return True
[docs]def check_ct_property_unique(filename,
property_name,
tolerance=0.00001,
frames_to_check=None):
"""
Raise an AssertionError if there are entries with duplicated values of a
given ct-level property.
:param filename: Maestro formated structure file to read
:type filename: str
:param property_name: ct-level property name. This should start with `r_`
or `i_`
:type tolerance: float
:param tolerance: Values within this tolerance are considered equal
:type frames_to_check: list of int
:param frames_to_check: check these frames in filename as described
in partial structure reader
"""
values = []
for ct in partial_structure_reader(filename, frames_to_check):
if (len(values) > 0):
closest = np.min(
np.abs(np.array(values) - ct.property[property_name]))
if (closest < tolerance):
raise AssertionError(
"Entry {} in {}".format(ct.title, filename) +
" property {}:{:f} is not unique".format(
property_name, ct.property[property_name]))
values.append(ct.property[property_name])
return True
[docs]def check_hbond(filename,
name1,
name2,
frames_to_check=None,
distance_max=2.5,
donor_angle=120,
acceptor_angle=90):
"""
Return True if the atom pair specified by `name1` and `name2` is a hydrogen
bond in every frame in a structure file.
For example, `check_hbond('some_file.maegz', 'A:1:H', 'A:4:O')`
:type frames_to_check: list(int)
:param frames_to_check: Only cts in `filename` for which their index
(starting with 0) are in `frames_to_check` are
returned.
For example, `check_hbond('some_file.maegz', 'A:1:H', 'A:4:O', "[0, 6, 2]")`
will check_hbonds in cts zero, six, and two. All selected cts must have the
defined hbond for this check to return True
If this argument is not set, then all cts in `filename` are checked.
distance_max, donor_angleh, acceptor_angle: See description in
schrodinger.structutils.interactions.hbond.match_hbond
"""
for ct in partial_structure_reader(filename, frames_to_check):
list1 = analyze.evaluate_asl(ct, convert_plop_name_to_asl(name1))
list2 = analyze.evaluate_asl(ct, convert_plop_name_to_asl(name2))
if len(list1) != 1:
error = "name {} in ct {} in filename {} matches more than one atom"
error = error.format(name1, ct.title, filename)
raise RuntimeError(error)
if len(list2) != 1:
error = "name {} in ct {} in filename {} matches more than one atom"
error = error.format(name2, ct.title, filename)
raise RuntimeError(error)
atom1 = ct.atom[list1[0]]
atom2 = ct.atom[list2[0]]
hbond_found = hbond.match_hbond(atom1, atom2, distance_max, donor_angle,
acceptor_angle)
if not hbond_found:
raise AssertionError('{} is not h-bonded to {} in structure {}'.\
format(name1, name2, ct.title))
return True
[docs]def check_position(filename,
name,
target_x,
target_y,
target_z,
tolerance,
frames_to_check=None):
"""
For the specified structures in a filename `string` return True if the
specified residue or atom identified by `name` is within `tolerance` of
the coordinate at `target_x, target_y, target_z`.
If a residue is given, then any atom within `tolerance` of the target
coordinates will return True.
The distance must be less than or equal to the `tolerance`.
For example::
check_position('some_file.maegz', 'A:1', 2.38, 0.91, 3.79, 1.0)
:param frames_to_check: list(int)
:type frames_to_check: Only cts in `filename` for which their index
(starting with 0) are in frames_to_check are
returned.
For example::
check_position('some_file.maegz', 'A:1', 2.38, 0.91, 3.79, 1.0,
"[0, 6, 2]")
will check the position of residue A:1 in cts zero, six, and two. All
selected cts must individually return True for check_position to return
True.
If this argument is not set, then all cts in `filename` are checked.
"""
tol_squared = tolerance**2
target_vector = np.array([target_x, target_y, target_z])
for ct in partial_structure_reader(filename, frames_to_check):
iatoms = analyze.evaluate_asl(ct, convert_plop_name_to_asl(name))
if len(iatoms) == 0:
error = "name {} in ct {} in filename {} matches nothing"
error = error.format(name, ct.title, filename)
raise RuntimeError(error)
atom_within_tol = False
for iatom in iatoms:
atom = ct.atom[iatom]
delta_vec = target_vector - atom.xyz
dist_squared = np.dot(delta_vec, delta_vec)
if dist_squared <= tol_squared:
atom_within_tol = True
break
if not atom_within_tol:
# Example full message:
# No atom in A:2:O was found within 2.00 A of
# (2.38, 0.91, 3.79) across all cts in dummyfile.maegz
msg = 'No atom in {} was found within {:.2f} A of '.\
format(name, tolerance)
msg += '({:.2f}, {:.2f}, {:.2f}) '.format(*target_vector)
if frames_to_check:
msg += f'in cts {frames_to_check} '
else:
msg += 'across all cts '
msg += f'in {filename}'
raise AssertionError(msg)
return True
[docs]def check_sequence(filename, chain, sequence):
"""
Check that the first structure in a file has a given 1-letter sequence
inputs:
:param filename: Maestro formatted file to check
:type filename: str
:param chain: Chain name to check
:type chain: str (string of length 1 character)
:param sequence: protein sequence to check for. The sequence given here
must match that in the structure file exactly to pass. Any
chain breaks (no matter the length in sequence space (ie
A:2 - A:3 or A:2 to A:103)) are represented by a "-"
:type sequence: str of upper-case single character protein codes
:return: True is returned if this criteria is met, An AssertionError is
raised if this criteria is not met.
:rtype: bool
"""
ct = structure.Structure.read(filename)
ct_sequence = ""
for one_chain in ct.chain:
if one_chain.name != chain:
continue
last_res = None
for res in structure.get_residues_by_connectivity(one_chain):
# Skip if this is an isolated residue
if len(res.atom[1].getMolecule().residue) == 1:
continue
# Add a "-" if there is a chain break
if last_res is not None and not last_res.isConnectedToResidue(res):
ct_sequence += "-"
ct_sequence += res.getCode()
last_res = res
if sequence != ct_sequence:
raise AssertionError("Provided Sequence: %s\nSequence in File : %s" %
(sequence, ct_sequence))
return True