"""
Module with phase_hypo_refine option parsing and validation functionality.
Copyright Schrodinger LLC, All Rights Reserved.
"""
import argparse
import csv
import os
import tempfile
import zipfile
from schrodinger.application.phase.packages import conformer_reader
from schrodinger.application.phase.packages import phase_utils
from schrodinger.application.phase.packages.hypo_refine import project_utils
from schrodinger.infra import phase
from schrodinger.utils import cmdline
from schrodinger.utils import fileutils
# Program name reported by the argument parser.
PHASE_HYPO_REFINE = "phase_hypo_refine"

# Defaults for the -miss, -tol, -steps, -xvol and -grid refinement options.
DEFAULT_NUM_MISS = 1
DEFAULT_TOL_CHANGE = 1.0
DEFAULT_TOL_STEPS = 4
DEFAULT_XVOL_BUFFER = 2.0
DEFAULT_XVOL_SPACING = 2.0

# Defaults for the two BEDROC terms (-a1/-w1 and -a2/-w2).
DEFAULT_BEDROC_ALPHA1 = 160.9
DEFAULT_BEDROC_WEIGHT1 = 0.5
DEFAULT_BEDROC_ALPHA2 = 20.0
DEFAULT_BEDROC_WEIGHT2 = 0.5

# Types of files that may be supplied as <actives> or <decoys>:
LEGAL_ACTIVE_DECOY_FORMATS = [
    phase.PhpFileFormat_PHP_FORMAT_MAE,
    phase.PhpFileFormat_PHP_FORMAT_SD,
    phase.PhpFileFormat_PHP_FORMAT_PHZIP,
]
def add_project_creation_options(parser):
    """
    Adds project creation options to the provided parser.

    The options are placed in a "Project Creation Options" argument group.
    The group receives the options registered by
    conformer_reader.add_standard_confgen_options, followed by the
    -save_projects and -exit flags.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    project_options = parser.add_argument_group(
        title="Project Creation Options",
        description="These options control the creation of multi-conformer "
        "Phase projects from\nthe actives/decoys if those compounds are "
        "provided in Maestro or SD files.")
    conformer_reader.add_standard_confgen_options(project_options, False)
    project_options.add_argument(
        "-save_projects",
        action="store_true",
        help="Return a Zip archive of each created project. For example, if "
        "the active structures are provided in the file actives.maegz, the "
        "zipped project actives.phzip would be returned. This option is "
        "recommended if subsequent jobs are to be run with the same actives "
        "and decoys.")
    # The -exit/-save_projects dependency is not enforced here; it is checked
    # after parsing, when the argument values are validated.
    project_options.add_argument(
        "-exit",
        action="store_true",
        help="Exit after creating projects. Valid only with -save_projects.")
def add_refinement_options(parser):
    """
    Adds hypothesis refinement options to the provided parser.

    The options are placed in a "Refinement Options" argument group. Numeric
    options are range-checked at parse time through
    phase_utils.RestrictedRange objects passed as argparse "choices".

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    refinement_options = parser.add_argument_group(
        title="Refinement Options",
        description="These options provide control over modifications made to "
        "the geometry of the\nhypothesis, its matching rules, matching "
        "tolerances and excluded volumes. All\nrefinements are designed to "
        "increase w1*BEDROC1 + w2*BEDROC2, where the two\nBEDROC scores "
        "correspond to different early recognition parameters, alpha1\nand "
        "alpha2, respectively.")

    # Movement of hypothesis features.
    refinement_options.add_argument(
        "-move",
        type=int,
        metavar="<maxcycle>",
        choices=[phase_utils.RestrictedRange(0, None, True)],
        default=phase.DEFAULT_MAX_CYCLES,
        help="Maximum number of cycles in optimization that adjusts positions "
        "and orientations of hypothesis features to improve BEDROC scores. "
        "This requires replacement of the reference ligand with a set of "
        "fragments that can be moved independently. Since the fragment-based "
        "reference ligand fails to accurately represent the overall shape of "
        "the original reference ligand, the volume score weight is set to 0 "
        "for the entire refinement process, and when performing any subsequent "
        "screens with the hypothesis. Set <maxcycle> to 0 to disable movement "
        "of features (default: {}).".format(phase.DEFAULT_MAX_CYCLES))
    refinement_options.add_argument(
        "-frag",
        action="store_true",
        help="Convert reference ligand to fragments even if movement of "
        "features is disabled. This allows a fully consistent comparison of "
        "refinement results obtained with and without movement of features.")

    # Optional (partial) matching of features.
    refinement_options.add_argument(
        "-miss",
        type=int,
        metavar="<m>",
        choices=[phase_utils.RestrictedRange(0, None, True)],
        default=DEFAULT_NUM_MISS,
        help="Maximum number of hypothesis features for which matching is "
        "optional. This stage of refinement explores whether BEDROC scores "
        "can be improved by allowing certain features to be missed. Set to 0 "
        "to disable (default: {}).".format(DEFAULT_NUM_MISS))
    refinement_options.add_argument(
        "-ex",
        action="store_true",
        help="Exhaustively enumerate all partial matches when optional "
        "matching is in effect. This may or may not ultimately yield higher "
        "BEDROC scores, but it does improve the effectiveness of subsequently "
        "added excluded volumes by preventing the promotion of partially "
        "matched decoys which were not found prior to adding excluded volumes.")

    # Matching tolerance adjustment.
    refinement_options.add_argument(
        "-tol",
        type=float,
        metavar="<change>",
        choices=[phase_utils.RestrictedRange(0.0, None, True)],
        default=DEFAULT_TOL_CHANGE,
        help="Maximum amount, up or down, to adjust matching tolerances. Set "
        "to 0 to disable (default: {}).".format(DEFAULT_TOL_CHANGE))
    # NOTE(review): -steps uses the same range arguments as the options whose
    # help says "Set to 0 to disable", so 0 is presumably accepted here too.
    # Confirm that a zero step count is handled downstream.
    refinement_options.add_argument(
        "-steps",
        type=int,
        metavar="<n>",
        choices=[phase_utils.RestrictedRange(0, None, True)],
        default=DEFAULT_TOL_STEPS,
        help="Number of steps over which to apply maximum tolerance change. "
        "For example, if the maximum tolerance change is 1.0 and the number "
        "of steps is 4, each step will involve a tolerance adjustment of 0.25. "
        "(default: {}).".format(DEFAULT_TOL_STEPS))

    # Excluded volumes.
    refinement_options.add_argument(
        "-xvol",
        type=float,
        metavar="<dbuff>",
        choices=[phase_utils.RestrictedRange(0, None, True)],
        default=DEFAULT_XVOL_BUFFER,
        help="Use hits from the highest scoring hypothesis to identify "
        "locations for the placement of excluded volume spheres which clash "
        "with one or more decoys but not with any actives. A buffer distance "
        "of <dbuff> is enforced between the excluded volumes surface and the "
        "van der Waals surface of each active. Set to 0 to disable. "
        "(default: {}).".format(DEFAULT_XVOL_BUFFER))
    refinement_options.add_argument(
        "-grid",
        type=float,
        metavar="<spacing>",
        choices=[phase_utils.RestrictedRange(0, None, False)],
        default=DEFAULT_XVOL_SPACING,
        help="The spacing between adjacent excluded volume spheres and the "
        "radii of the spheres. (default: {}).".format(DEFAULT_XVOL_SPACING))

    # BEDROC parameters and weights.
    refinement_options.add_argument(
        "-a1",
        type=float,
        metavar="<alpha1>",
        choices=[phase_utils.RestrictedRange(0.0, None, False)],
        default=DEFAULT_BEDROC_ALPHA1,
        help="BEDROC early recognition parameter 1 "
        "(default: {}).".format(DEFAULT_BEDROC_ALPHA1))
    refinement_options.add_argument(
        "-w1",
        type=float,
        metavar="<weight1>",
        choices=[phase_utils.RestrictedRange(0.0, None, True)],
        default=DEFAULT_BEDROC_WEIGHT1,
        help="Weighting factor for BEDROC scores calculated using <alpha1> "
        "(default: {}).".format(DEFAULT_BEDROC_WEIGHT1))
    refinement_options.add_argument(
        "-a2",
        type=float,
        metavar="<alpha2>",
        choices=[phase_utils.RestrictedRange(0.0, None, False)],
        default=DEFAULT_BEDROC_ALPHA2,
        help="BEDROC early recognition parameter 2 "
        "(default: {}).".format(DEFAULT_BEDROC_ALPHA2))
    refinement_options.add_argument(
        "-w2",
        type=float,
        metavar="<weight2>",
        choices=[phase_utils.RestrictedRange(0.0, None, True)],
        default=DEFAULT_BEDROC_WEIGHT2,
        help="Weighting factor for BEDROC scores calculated using <alpha2> "
        "(default: {}).".format(DEFAULT_BEDROC_WEIGHT2))

    # Single-screen test mode.
    refinement_options.add_argument(
        "-testonly",
        action="store_true",
        help="Run a single screen against <actives> and <decoys> using "
        "the supplied hypothesis and report the weighted BEDROC score. This "
        "option may be used to validate a refined hypothesis if -valid wasn't "
        "used in the original job. May be combined with -frag in case the "
        "supplied hypothesis has not been refined, but you wish to simulate "
        "the baseline conditions of a refinement where movement is enabled.")
def get_parser():
    """
    Creates argparse.ArgumentParser with supported command line options.

    The parser defines the positional <hypo>, <actives> and <decoys>
    arguments, the -o and -valid options, the refinement and project creation
    option groups, the HOST/JOBNAME/TMPDIR job control options, and three
    hidden options (-subjob, -mask_screen, -tol_screen) whose help is
    suppressed.

    :return: Argument parser object
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        prog=PHASE_HYPO_REFINE,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("hypo",
                        metavar="<hypo>.phypo",
                        help="The pharmacophore hypothesis to be refined.")
    parser.add_argument(
        "actives",
        metavar="<actives>",
        help="Maestro file, SD file, or zipped Phase project (.phzip) "
        "containing the actives. If zipped project, all ligands in the project "
        "will be used, and the project feature definitions must be identical "
        "to those of the hypothesis.")
    parser.add_argument(
        "decoys",
        metavar="<decoys>",
        help=
        "Maestro file, SD file, or zipped Phase project containing the decoys. "
        "The same caveats noted for <actives> apply if zipped project.")
    parser.add_argument(
        "-o",
        metavar="<out>.phypo",
        help="Output file for refined hypothesis (default: <hypo>-out.phypo).")
    parser.add_argument(
        "-valid",
        metavar="<actives2>,<decoys2>",
        help="Validate refined hypothesis against actives and decoys that "
        "differ from those employed in the refinement. Use of this option is "
        "strongly recommended. See -testonly for an alternate means of "
        "validating.")

    add_refinement_options(parser)
    add_project_creation_options(parser)

    jobcontrol_options = [cmdline.HOST, cmdline.JOBNAME, cmdline.TMPDIR]
    cmdline.add_jobcontrol_options(parser, options=jobcontrol_options)

    # Hidden options for subjobs that do BEDROC screens:
    parser.add_argument("-subjob", help=argparse.SUPPRESS)
    parser.add_argument("-mask_screen",
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-tol_screen",
                        action="store_true",
                        help=argparse.SUPPRESS)
    return parser
def validate_actives_decoys(args):
    """
    Checks the validity of <actives>, <decoys>, and, if applicable, <actives2>
    and <decoys2>.

    Each file must have a legal format, a base name not shared with any other
    supplied file, and must exist on disk. Zipped projects are additionally
    checked with validate_project_sites and validate_project_fd. Checking
    stops at the first failure.

    :param args: argparse.Namespace with command line arguments
    :type args: argparse.Namespace
    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    to_screen = {"<actives>": args.actives, "<decoys>": args.decoys}

    valid = args.valid
    if valid is not None:
        # Parse with csv so that quoted file names containing commas survive.
        tokens = next(csv.reader([valid]))
        if len(tokens) != 2:
            mesg = "Invalid <actives2>,<decoys2> syntax: \"%s\"" % valid
            return False, mesg
        to_screen["<actives2>"] = tokens[0]
        to_screen["<decoys2>"] = tokens[1]

    # Ensure that file formats are legal and base names are unique. Base names
    # have to be unique so that we don't get collisions when creating projects,
    # (e.g., /path/to/actives.maegz --> actives.phzip), not to mention the fact
    # that someone might be trying to supply the same file twice.
    base_names = set()
    for key, value in to_screen.items():
        file_format = phase.get_phase_file_format(value)
        if file_format not in LEGAL_ACTIVE_DECOY_FORMATS:
            mesg = "Illegal %s file format: \"%s\"" % (key, value)
            return False, mesg

        base_name = fileutils.get_basename(value)
        if base_name in base_names:
            mesg = "All active/decoy files must have unique base names"
            return False, mesg
        base_names.add(base_name)

        # Still need proper paths for <actives2>,<decoys2>.
        file_path = phase_utils.get_proper_path(value)
        if not os.path.isfile(file_path):
            mesg = "%s file \"%s\" not found" % (key, file_path)
            return False, mesg

        if file_format == phase.PhpFileFormat_PHP_FORMAT_PHZIP:
            sites_ok, mesg = validate_project_sites(file_path)
            if not sites_ok:
                return False, mesg
            fd_ok, mesg = validate_project_fd(file_path, args.hypo)
            if not fd_ok:
                return False, mesg

    return True, ""
def validate_args(args):
    """
    Checks the validity of command line arguments.

    Checks run in a fixed order and stop at the first failure: the input
    hypothesis, the output hypothesis file name (if -o was given), the
    actives/decoys files, the refinement options, and the project creation
    options.

    :param args: argparse.Namespace with command line arguments
    :type args: argparse.Namespace
    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    hypo_ok, mesg = validate_hypo(args)
    if not hypo_ok:
        return False, mesg

    hypo_out = args.o
    if hypo_out is not None and not fileutils.is_hypothesis_file(hypo_out):
        mesg = "Illegal output hypothesis file name: \"%s\"" % hypo_out
        return False, mesg

    # The remaining validators share one signature; run them in order.
    remaining_checks = (
        validate_actives_decoys,
        validate_refinement_options,
        validate_project_creation_options,
    )
    for check in remaining_checks:
        check_ok, mesg = check(args)
        if not check_ok:
            return False, mesg

    return True, ""
def validate_hypo(args):
    """
    Checks the validity of hypothesis.

    The hypothesis file must have a legal hypothesis file name, exist on
    disk, and contain at least 3 features. The -miss and -tol values are
    also checked against the hypothesis when they are non-zero.

    :param args: argparse.Namespace with command line arguments
    :type args: argparse.Namespace
    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    if not fileutils.is_hypothesis_file(args.hypo):
        return False, "Illegal hypothesis file name: \"%s\"" % args.hypo
    if not os.path.isfile(args.hypo):
        return False, "Hypothesis file \"%s\" not found" % args.hypo

    hypo = phase.PhpHypoAdaptor(args.hypo)
    site_count = hypo.getSiteCount()
    if site_count < 3:
        return False, "Hypothesis must have at least 3 features"

    # Don't require the user to specify -miss 0 simply to avoid failing on
    # a 3-point hypothesis.
    if args.miss and site_count > 3:
        max_miss = site_count - 3
        if args.miss > max_miss:
            mesg = "Number of missed features cannot exceed %d" % max_miss
            return False, mesg

    if args.tol:
        min_tol = phase.PHASE_DEFAULT_TOL
        if hypo.hasTol():
            # Start above args.tol so that only an actual site tolerance can
            # pull the minimum down far enough to trigger the error below.
            min_tol = args.tol + 1.0
            for site in hypo.getHypoSites():
                min_tol = min(min_tol, site.getTol())
        if args.tol >= min_tol:
            mesg = "Maximum tolerance change must be less than %.2f" % min_tol
            return False, mesg

    return True, ""
def validate_project_creation_options(args):
    """
    Checks the validity of project creation options.

    Delegates to conformer_reader.validate_confgen_nddo first, then verifies
    that -exit is used only together with -save_projects.

    :param args: argparse.Namespace with command line arguments
    :type args: argparse.Namespace
    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    nddo_ok, mesg = conformer_reader.validate_confgen_nddo(args)
    if not nddo_ok:
        return False, mesg

    if args.exit and not args.save_projects:
        return False, "-exit is allowed only with -save_projects"

    return True, ""
def validate_project_fd(project_path, hypo_path):
    """
    Validates that the feature definitions in the provided zipped project
    are equivalent to those in the provided hypothesis.

    The project's feature definition file is extracted from the archive into
    a temporary directory, loaded, and compared with the hypothesis feature
    definitions. The temporary directory is removed on return.

    :param project_path: Path to zipped project file
    :type project_path: str
    :param hypo_path: Path to hypothesis file
    :type hypo_path: str
    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    fd_hypo = phase.PhpHypoAdaptor(hypo_path).getFd()

    fd_file_base = phase.getDefaultFdFileBaseName()
    fd_proj_zip, fd_proj_disk = project_utils.get_project_file_names(
        project_path, fd_file_base)

    with tempfile.TemporaryDirectory() as tmpdir_name:
        with zipfile.ZipFile(project_path, 'r') as zfile:
            zfile.extract(fd_proj_zip, tmpdir_name)
        fd_proj_tmpdir = os.path.join(tmpdir_name, fd_proj_disk)
        project = phase.PhpProject()
        fd_proj = project.getFeatureDef(fd_proj_tmpdir)
        # Compare while the extracted file still exists, in case the loaded
        # definitions keep a reference to it.
        if fd_proj != fd_hypo:
            mesg = "Hypothesis feature definitions differ from those in " + \
                "project \"%s\"" % project_path
            return False, mesg

    return True, ""
def validate_project_sites(project_path):
    """
    Validates that the provided zipped project contains pharmacophore sites.

    The project's Settings.dat member is read from the archive and searched
    for the text "step import". If that text is present, the project is
    reported as containing no pharmacophore sites.

    :param project_path: Path to zipped project file
    :type project_path: str
    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    settings_zip, _settings_disk = project_utils.get_project_file_names(
        project_path, "Settings.dat")

    with zipfile.ZipFile(project_path, 'r') as zfile:
        with zfile.open(settings_zip, 'r') as settings_file:
            settings_bytes = settings_file.read()

    # ZipFile.open() yields bytes. Search them directly instead of searching
    # the repr that str(bytes) produces, which also raises BytesWarning when
    # Python runs with -b.
    if b"step import" in settings_bytes:
        mesg = "Project \"{}\" contains no pharmacophore sites"
        return False, mesg.format(project_path)

    return True, ""
def validate_refinement_options(args):
    """
    Checks the validity of hypothesis refinement options.

    The only check made is that the two BEDROC weights, args.w1 and args.w2,
    do not sum to zero.

    :param args: argparse.Namespace with command line arguments
    :type args: argparse.Namespace
    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    if args.w1 + args.w2 == 0:
        return False, "Weights for BEDROC1 and BEDROC2 cannot both be 0"

    return True, ""