# Source code for schrodinger.application.phase.packages.phase_screen_driver_utils

"""
Module with functionality used by phase_screen_driver.py.

Copyright Schrodinger LLC, All Rights Reserved.
"""

import argparse

import os
import tempfile
import zipfile
from enum import Enum

from schrodinger import structure
from schrodinger.application.phase.packages import conformer_reader
from schrodinger.infra import mm
from schrodinger.infra import phase
from schrodinger.job import jobcontrol
from schrodinger.utils import fileutils

from . import phase_screen_utils
from . import phase_utils

# Program name used as the subjob executable and argparse prog:
PHASE_SCREEN = "phase_screen"

# Types of files that may be supplied as <source>:
LEGAL_SOURCE_FORMATS = [
    phase.PhpFileFormat_PHP_FORMAT_MAE, phase.PhpFileFormat_PHP_FORMAT_SD,
    phase.PhpFileFormat_PHP_FORMAT_PHDB, phase.PhpFileFormat_PHP_FORMAT_LIST
]

# Types of files that may be supplied within a <source> .list file:
LEGAL_LIST_FORMATS = [
    phase.PhpFileFormat_PHP_FORMAT_MAE, phase.PhpFileFormat_PHP_FORMAT_SD,
    phase.PhpFileFormat_PHP_FORMAT_PHDB, phase.PhpFileFormat_PHP_FORMAT_PHZIP
]

# Types of files that may be supplied as <hypo>:
LEGAL_HYPO_FORMATS = [
    phase.PhpFileFormat_PHP_FORMAT_PHYPO, phase.PhpFileFormat_PHP_FORMAT_ZIP
]

# Broad categories of screening sources, used to branch the setup workflow:
SourceFormat = Enum("SourceFormat", "file database project")
# Maps each legal Phase file format to its SourceFormat category:
SOURCE_FORMAT_DESCRIPTIONS = {
    phase.PhpFileFormat_PHP_FORMAT_MAE: SourceFormat.file,
    phase.PhpFileFormat_PHP_FORMAT_SD: SourceFormat.file,
    phase.PhpFileFormat_PHP_FORMAT_PHDB: SourceFormat.database,
    phase.PhpFileFormat_PHP_FORMAT_PHZIP: SourceFormat.project
}


def add_hidden_options(parser):
    """
    Adds options that the user doesn't need to know about.

    :param parser: Argument parser object.
    :type parser: argparser.ArgumentParser
    """
    # Running as a subjob:
    parser.add_argument("-subjob", action="store_true", help=argparse.SUPPRESS)
    # A .list file of database subset files is being supplied for a subjob:
    parser.add_argument("-isub_list", help=argparse.SUPPRESS)
    # Check out a PHASE_ELEMENTS licence rather than PHASE_DBSEARCH.
    parser.add_argument("-elements",
                        action="store_true",
                        help=argparse.SUPPRESS)
    # Allow -sites as an alias for dbsites on.
    parser.add_argument("-sites", action="store_true", help=argparse.SUPPRESS)
    # Flag to create <jobname>.okay if job completes successfully.
    parser.add_argument("-okay", action="store_true", help=argparse.SUPPRESS)
    # These old options do nothing but allow them for the time being:
    parser.add_argument("-verbose",
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-NOCHECKPOINT",
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-NO_CHECKPOINT",
                        action="store_true",
                        help=argparse.SUPPRESS)
def add_jobcontrol_options(parser):
    """
    Adds job control options to the provided parser.

    :param parser: Argument parser object.
    :type parser: argparser.ArgumentParser
    """
    # Note that we are not using cmdline.add_jobcontrol_options because some
    # of the help messages aren't quite detailed enough.
    jc_options = parser.add_argument_group(title="Job Control Options")
    jc_options.add_argument(
        "-HOST",
        metavar="<host>[:<m>]",
        help="Run job on a remote host. Include \":<m>\" to split over <m> CPUS. "
        "Note that a multi-conformer Maestro/SD file cannot be divided over "
        "different CPUs, so <m> will be reduced accordingly if it exceeds the "
        "number of multi-conformer files. Note also that a Phase database must "
        "be accessible to the job host via the absolute path specified in "
        "<source>. By default, the job will run on a single CPU on localhost.")
    jc_options.add_argument(
        "-NJOBS",
        dest="njobs",
        type=int,
        metavar="<n>",
        help="Divide work over <n> subjobs, where <n> may be greater than the "
        "number of CPUs requested. Allows finer granularity of work units and "
        "shorter delays when a failed subjob has to be rerun.")
    jc_options.add_argument("-TMPDIR",
                            metavar="<dir>",
                            help="Store temporary job files in <dir>.")
def add_database_options(parser):
    """
    Adds database screening options to the provided parser.

    :param parser: Argument parser object
    :type parser: argparser.ArgumentParser
    """
    db_options = phase_screen_utils.add_database_options(parser)
    # -dbsites and -noindex are mutually exclusive site-generation policies.
    site_treatment = db_options.add_mutually_exclusive_group(required=False)
    site_treatment.add_argument(
        "-dbsites",
        choices=["on", "off", "auto"],
        help="Controls whether pharmacophore sites should be generated "
        "on-the-fly using the hypothesis feature definitons: on = always "
        "generate sites; off = never generate sites; auto = generate sites "
        "only if the hypothesis and database feature definitions differ. "
        "Note that the job will fail if the feature definitions differ and "
        "-dbsites off is used. The default is auto, which allows fast "
        "pre-filtering using the 2D/3D index if the database and "
        "hypothesis feature definitions are the same.")
    site_treatment.add_argument(
        "-noindex",
        action="store_true",
        help="Do not pre-filter using 2D/3D index. Sites will be generated "
        "on-the-fly if database and hypothesis feature definitions differ.")
def add_matching_options(parser):
    """
    Adds matching options to the provided parser.

    :param parser: Argument parser object.
    :type parser: argparser.ArgumentParser
    """
    matching_options = parser.add_argument_group(title="Matching Options")
    # -d and -usetol are mutually exclusive ways to set the distance tolerance.
    dist_tol = matching_options.add_mutually_exclusive_group(required=False)
    default_tol = phase.PHASE_DEFAULT_DELTA_DIST
    dist_tol.add_argument(
        "-d",
        type=float,
        metavar="<dist>",
        choices=[phase_utils.RestrictedRange(0.0, None, False)],
        help="Intersite distance matching tolerance in angstroms. The default is "
        "%.1f if neither -d <dist> nor -usetol is specified." % default_tol)
    dist_tol.add_argument(
        "-usetol",
        action="store_true",
        help=
        "Use the sum of the two largest positional tolerances as the intersite "
        "distance matching tolerance. Defaults back to %.1f if the hypothesis "
        "has no positional tolerances." % default_tol)
    matching_options.add_argument(
        "-match",
        type=int,
        metavar="<minsites>",
        help=
        "Minimum number of hypothesis sites to match. The default is all sites."
    )
    matching_options.add_argument(
        "-ex",
        action="store_true",
        help="Do an exhaustive screen that considers matches to n sites, n-1 "
        "sites,...,<minsites>. By default, if a molecule yields matches to a "
        "given number of sites, matches to smaller numbers of sites will not "
        "be considered.")
    matching_options.add_argument(
        "-t",
        type=float,
        metavar="<tlimit>",
        choices=[phase_utils.RestrictedRange(0.0, None, False)],
        help=
        "CPU time limit in seconds for finding matches to each molecule. Does "
        "not include the time to generate conformers. The default is no time "
        "limit.")
    matching_options.add_argument(
        "-inplace",
        action="store_true",
        help="Find and score matches without aligning to the hypothesis. "
        "Appropriate only when the structures being screened are already in "
        "the same frame of reference as the hypothesis, such as docked poses "
        "that are being screened with a receptor-based hypothesis.")
    matching_options.add_argument(
        "-notol",
        action="store_true",
        help="Ignore positional tolerances if present in hypothesis.")
    matching_options.add_argument(
        "-nocnst",
        action="store_true",
        help=
        "Ignore distance/angle/dihedral constraints if present in hypothesis.")
    matching_options.add_argument(
        "-nomask",
        action="store_true",
        help="Ignore required match conditions if present in hypothesis.")
    matching_options.add_argument(
        "-norules",
        action="store_true",
        help="Ignore feature-matching rules if present in hypothesis.")
def add_reporting_options(parser):
    """
    Adds reporting options to the provided parser.

    :param parser: Argument parser object
    :type parser: argparser.ArgumentParser
    """
    reporting_options = phase_screen_utils.add_reporting_options(
        parser, "PHASE_SCREEN_SCORE")
    reporting_options.add_argument(
        "-prehypo",
        action="store_true",
        help="Prepend pharmacophore hypothesis to hit file. Ignored if -osd "
        "is used.")
    reporting_options.add_argument(
        "-report",
        type=int,
        metavar="<n>",
        choices=[phase_utils.RestrictedRange(0, None, False)],
        help="Report up to <n> hits per molecule, grouped and sorted by "
        "PHASE_SCREEN_SCORE. The default is 1.")
    reporting_options.add_argument("-noqsar",
                                   action="store_true",
                                   help="Do not apply QSAR model to hits.")
def add_scoring_options(parser):
    """
    Adds scoring/filtering options to the provided parser.

    :param parser: Argument parser object.
    :type parser: argparser.ArgumentParser
    """
    scoring_options = parser.add_argument_group(
        title="Scoring and Filtering Options")
    scoring_options.add_argument(
        "-noref",
        action="store_true",
        help="Ignore reference ligand. This turns off vector and volume scoring."
    )
    scoring_options.add_argument(
        "-noxvol",
        action="store_true",
        help="Ignore excluded volumes if present in hypothesis.")
    scoring_options.add_argument(
        "-noivol",
        action="store_true",
        help="Ignore included volumes if present in hypothesis.")
    scoring_options.add_argument(
        "-atypes",
        action="store_true",
        help="Use MacroModel atom types when computing volume scores.")
    scoring_options.add_argument(
        "-aw",
        type=float,
        metavar="<weight>",
        choices=[phase_utils.RestrictedRange(0.0, None, True)],
        help="Alignment score weight. Must be >= 0. The default is %.1f." %
        phase.PHASE_DEFAULT_ALIGN_WEIGHT)
    scoring_options.add_argument(
        "-ac",
        type=float,
        metavar="<cutoff>",
        choices=[phase_utils.RestrictedRange(0.0, None, False)],
        help="Alignment score cutoff. Must be > 0. The default is %.1f." %
        phase.PHASE_DEFAULT_ALIGN_CUTOFF)
    scoring_options.add_argument(
        "-hard",
        action="store_true",
        help=
        "Apply alignment score cutoff as a hard filter to eliminate hits with "
        "alignment scores that exceed the cutoff. By default, the alignment "
        "score cutoff is used only as a parameter in the alignment score "
        "formula.")
    scoring_options.add_argument(
        "-ap",
        type=float,
        metavar="<penalty>",
        choices=[phase_utils.RestrictedRange(0.0, None, True)],
        help=
        "Partial matching alignment score penalty. Must be >= 0. The default "
        "is %.1f." % phase.PHASE_DEFAULT_ALIGN_PENALTY)
    scoring_options.add_argument(
        "-vw",
        type=float,
        metavar="<weight>",
        choices=[phase_utils.RestrictedRange(0.0, None, True)],
        help="Vector score weight. Must be >= 0. The default is %.1f." %
        phase.PHASE_DEFAULT_VECTOR_WEIGHT)
    scoring_options.add_argument(
        "-vc",
        type=float,
        metavar="<cutoff>",
        choices=[phase_utils.RestrictedRange(-1.0, 1.0)],
        help="Eliminate hits with vector scores below this value. Must lie on "
        "[-1, 1]. The default is %.1f." % phase.PHASE_DEFAULT_VECTOR_CUTOFF)
    scoring_options.add_argument(
        "-volw",
        type=float,
        metavar="<weight>",
        choices=[phase_utils.RestrictedRange(0.0, None, True)],
        help="Volume score weight. Must be >= 0. The default is %.1f." %
        phase.PHASE_DEFAULT_VOLUME_WEIGHT)
    scoring_options.add_argument(
        "-volc",
        type=float,
        metavar="<cutoff>",
        choices=[phase_utils.RestrictedRange(0.0, 1.0)],
        help="Eliminate hits with volume scores below this value. Must lie on "
        "[0, 1]. The default is %.1f." % phase.PHASE_DEFAULT_VOLUME_CUTOFF)
    scoring_options.add_argument(
        "-ivolw",
        type=float,
        metavar="<weight>",
        choices=[phase_utils.RestrictedRange(0.0, None, True)],
        help="Included volume score weight. Must be >= 0. The default is %.1f."
        % phase.PHASE_DEFAULT_IVOL_WEIGHT)
    scoring_options.add_argument(
        "-ivolc",
        type=float,
        metavar="<cutoff>",
        choices=[phase_utils.RestrictedRange(0.0, 1.0)],
        help=
        "Eliminate hits with included volume scores below this value. Must lie "
        "on [0, 1]. The default is %.1f." % phase.PHASE_DEFAULT_IVOL_CUTOFF)
def combine_hit_files(args, subjobs):
    """
    Combines hit files for the supplied subjobs into a single parent hit
    file, optionally sorting and capping the number of hits.

    :param args: Command line arguments
    :type args: argparse.Namespace

    :param subjobs: Subjob names
    :type subjobs: list(str)
    """
    subjob_hit_files = [name + phase.PHASE_HIT_FILE_EXT_MAE for name in subjobs]
    if args.nosort:
        # Preserve subjob order; keep every hit.
        max_hits = sort_prop = None
    else:
        max_hits = phase_screen_utils.get_max_hits(args)
        sort_prop = phase.PHASE_GROUP_FITNESS
    out_ext = (phase.PHASE_HIT_FILE_EXT_SDF
               if args.osd else phase.PHASE_HIT_FILE_EXT_MAE)
    phase_screen_utils.combine_hit_files(subjob_hit_files,
                                         args.jobname + out_ext, max_hits,
                                         sort_prop)
def distribute_hypos(hypos, num_zip_files, jobname):
    """
    Distributes the supplied hypotheses equally over the indicated number of
    zip files and returns the names of those zip files.

    :param hypos: Hypotheses
    :type hypos: list(PhpHypoAdaptor)

    :param num_zip_files: Number of zip files to create
    :type num_zip_files: int

    :param jobname: Job name
    :type jobname: str

    :return: Names of zip files
    :rtype: list(str)
    """
    zip_subdir = jobname + "_hypo_files"
    if os.path.isdir(zip_subdir):
        # Start from a clean subdirectory in case of a previous run.
        fileutils.force_rmtree(zip_subdir)
    os.mkdir(zip_subdir)
    # Save hypotheses to zip_subdir so that they will be at the root level
    # of each zip file.
    hypo_files = []
    for hypo in hypos:
        hypo_file = hypo.getHypoID() + phase.PHASE_HYPO_FILE_EXT
        hypo_files.append(hypo_file)
        hypo_subdir_file = os.path.join(zip_subdir, hypo_file)
        hypo.save(hypo_subdir_file, True)
    # Number of hypotheses in each zip file:
    subjob_hypo_counts = phase.partitionValues(len(hypos), num_zip_files)
    # Create zip files.
    zip_files = []
    with fileutils.chdir(zip_subdir):
        jstart = 0
        for i in range(num_zip_files):
            zip_file_name = "%s_sub_%d_hypos.zip" % (jobname, i + 1)
            zip_files.append(os.path.join(zip_subdir, zip_file_name))
            jstop = jstart + subjob_hypo_counts[i]
            with zipfile.ZipFile(zip_file_name, 'w') as zfile:
                for j in range(jstart, jstop):
                    hypo_file = hypo_files[j]
                    zfile.write(hypo_file)
                    # Remove the loose copy; only the zip archives remain.
                    fileutils.force_remove(hypo_file)
            jstart = jstop
    return zip_files
def get_common_args(args):
    """
    Returns a command containing arguments that are common to all subjobs.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :return: Command with common arguments
    :rtype: list(str)
    """
    # Options that are subjob-specific and must not be forwarded:
    excluded = ["source", "hypo", "jobname", "njobs", "isub"]
    common_args = []
    for key, value in vars(args).items():
        # Skip excluded keys, unset options (None) and disabled flags (False).
        if key in excluded or value is None or value is False:
            continue
        flag = "-" + key
        if value is True:
            # Boolean flag: forward just the flag itself.
            common_args.append(flag)
        else:
            # Valued option: forward the flag and its stringified value.
            common_args.extend([flag, str(value)])
    return common_args
def get_hypos(hypo_file):
    """
    Reads hypothesis or hypotheses from a .phypo or .zip file.

    :param hypo_file: A .phypo or .zip file
    :type hypo_file: str

    :return: list of one or more hypotheses
    :rtype: list(PhpHypoAdaptor)

    :raise OSError: If a .zip file contains no hypothesis files
    :raise phase.PhpException: If hypothesis IDs are not unique
    """
    hypo_dir = ""
    hypo_path = phase_utils.get_proper_path(hypo_file)
    hypo_files = [hypo_path]
    file_format = phase.get_phase_file_format(hypo_path)
    if file_format == phase.PhpFileFormat_PHP_FORMAT_ZIP:
        hypo_files = []
        with zipfile.ZipFile(hypo_path, 'r') as zfile:
            for file_name in zfile.namelist():
                file_format = phase.get_phase_file_format(file_name)
                if file_format == phase.PhpFileFormat_PHP_FORMAT_PHYPO:
                    hypo_files.append(file_name)
            if not hypo_files:
                mesg = "No hypotheses found in \"%s\"" % hypo_path
                raise OSError(mesg)
            # Extract to a scratch directory that is removed below.
            hypo_dir = tempfile.mkdtemp()
            zfile.extractall(hypo_dir)
    hypos = []
    hypo_ids = []
    try:
        for file_name in hypo_files:
            # When reading a bare .phypo file, hypo_dir is "" and
            # os.path.join leaves the path unchanged.
            hypo = phase.PhpHypoAdaptor(os.path.join(hypo_dir, file_name))
            hypos.append(hypo)
            hypo_ids.append(hypo.getHypoID())
        # The screening code assumes that the hypothesis IDs are unique.
        if len(set(hypo_ids)) < len(hypo_ids):
            mesg = "Hypothesis IDs are not unique: %s" % ", ".join(hypo_ids)
            raise phase.PhpException(mesg)
    finally:
        # Always clean up the extraction directory, even on failure.
        if hypo_dir:
            fileutils.force_rmtree(hypo_dir)
    return hypos
def get_min_sites(hypo, user_match):
    """
    Returns the minimum number of sites that must be matched in the supplied
    hypothesis. This may come from user_match or from the PHASE_MIN_SITES
    property in the hypothesis. If neither is specified, it will be the
    total number of sites in the hypothesis.

    :param hypo: pharmacophore hypothesis
    :type hypo: PhpHypoAdaptor

    :param user_match: User-specified minimum number of sites or None
    :type user_match: int

    :return: Minimum number of sites to match
    :rtype: int
    """
    # User supplied:
    if user_match is not None:
        return user_match
    # Get from hypothesis:
    try:
        return int(hypo.getProp(phase.PHASE_MIN_SITES))
    except phase.PhpException:
        # Hypothesis carries no PHASE_MIN_SITES property; require all sites.
        return hypo.getSiteCount()
def get_num_subjobs(args):
    """
    Returns the number of subjobs requested on the command line via the
    -NJOBS or -HOST option.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :return: Number of subjobs
    :rtype: int
    """
    ncpu = jobcontrol.calculate_njobs(jobcontrol.get_backend_host_list())
    requested = args.njobs
    if requested is None:
        return ncpu
    # -NJOBS may exceed, but never undercut, the CPU count from -HOST.
    return max(requested, ncpu)
def get_parser():
    """
    Creates argparse.ArgumentParser with supported command line options.

    :return: Argument parser object
    :rtype: argparser.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        prog=PHASE_SCREEN,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        "source",
        metavar="<source>",
        help=
        "The source of structures to screen. Must be a Maestro file, SD file, "
        "an absolute path to a Phase database (.phdb), or a list file (.list) "
        "that contains the names of one or more Maestro file, SD files or "
        "Phase databases, with one name per line. A list file that mixes "
        "Maestro/SD files and databases is not permitted.")
    parser.add_argument(
        "hypo",
        metavar="<hypo>",
        help=
        "Hypothesis file (.phypo) or a Zip archive (.zip) containing multiple "
        "hypothesis files at the root level. Each returned hit will contain "
        "the property %s to indicate the hypothesis it matched." %
        phase.PHASE_HYPOID)
    parser.add_argument(
        "jobname",
        metavar="<jobname>",
        help="Job name. Hits are returned in <jobname>-hits.maegz.")
    conformer_reader.add_file_options(parser)
    add_database_options(parser)
    conformer_reader.add_confgen_options(parser, True)
    add_matching_options(parser)
    add_reporting_options(parser)
    add_scoring_options(parser)
    add_jobcontrol_options(parser)
    add_hidden_options(parser)
    return parser
def get_source_files(source):
    """
    Returns the names of the files/databases/zipped projects to be screened,
    taking proper account of whether the current process is running under
    job control.

    :param source: A legal source of structures to screen
    :type source: str

    :return: Names of files/database/zipped projects to screen
    :rtype: list(str)
    """
    source_files = [phase_utils.get_proper_path(source)]
    source_format = phase.get_phase_file_format(source)
    if source_format == phase.PhpFileFormat_PHP_FORMAT_LIST:
        # A .list file supplies one file/database name per line.
        source_files = phase_utils.get_file_names_from_list_file(source)
    return source_files
def get_source_format(source):
    """
    Returns the format of source as a SourceFormat object.

    :param source: The name of a file, database or zipped project
    :type source: str

    :return: The format of source
    :rtype: SourceFormat
    """
    return SOURCE_FORMAT_DESCRIPTIONS[phase.get_phase_file_format(source)]
def prepend_hypos(args):
    """
    Prepends pharmacophore hypotheses to the hit file.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace
    """
    # Write hypotheses to a temporary Maestro file, excluding any additional
    # CTs because it might be annoying and/or confusing to include them among
    # the hits.
    hypos = get_hypos(args.hypo)
    jobname = args.jobname
    ext = phase.PHASE_HIT_FILE_EXT_MAE
    hypos_file = jobname + "_hypos_" + ext
    with structure.StructureWriter(hypos_file) as writer:
        for hypo in hypos:
            hypo_id = hypo.getHypoID()
            # Tag each hypothesis with a subgroup ID so Maestro groups it
            # under the job's entry.
            subgroup = jobname + mm.M2IO_SUBGROUP_SEPARATOR + hypo_id
            hypo.addProp(mm.M2IO_DATA_SUBGROUPID, subgroup)
            st = structure.Structure(hypo.getHypoCt())
            writer.append(st)
    # Concatenate hypotheses and hits to a temporary file, then rename it.
    hit_file = jobname + ext
    files_in = [hypos_file, hit_file]
    hit_file_tmp = jobname + "_tmp_" + ext
    fileutils.cat(files_in, hit_file_tmp)
    fileutils.force_rename(hit_file_tmp, hit_file)
def remove_output_files(args):
    """
    Removes output files that would be created in the launch directory by
    the parent job.

    :param args: Command line arguments
    :type args: argparse.Namespace
    """
    if jobcontrol.under_job_control():
        # Job control manages its own output files; nothing to clean up.
        return
    jobname = args.jobname
    fileutils.force_remove(jobname + ".log")
    if args.okay:
        fileutils.force_remove(jobname + ".okay")
    hit_ext = (phase.PHASE_HIT_FILE_EXT_SDF
               if args.osd else phase.PHASE_HIT_FILE_EXT_MAE)
    fileutils.force_remove(jobname + hit_ext)
def setup_db_screen(args, db_paths):
    """
    Does setup for a distributed database screen.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :param db_paths: Databases to screen
    :type db_paths: list(str)

    :return: list of subjob commands
    :rtype: list(list(str))
    """
    nsub = get_num_subjobs(args)
    subset = phase_utils.get_proper_path(
        phase_screen_utils.get_subset_file(args))
    if subset == "":
        # No user-supplied subset file; use the default screening subset.
        subset = phase.PHASE_SCREEN_SUBSET
    prefix = args.jobname + "_sub"
    splitter = phase.PhpSubsetSplitter(db_paths, nsub, prefix, subset)
    commands = []
    common_args = get_common_args(args)
    hypo = phase_utils.get_proper_path(args.hypo)
    nsub = splitter.getSubjobCount()  # Could be smaller than requested
    for subjob_number in range(1, nsub + 1):
        subjob_name = "%s_sub_%d" % (args.jobname, subjob_number)
        db_list_file = "%s_dbs.list" % subjob_name
        subset_list_file = "%s_subsets.list" % subjob_name
        command = [
            PHASE_SCREEN, db_list_file, hypo, subjob_name, "-isub_list",
            subset_list_file, "-subjob"
        ] + common_args
        commands.append(command)
        subjob_input_files = [db_list_file, hypo, subset_list_file]
        db_paths_subjob = list(splitter.getDbPaths(subjob_number))
        phase_utils.write_list_to_file(db_list_file, db_paths_subjob)
        subset_files_subjob = list(splitter.getSubsetFiles(subjob_number))
        subjob_input_files.extend(subset_files_subjob)
        phase_utils.write_list_to_file(subset_list_file, subset_files_subjob)
        # <subjob>_inputs.list names every input file the subjob needs.
        input_list_file = subjob_name + "_inputs.list"
        phase_utils.write_list_to_file(input_list_file, subjob_input_files)
    return commands
def setup_distributed_screen(args):
    """
    Does all the setup required to launch distributed subjobs. This includes
    splitting input files or database subsets, and creation of the files
    <subjob>_inputs.list, which contain the names of the input files for
    each subjob. Returns a list of subjob commands that can be supplied
    directly to JobDJ.addJob. The number of commands may be larger than the
    number CPUs requested if the -NJOBS option is used to divide the work
    over a larger number of work units. Conversely, the number of commands
    may be smaller than requested if the provided source(s) of structures
    cannot be subdivided as requested (e.g., 2 multi-conformer files cannot
    be split over more than 2 subjobs).

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :return: list of subjob commands
    :rtype: list(list(str))
    """
    source_files = get_source_files(args.source)
    # All sources share one format by this point, so inspecting the first
    # file is sufficient to choose the setup workflow.
    fmt = phase.get_phase_file_format(source_files[0])
    if fmt == phase.PhpFileFormat_PHP_FORMAT_PHDB:
        return setup_db_screen(args, source_files)
    if fmt == phase.PhpFileFormat_PHP_FORMAT_PHZIP:
        return setup_project_screen(args, source_files)
    if args.flex or args.distinct:
        # Single-conformer records: files can be split for load balancing.
        return setup_split_file_screen(args, source_files)
    return setup_fixed_file_screen(args, source_files)
def setup_fixed_file_screen(args, file_names):
    """
    Does setup for a distributed file screen where multiple conformers per
    molecule are present and thus the files cannot be split. Note that the
    maximum number of subjobs will not exceed the number of input files, and
    the load balancing may be less than optimal if the input files differ
    significantly in their numbers of molecules and/or conformers.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :param file_names: Files to screen with runtime paths
    :type file_names: list(str)

    :return: list of subjob commands
    :rtype: list(list(str))
    """
    file_count = len(file_names)
    nsub = min(get_num_subjobs(args), file_count)
    # Number of files assigned to each subjob:
    subjob_file_counts = phase.partitionValues(file_count, nsub)
    commands = []
    common_args = get_common_args(args)
    hypo = phase_utils.get_proper_path(args.hypo)
    jstart = 0
    for i in range(nsub):
        subjob_name = "%s_sub_%d" % (args.jobname, i + 1)
        file_list_file = "%s_files.list" % subjob_name
        command = [PHASE_SCREEN, file_list_file, hypo, subjob_name, "-subjob"
                  ] + common_args
        commands.append(command)
        jstop = jstart + subjob_file_counts[i]
        files_i = [file_names[j] for j in range(jstart, jstop)]
        jstart = jstop
        phase_utils.write_list_to_file(file_list_file, files_i)
        # <subjob>_inputs.list names every input file the subjob needs.
        subjob_input_files = files_i + [file_list_file, hypo]
        input_list_file = subjob_name + "_inputs.list"
        phase_utils.write_list_to_file(input_list_file, subjob_input_files)
    return commands
def setup_project_screen(args, project_names):
    """
    Does setup for a distributed screen of zipped projects. This workflow is
    used only by phase_find_common, where a project of actives and a project
    of decoys are screened against the top-n pharmacophore hypotheses found
    by the common pharmacophore algorithm. Because we can't unzip a project
    and hope that its database lands on a cross-mounted disk, we can't
    readily divide the record numbers of the project database over multiple
    subjobs, as we do for a standard database screen. The most practical
    approach is to divide the hypotheses equally over the subjobs and have
    each subjob screen its own local copies of the unzipped project
    databases.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :param project_names: Zipped projects to screen with runtime paths
    :type project_names: list(str)

    :return: list of subjob commands
    :rtype: list(list(str))
    """
    hypos = get_hypos(args.hypo)
    hypo_count = len(hypos)
    # Never create more subjobs than there are hypotheses to distribute.
    nsub = min(get_num_subjobs(args), hypo_count)
    subjob_hypo_files = distribute_hypos(hypos, nsub, args.jobname)
    commands = []
    common_args = get_common_args(args)
    project_list_file = phase_utils.get_proper_path(args.source)
    for i in range(nsub):
        subjob_name = "%s_sub_%d" % (args.jobname, i + 1)
        hypo_file = subjob_hypo_files[i]
        command = [
            PHASE_SCREEN, project_list_file, hypo_file, subjob_name, "-subjob"
        ] + common_args
        commands.append(command)
        # <subjob>_inputs.list names every input file the subjob needs.
        subjob_input_files = project_names + [project_list_file, hypo_file]
        input_list_file = subjob_name + "_inputs.list"
        phase_utils.write_list_to_file(input_list_file, subjob_input_files)
    return commands
def setup_split_file_screen(args, file_names):
    """
    Does setup for a distributed file screen with splitting of the input
    files so that each subjob receives a single file with approximately the
    same number of structures as the other subjobs.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :param file_names: Files to screen with runtime paths
    :type file_names: list(str)

    :return: list of subjob commands
    :rtype: list(list(str))
    """
    nsub = get_num_subjobs(args)
    prefix = args.jobname + "_sub"
    splitter = phase.PhpStructureSplitter(file_names, nsub, prefix)
    commands = []
    common_args = get_common_args(args)
    hypo = phase_utils.get_proper_path(args.hypo)
    split_files = splitter.getFileNamesOut()
    # The splitter may produce fewer files than the requested subjob count.
    nsub = len(split_files)
    for i, split_file in enumerate(split_files):
        subjob_name = "%s_sub_%d" % (args.jobname, i + 1)
        command = [PHASE_SCREEN, split_file, hypo, subjob_name, "-subjob"
                  ] + common_args
        commands.append(command)
        # <subjob>_inputs.list names every input file the subjob needs.
        subjob_input_files = [split_file, hypo]
        input_list_file = subjob_name + "_inputs.list"
        phase_utils.write_list_to_file(input_list_file, subjob_input_files)
    return commands
def validate_args(args):
    """
    Checks the validity of command line options.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :return: tuple of validity and error message if not valid
    :rtype: bool, str
    """
    conflict_ok, mesg = conformer_reader.validate_confgen_conflicts(args)
    if not conflict_ok:
        return False, mesg
    if args.inplace and (args.flex or args.refine):
        return False, '-inplace is not allowed with -flex or -refine'
    # Run the remaining validators in order; stop at the first failure.
    checks = (validate_source, validate_hypo, phase_screen_utils.validate_subset)
    for check in checks:
        ok, mesg = check(args)
        if not ok:
            return False, mesg
    if jobcontrol.under_job_control():
        # -dbsites legality requires access to the databases, so it can
        # only be checked once the job is running on the remote host.
        ok, mesg = validate_dbsites(args)
        if not ok:
            return False, mesg
    return True, ""
def validate_dbsites(args):
    """
    Checks the legality of the -dbsites option w.r.t. to all databases and
    hypotheses. Should be called only after job is running on remote host.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :return: tuple of validity and error message if invalid
    :rtype: bool, str
    """
    if args.dbsites != "off":
        # "on" and "auto" permit on-the-fly site generation, so differing
        # feature definitions are not a problem.
        return True, ""
    source_files = get_source_files(args.source)
    if phase_utils.is_phase_database_path(source_files[0]):
        hypos = get_hypos(args.hypo)
        for source in source_files:
            for hypo in hypos:
                if not phase.same_feature_definitions(source, hypo):
                    hypo_id = hypo.getHypoID()
                    s = "Feature definitions for database %s and " + \
                        "hypothesis %s differ. Cannot use -dbsites off."
                    return False, s % (source, hypo_id)
    return True, ""
def validate_hypo(args):
    """
    Checks the validity of the hypothesis or hypotheses.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :return: tuple of validity and error message if not valid
    :rtype: bool, str
    """
    hypo_format = phase.get_phase_file_format(args.hypo)
    if hypo_format not in LEGAL_HYPO_FORMATS:
        mesg = "\"%s\" is not a Phase hypothesis or Zip archive" % args.hypo
        return False, mesg
    hypo_file = phase_utils.get_proper_path(args.hypo)
    if not os.path.isfile(hypo_file):
        return False, "Hypothesis file \"%s\" not found" % hypo_file
    hypos = get_hypos(hypo_file)
    # Verify that partial matching settings are legal. With -inplace a
    # single-site match is allowed; otherwise at least two sites are needed.
    smallest_min_sites = 1 if args.inplace else 2
    for hypo in hypos:
        min_sites = get_min_sites(hypo, args.match)
        num_sites = hypo.getSiteCount()
        hypo_id = hypo.getHypoID()
        if min_sites > num_sites:
            s = "Hypothesis %s: Minimum number of sites to match (%d) " + \
                "exceeds number of sites in hypothesis (%d)"
            return False, s % (hypo_id, min_sites, num_sites)
        if min_sites < num_sites and min_sites < smallest_min_sites:
            s = "Hypothesis %s: Illegal minimum number of sites to match: " + \
                "%d. Must be %d or greater."
            return False, s % (hypo_id, min_sites, smallest_min_sites)
    return True, ""
def validate_source(args):
    """
    Checks the validity of the source of structures to screen and the
    validity of the command line options w.r.t. the source type.

    :param args: argparser.Namespace with command line options
    :type args: argparser.Namespace

    :return: tuple of validity and error message if not valid
    :rtype: bool, str
    """
    source_format = phase.get_phase_file_format(args.source)
    if source_format not in LEGAL_SOURCE_FORMATS:
        mesg = "\"%s\" is not a valid source of structures" % args.source
        return False, mesg
    source_files = get_source_files(args.source)
    source_is_list = source_format == phase.PhpFileFormat_PHP_FORMAT_LIST
    source_formats = [phase.get_phase_file_format(f) for f in source_files]
    if source_is_list:
        if not source_files:
            return False, "No source file names found in \"%s\"" % args.source
        # Use distinct loop names to avoid clobbering source_format above.
        for list_file, list_format in zip(source_files, source_formats):
            if list_format not in LEGAL_LIST_FORMATS:
                mesg = "Unrecognized structure file type: \"%s\"" % list_file
                return False, mesg
    unique_formats = set(source_formats)
    screen_db = phase.PhpFileFormat_PHP_FORMAT_PHDB in unique_formats
    screen_proj = phase.PhpFileFormat_PHP_FORMAT_PHZIP in unique_formats
    if len(unique_formats) > 1:
        # Databases and projects cannot be mixed with anything else, and
        # -distinct/-flex require a homogeneous set of structure files.
        mesg = ""
        if screen_db:
            mesg = "Cannot screen a mixture of databases and other file types"
        elif screen_proj:
            mesg = "Cannot screen a mixture of projects and other file types"
        elif args.distinct or args.flex:
            mesg = "Cannot provide a mixture of Maestro and SD files with " + \
                   "-distinct or -flex"
        if mesg:
            return False, mesg
    file_options = args.distinct or args.connect or args.stereo or args.title
    if (screen_db or screen_proj) and file_options:
        return False, "File screening options are not legal for this job"
    if not screen_db and (args.isub or args.dbsites or args.sites or
                          args.noindex):
        return False, "Database screening options are not legal for this job"
    if screen_db:
        if len(source_files) > 1 and args.isub is not None:
            mesg = "-isub is not allowed when screening multiple databases"
            return False, mesg
        db_ok, mesg = phase_screen_utils.validate_source_dbs(source_files)
        if not db_ok:
            return False, mesg
    else:
        for source_file in source_files:
            if not os.path.isfile(source_file):
                return False, "Structure file \"%s\" not found" % source_file
    return True, ""