# Source code for schrodinger.application.phase.packages.oned_option_utils

"""
Provides argument parsing and validation for the 1D similarity driver.

Copyright Schrodinger LLC, All Rights Reserved.
"""

import argparse
import csv
import os

import schrodinger.application.phase.packages.oned_task_utils as task_utils
from schrodinger import structure
from schrodinger.application.phase.packages import phase_utils
from schrodinger.application.phase.packages.phase_utils import ValidationError
from schrodinger.infra import phase
from schrodinger.utils import cmdline
from schrodinger.utils import fileutils


def add_create_args(parser):
    """
    Adds the TASK_CREATE command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    req_group = parser.add_argument_group('Required Arguments')
    req_group.add_argument(
        '-source', metavar='<input>', required=True,
        help='File of structures from which to create the 1D data file. '
        'Supported formats are SMILES, SMILES-CSV, Maestro and SD. Creation '
        'of a series of smaller 1D data files rather than a single large file '
        'is strongly recommended for the fastest multi-CPU screens.')
    req_group.add_argument(
        '-dest', metavar='<output>.1dbin', required=True,
        help='Name of 1D data file to create. Must include an absolute path if '
        '-nocopy is supplied.')
    opt_group = parser.add_argument_group('Creation Options')
    opt_group.add_argument(
        '-treatment', choices=task_utils.ONED_TREATMENTS,
        default=task_utils.ONED_PHARM,
        help='Treat each structure as a set of Phase pharmacophore features, '
        'as a set of atoms distinguished by elemental type, or as a set of '
        'atoms distinguished by Macromodel type (default: %(default)s).')
    opt_group.add_argument(
        '-fd', metavar='<fd_file>',
        help='Use pharmacophore feature definitions in the supplied file, '
        'rather than the default definitions in the software distribution. '
        f'Applicable only when the treatment is {task_utils.ONED_PHARM}.')
    opt_group.add_argument(
        '-props', metavar='<list>',
        help='Comma-separated list of properties in the source file that '
        'should be stored in the 1D data file. Property names must be of the '
        'form <t>_<source>_<name>, where <t> is the property type ("b", "i", '
        '"r", "s"), <source> indicates the origin of the property ("phase", '
        '"user", etc.), and <name> is a descriptive name ("Fitness", "pIC50", '
        'etc.). By default, the 1D data file will contain the SMILES, name '
        'and a 1D encoding of each structure.')
    # Running as a subjob:
    parser.add_argument("-subjob", help=argparse.SUPPRESS)
def add_describe_args(parser):
    """
    Adds the TASK_DESCRIBE command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    req_group = parser.add_argument_group('Required Arguments')
    req_group.add_argument(
        '-source', metavar='<input>.1dbin', required=True,
        help='1D data file to describe. Must include an absolute path if '
        '-nocopy is supplied.')
    opt_group = parser.add_argument_group('Description Options')
    opt_group.add_argument(
        '-stats', action='store_true',
        help='Report statistics of any numeric properties stored in the 1D '
        'data file.')
    opt_group.add_argument(
        '-fd', metavar='<fd_file>',
        help='Save feature definitions to a file. Valid only when the 1D data '
        f'file was created with treatment {task_utils.ONED_PHARM}.')
def add_export_args(parser):
    """
    Adds arguments for TASK_EXPORT.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    required_args = parser.add_argument_group('Required Arguments')
    required_args.add_argument(
        '-source', metavar='<input>.1dbin', required=True,
        help='1D data file from which to export. Must include an absolute path '
        'if -nocopy is supplied. Exported rows are written to '
        '<jobname>.csv.gz.')
    export_args = parser.add_argument_group('Export Options')
    # -rows and -match are alternative ways of selecting rows, so at most
    # one of them may be supplied.
    rows_or_match = export_args.add_mutually_exclusive_group(required=False)
    rows_or_match.add_argument(
        '-rows', metavar='<ranges>',
        # Typo fix: "specfied" --> "specified".
        help='Export a subset of rows specified as comma-separated ranges. For '
        'example, "-rows 1:100,200:300" means rows 1 through 100 plus rows 200 '
        'through 300.')
    rows_or_match.add_argument(
        '-match', metavar='<filename>',
        help='Match a list of property values. The first line in the supplied '
        'file must be the name of a property in the 1D data file (e.g., '
        '"s_m_title" or "s_sd_Vendor_ID"), and each subsequent line should '
        'contain a value for that property. All rows that match one of the '
        'supplied property values will be exported. Use with floating point '
        'properties is not recommended.')
def add_merge_args(parser):
    """
    Adds the TASK_MERGE command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    group = parser.add_argument_group('Required Arguments')
    group.add_argument(
        '-source', metavar='<input>.list', required=True,
        help='Text file containing the names of the 1D data files to merge, '
        'with one name per line. Absolute paths must be provided if -nocopy is '
        'supplied.')
    group.add_argument(
        '-dest', metavar='<output>.1dbin', required=True,
        help='Destination of merged 1D data file. Must include an absolute '
        'path if -nocopy is supplied.')
def add_run_args(parser):
    """
    Adds the TASK_RUN command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    req_group = parser.add_argument_group('Required Arguments')
    req_group.add_argument(
        '-query', metavar='<qfile>', required=True,
        help='File containing one or more query structures. Supported formats '
        'are SMILES, SMILES-CSV, Maestro and SD. If a single query is '
        'supplied, hits are returned in <jobname>-hits.csv.gz, where <jobname> '
        'is derived from <qfile>. If there are multiple queries, hits are '
        'returned in <jobname>_1-hits.csv.gz,...,<jobname>_<n>-hits.csv.gz, '
        'where <n> is number of queries.')
    req_group.add_argument(
        '-screen', metavar='<source>', required=True,
        help='1D data file (.1dbin) to screen, or list file (.list) '
        'containing the names of the 1D data files to screen, with one name '
        'per line. Use of multiple smaller data files rather than a single '
        'large data file is strongly recommended for the fastest multi-CPU '
        'screens. Absolute paths must be provided if -nocopy is supplied.')
    out_group = parser.add_argument_group('Output Options')
    # -nosort and -keep are mutually exclusive: a hit cap only makes sense
    # when hits are being sorted.
    sort_group = out_group.add_mutually_exclusive_group(required=False)
    sort_group.add_argument(
        '-nosort', action='store_true',
        help='Output hits in the order they are screened. The default is to '
        'sort hits by decreasing similarity to the query.')
    sort_group.add_argument(
        '-keep', type=int, metavar='<maxhits>',
        default=task_utils.ONED_MAX_HITS,
        choices=[phase_utils.RestrictedRange(1, None, True)],
        help='Cap sorted hits at <maxhits> (default: %(default)d).')
    out_group.add_argument(
        '-limit', type=int, metavar='<maxrows>',
        default=task_utils.ONED_MAX_ROWS,
        choices=[phase_utils.RestrictedRange(1, None, True)],
        help='Limit on the number of rows held in memory when sorting '
        '(default: %(default)d).')
    out_group.add_argument(
        '-filter', type=float, metavar='<min>', default=0.0,
        choices=[phase_utils.RestrictedRange(0.0, 1.0, True, False)],
        help='Filter out molecules whose similarities fall below <min>. The '
        'default is to apply no filter.')
    # Guard against user unintentionally supplying a query file with a large
    # number of structures since a separate file of hits is produced for
    # each query. Will issue a warning telling user how to override if limit
    # is exceeded.
    parser.add_argument('-maxq', type=int,
                        default=task_utils.ONED_MAX_QUERIES,
                        help=argparse.SUPPRESS)
    # Running as a subjob:
    parser.add_argument('-subjob', help=argparse.SUPPRESS)
def add_split_args(parser):
    """
    Adds the TASK_SPLIT command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    group = parser.add_argument_group('Required Arguments')
    group.add_argument(
        '-source', metavar='<input>.1dbin', required=True,
        help='1D data file to be split. Must include an absolute path if '
        '-nocopy is supplied.')
    group.add_argument(
        '-dest', metavar='<prefix>', required=True,
        help='Prefix of output files to create. File names will be '
        '<prefix>_1.1dbin, <prefix>_2.1dbin, etc. Must include an absolute '
        'path if -nocopy is supplied.')
    # Splitting into fewer than 2 files would be a no-op, hence the range.
    group.add_argument(
        '-number', type=int, metavar='<n>', required=True,
        choices=[phase_utils.RestrictedRange(2, None, True)],
        help='Number of output files to create.')
def get_parser():
    """
    Creates argparse.ArgumentParser with supported command line options.

    :return: Argument parser object
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        prog=task_utils.ONED_SCREEN,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(
        dest='task', metavar='<task>',
        help='The task to perform. For detailed help on a specific task, use '
        f'{task_utils.ONED_SCREEN} <task> -h. Multiple CPUs may be utilized '
        f'for tasks "{task_utils.TASK_CREATE}" and "{task_utils.TASK_RUN}".')
    # One subparser per task, each populated by its dedicated add_*_args
    # helper. Registration order determines the order in the help text.
    add_create_args(
        subparsers.add_parser(
            task_utils.TASK_CREATE,
            help='Create a 1D data file (.1dbin) from a set of structures.'))
    add_run_args(
        subparsers.add_parser(
            task_utils.TASK_RUN,
            help='Run a screen against one or more 1D data files.'))
    add_merge_args(
        subparsers.add_parser(
            task_utils.TASK_MERGE,
            help='Merge multiple 1D data files into a single file.'))
    add_split_args(
        subparsers.add_parser(
            task_utils.TASK_SPLIT,
            help='Split a 1D data file into multiple files.'))
    add_describe_args(
        subparsers.add_parser(
            task_utils.TASK_DESCRIBE,
            help='Describe the contents of a 1D data file.'))
    add_export_args(
        subparsers.add_parser(
            task_utils.TASK_EXPORT,
            help='Export rows of a 1D data file to a compressed CSV file.'))
    parser.add_argument(
        '-nocopy', action='store_true',
        help='Do not copy source or destination 1D data files between the '
        'local host and job host. 1D data file names must include an absolute '
        'path that exists on the job host and which is accessible to all '
        'compute nodes of that host. Note that this argument must precede all '
        'others, e.g., oned_screen -nocopy create -source ...')
    # Used to indicate that arguments have been set at startup time.
    parser.add_argument('-startup', action='store_true',
                        help=argparse.SUPPRESS)
    cmdline.add_jobcontrol_options(
        parser, options=[cmdline.HOST, cmdline.JOBNAME, cmdline.TMPDIR])
    return parser
def validate_absolute_source_path(source):
    """
    Raises a ValidationError if any 1D data file in source doesn't contain
    an absolute path. source is assumed to be a 1D data file or a .list
    file containing the names of 1D data files.

    :param source: 1D data file or .list file
    :type source: str

    :raise: ValidationError if validation fails
    """
    for filename in task_utils.get_oned_data_file_names(source):
        if not os.path.isabs(filename):
            # Bug fix: the original f-string contained no placeholder, so
            # the offending file name was never reported to the user.
            raise ValidationError(f'File path "{filename}" is not absolute')
def validate_oned_data_file(oned_data_file, must_exist=False):
    """
    Raises a ValidationError if oned_data_file has the wrong extension or
    if must_exist is True and oned_data_file doesn't exist.

    :param oned_data_file: The name of the 1D data file
    :type oned_data_file: str

    :param must_exist: Whether the file must exist
    :type must_exist: bool

    :raise: ValidationError if validation fails
    """
    if not task_utils.is_oned_data_file(oned_data_file):
        raise ValidationError(
            f'Illegal 1D data file extension: "{oned_data_file}"')
    if must_exist and not os.path.isfile(oned_data_file):
        raise ValidationError(f'1D data file "{oned_data_file}" not found')
def validate_oned_data_file_attributes(oned_data_files):
    """
    Raises a ValidationError unless all of the supplied 1D data files
    contain the same attributes.

    :param oned_data_files: The names of the 1D data files
    :type oned_data_files: list(str)

    :raise: ValidationError if validation fails
    """
    # Nothing to compare unless there are at least two files.
    if len(oned_data_files) < 2:
        return
    first = oned_data_files[0]
    ref_attr = task_utils.get_oned_data_file_attributes(first)
    # Feature definitions (attr[2]) only matter for the pharmacophore
    # treatment.
    pharm = ref_attr[1] == phase.ONED_TREATMENT_PHARM
    for other in oned_data_files[1:]:
        filenames = f'{first} and {other}'
        attr = task_utils.get_oned_data_file_attributes(other)
        if attr[0] != ref_attr[0]:
            raise ValidationError(f'Version numbers differ for {filenames}')
        if attr[1] != ref_attr[1]:
            raise ValidationError(
                f'Structure treatments differ for {filenames}')
        if pharm and attr[2] != ref_attr[2]:
            raise ValidationError(
                f'Feature definitions differ for {filenames}')
def validate_oned_data_file_source(source, must_exist=False):
    """
    Raises a ValidationError if the provided source is not a 1D data file
    or a list file containing the names of 1D data files. If must_exist is
    True, a RuntimeError is raised if any 1D data file doesn't exist.

    :param source: The name of the screening source file
    :type source: str

    :param must_exist: Whether 1D data files must exist
    :type must_exist: bool

    :raise: ValidationError if validation fails
    """
    source_format = phase.get_phase_file_format(source)
    if source_format not in (phase.PhpFileFormat_PHP_FORMAT_1DBIN,
                             phase.PhpFileFormat_PHP_FORMAT_LIST):
        raise ValidationError(f'Illegal screening source format: "{source}"')
    # A list file should exist irrespective of must_exist.
    if (source_format == phase.PhpFileFormat_PHP_FORMAT_LIST and
            not os.path.isfile(source)):
        raise ValidationError(f'Source file "{source}" not found')
    for filename in task_utils.get_oned_data_file_names(source):
        validate_oned_data_file(filename, must_exist)
def validate_property_names(property_names):
    """
    Raises a ValidationError if any members of the supplied list are not
    m2io-style properties.

    :param property_names: The property names to check
    :type property_names: list(str)

    :raise: ValidationError if validation fails
    """
    # A falsy value (None or an empty list) means there is nothing to check.
    if not property_names:
        return
    for prop in property_names:
        try:
            structure.PropertyName(prop)
        except Exception as err:
            # Chain the original exception so the underlying parse failure
            # is preserved in the traceback.
            raise ValidationError(str(err)) from err
def validate_structure_file(structure_file):
    """
    Raises a ValidationError if structure_file is not one of the supported
    types or if it doesn't exist.

    :param structure_file: The name of the structure file
    :type structure_file: str

    :raise: ValidationError if validation fails
    """
    fmt = fileutils.get_structure_file_format(structure_file)
    if fmt not in task_utils.LEGAL_STRUCTURE_FILE_FORMATS:
        legal = ', '.join(task_utils.LEGAL_STRUCTURE_FILE_TYPES)
        raise ValidationError(
            f'Illegal structure file format: "{structure_file}". '
            f'Must be one of the following: {legal}.')
    if not os.path.isfile(structure_file):
        raise ValidationError(f'Structure file "{structure_file}" not found')
def validate_create_args(args):
    """
    Raises a ValidationError if arguments for TASK_CREATE are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    validate_oned_data_file(args.dest, False)
    # With -nocopy the destination stays on the job host, so it must be an
    # absolute path.
    if args.nocopy and args.startup and not os.path.isabs(args.dest):
        raise ValidationError(f'File path "{args.dest}" is not absolute')
    validate_structure_file(args.source)
    if args.fd and not os.path.isfile(args.fd):
        raise ValidationError(f'Feature definition file "{args.fd}" not found')
    if args.props:
        # -props is a comma-separated list; csv handles quoted commas.
        validate_property_names(next(csv.reader([args.props])))
def validate_describe_args(args):
    """
    Raises a ValidationError if arguments for TASK_DESCRIBE are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    # The file need not exist locally only when -nocopy is in effect at
    # startup time (it lives on the job host in that case).
    must_exist = not (args.nocopy and args.startup)
    validate_oned_data_file(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
    if must_exist and args.fd:
        _, treatment, _ = task_utils.get_oned_data_file_attributes(args.source)
        if treatment != phase.ONED_TREATMENT_PHARM:
            raise ValidationError(
                f'{args.source} does not contain feature definitions')
def validate_export_args(args):
    """
    Raises a ValidationError if arguments for TASK_EXPORT are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    # The file need not exist locally only when -nocopy is in effect at
    # startup time.
    must_exist = not args.nocopy or not args.startup
    validate_oned_data_file(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
    if args.rows:
        try:
            # Parsing the row ranges is the validation; result discarded.
            task_utils.get_rows_to_export(args.rows)
        except ValueError as err:
            # Chain the original exception so the underlying parse failure
            # is preserved in the traceback.
            raise ValidationError(str(err)) from err
    if args.match and not os.path.isfile(args.match):
        msg = f'File of property values "{args.match}" not found'
        raise ValidationError(msg)
def validate_merge_args(args):
    """
    Raises a ValidationError if arguments for TASK_MERGE are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    if (phase.get_phase_file_format(args.source) !=
            phase.PhpFileFormat_PHP_FORMAT_LIST):
        raise ValidationError('Source must be a .list file')
    # The data files need not exist locally only when -nocopy is in effect
    # at startup time.
    must_exist = not (args.nocopy and args.startup)
    validate_oned_data_file_source(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
        if not os.path.isabs(args.dest):
            raise ValidationError(f'File path "{args.dest}" is not absolute')
    if not args.startup:
        # Prefer local copies of the data files unless -nocopy was given.
        oned_data_files = task_utils.get_oned_data_file_names(
            args.source, not args.nocopy)
        validate_oned_data_file_attributes(oned_data_files)
    validate_oned_data_file(args.dest, False)
def validate_run_args(args):
    """
    Raises a ValidationError if arguments for TASK_RUN are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    validate_structure_file(args.query)
    num_queries = structure.count_structures(args.query)
    if num_queries > args.maxq:
        raise ValidationError(
            f'Number of queries exceeds {args.maxq}. You may override this '
            'limit with -maxq <n>, but note that a separate file of hits is '
            'produced for each query.')
    # The data files need not exist locally only when -nocopy is in effect
    # at startup time.
    must_exist = not (args.nocopy and args.startup)
    validate_oned_data_file_source(args.screen, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.screen)
    if not args.startup and not args.subjob:
        # Prefer local copies of the data files unless -nocopy was given.
        oned_data_files = task_utils.get_oned_data_file_names(
            args.screen, not args.nocopy)
        validate_oned_data_file_attributes(oned_data_files)
def validate_split_args(args):
    """
    Raises a ValidationError if arguments for TASK_SPLIT are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    # The file need not exist locally only when -nocopy is in effect at
    # startup time.
    must_exist = not (args.nocopy and args.startup)
    validate_oned_data_file(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
        if not os.path.isabs(args.dest):
            msg = f'Output file prefix "{args.dest}" is not absolute'
            raise ValidationError(msg)
# Maps each supported task name to the function that validates its
# command line arguments (used by validate_args below).
VALIDATE_TASK_DICT = {
    task_utils.TASK_CREATE: validate_create_args,
    task_utils.TASK_DESCRIBE: validate_describe_args,
    task_utils.TASK_EXPORT: validate_export_args,
    task_utils.TASK_MERGE: validate_merge_args,
    task_utils.TASK_RUN: validate_run_args,
    task_utils.TASK_SPLIT: validate_split_args
}
def validate_args(args):
    """
    Checks the validity of command line arguments.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    # args.task is None when no subcommand was supplied (argparse does not
    # require one by default), so look the task up gracefully rather than
    # letting an uncaught KeyError escape.
    validate = VALIDATE_TASK_DICT.get(args.task)
    if validate is None:
        return False, f'Unrecognized task: "{args.task}"'
    try:
        validate(args)
    except ValidationError as err:
        return False, str(err)
    return True, ''