# Source code for schrodinger.application.phase.packages.oned_option_utils

"""
Provides argument parsing and validation for the 1D similarity driver.

Copyright Schrodinger LLC, All Rights Reserved.
"""

import argparse
import csv
import os

import schrodinger.application.phase.packages.oned_task_utils as task_utils
from schrodinger import structure
from schrodinger.application.phase.packages import phase_utils
from schrodinger.application.phase.packages.phase_utils import ValidationError
from schrodinger.infra import phase
from schrodinger.utils import cmdline
from schrodinger.utils import fileutils


def add_create_args(parser):
    """
    Adds the TASK_CREATE command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    req_group = parser.add_argument_group('Required Arguments')
    req_group.add_argument(
        '-source', metavar='<input>', required=True,
        help='File of structures from which to create the 1D data file. '
        'Supported formats are SMILES, SMILES-CSV, Maestro and SD. Creation '
        'of a series of smaller 1D data files rather than a single large file '
        'is strongly recommended for the fastest multi-CPU screens.')
    req_group.add_argument(
        '-dest', metavar='<output>.1dbin', required=True,
        help='Name of 1D data file to create. Must include an absolute path if '
        '-nocopy is supplied.')
    opt_group = parser.add_argument_group('Creation Options')
    opt_group.add_argument(
        '-treatment', choices=task_utils.ONED_TREATMENTS,
        default=task_utils.ONED_PHARM,
        help='Treat each structure as a set of Phase pharmacophore features, '
        'as a set of atoms distinguished by elemental type, or as a set of '
        'atoms distinguished by Macromodel type (default: %(default)s).')
    opt_group.add_argument(
        '-fd', metavar='<fd_file>',
        help='Use pharmacophore feature definitions in the supplied file, '
        'rather than the default definitions in the software distribution. '
        f'Applicable only when the treatment is {task_utils.ONED_PHARM}.')
    opt_group.add_argument(
        '-props', metavar='<list>',
        help='Comma-separated list of properties in the source file that '
        'should be stored in the 1D data file. Property names must be of the '
        'form <t>_<source>_<name>, where <t> is the property type ("b", "i", '
        '"r", "s"), <source> indicates the origin of the property ("phase", '
        '"user", etc.), and <name> is a descriptive name ("Fitness", "pIC50", '
        'etc.). By default, the 1D data file will contain the SMILES, name '
        'and a 1D encoding of each structure.')
    # Running as a subjob:
    parser.add_argument("-subjob", help=argparse.SUPPRESS)
def add_describe_args(parser):
    """
    Adds the TASK_DESCRIBE command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    req_group = parser.add_argument_group('Required Arguments')
    req_group.add_argument(
        '-source', metavar='<input>.1dbin', required=True,
        help='1D data file to describe. Must include an absolute path if '
        '-nocopy is supplied.')
    opt_group = parser.add_argument_group('Description Options')
    opt_group.add_argument(
        '-stats', action='store_true',
        help='Report statistics of any numeric properties stored in the 1D '
        'data file.')
    opt_group.add_argument(
        '-fd', metavar='<fd_file>',
        help='Save feature definitions to a file. Valid only when the 1D data '
        f'file was created with treatment {task_utils.ONED_PHARM}.')
def add_export_args(parser):
    """
    Adds arguments for TASK_EXPORT.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    required_args = parser.add_argument_group('Required Arguments')
    required_args.add_argument(
        '-source', metavar='<input>.1dbin', required=True,
        help='1D data file from which to export. Must include an absolute path '
        'if -nocopy is supplied. Exported rows are written to '
        '<jobname>.csv.gz.')
    export_args = parser.add_argument_group('Export Options')
    # -rows and -match are alternative ways of selecting rows, so at most
    # one of them may be supplied.
    rows_or_match = export_args.add_mutually_exclusive_group(required=False)
    rows_or_match.add_argument(
        '-rows', metavar='<ranges>',
        # Typo fix: "specfied" --> "specified".
        help='Export a subset of rows specified as comma-separated ranges. For '
        'example, "-rows 1:100,200:300" means rows 1 through 100 plus rows 200 '
        'through 300.')
    rows_or_match.add_argument(
        '-match', metavar='<filename>',
        help='Match a list of property values. The first line in the supplied '
        'file must be the name of a property in the 1D data file (e.g., '
        '"s_m_title" or "s_sd_Vendor_ID"), and each subsequent line should '
        'contain a value for that property. All rows that match one of the '
        'supplied property values will be exported. Use with floating point '
        'properties is not recommended.')
def add_merge_args(parser):
    """
    Adds the TASK_MERGE command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    group = parser.add_argument_group('Required Arguments')
    group.add_argument(
        '-source', metavar='<input>.list', required=True,
        help='Text file containing the names of the 1D data files to merge, '
        'with one name per line. Absolute paths must be provided if -nocopy is '
        'supplied.')
    group.add_argument(
        '-dest', metavar='<output>.1dbin', required=True,
        help='Destination of merged 1D data file. Must include an absolute '
        'path if -nocopy is supplied.')
def add_run_args(parser):
    """
    Adds the TASK_RUN command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    req_group = parser.add_argument_group('Required Arguments')
    req_group.add_argument(
        '-query', metavar='<qfile>', required=True,
        help='File containing one or more query structures. Supported formats '
        'are SMILES, SMILES-CSV, Maestro and SD. If a single query is '
        'supplied, hits are returned in <jobname>-hits.csv.gz, where <jobname> '
        'is derived from <qfile>. If there are multiple queries, hits are '
        'returned in <jobname>_1-hits.csv.gz,...,<jobname>_<n>-hits.csv.gz, '
        'where <n> is number of queries.')
    req_group.add_argument(
        '-screen', metavar='<source>', required=True,
        help='1D data file (.1dbin) to screen, or list file (.list) '
        'containing the names of the 1D data files to screen, with one name '
        'per line. Use of multiple smaller data files rather than a single '
        'large data file is strongly recommended for the fastest multi-CPU '
        'screens. Absolute paths must be provided if -nocopy is supplied.')
    out_group = parser.add_argument_group('Output Options')
    # -nosort and -keep are mutually exclusive: a hit cap only makes sense
    # when hits are being sorted.
    sort_group = out_group.add_mutually_exclusive_group(required=False)
    sort_group.add_argument(
        '-nosort', action='store_true',
        help='Output hits in the order they are screened. The default is to '
        'sort hits by decreasing similarity to the query.')
    sort_group.add_argument(
        '-keep', type=int, metavar='<maxhits>',
        default=task_utils.ONED_MAX_HITS,
        choices=[phase_utils.RestrictedRange(1, None, True)],
        help='Cap sorted hits at <maxhits> (default: %(default)d).')
    out_group.add_argument(
        '-limit', type=int, metavar='<maxrows>',
        default=task_utils.ONED_MAX_ROWS,
        choices=[phase_utils.RestrictedRange(1, None, True)],
        help='Limit on the number of rows held in memory when sorting '
        '(default: %(default)d).')
    out_group.add_argument(
        '-filter', type=float, metavar='<min>', default=0.0,
        choices=[phase_utils.RestrictedRange(0.0, 1.0, True, False)],
        help='Filter out molecules whose similarities fall below <min>. The '
        'default is to apply no filter.')
    # Guard against user unintentionally supplying a query file with a large
    # number of structures since a separate file of hits is produced for
    # each query. Will issue a warning telling user how to override if limit
    # is exceeded.
    parser.add_argument('-maxq', type=int,
                        default=task_utils.ONED_MAX_QUERIES,
                        help=argparse.SUPPRESS)
    # Running as a subjob:
    parser.add_argument('-subjob', help=argparse.SUPPRESS)
def add_split_args(parser):
    """
    Adds the TASK_SPLIT command line arguments to the supplied parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    group = parser.add_argument_group('Required Arguments')
    group.add_argument(
        '-source', metavar='<input>.1dbin', required=True,
        help='1D data file to be split. Must include an absolute path if '
        '-nocopy is supplied.')
    group.add_argument(
        '-dest', metavar='<prefix>', required=True,
        help='Prefix of output files to create. File names will be '
        '<prefix>_1.1dbin, <prefix>_2.1dbin, etc. Must include an absolute '
        'path if -nocopy is supplied.')
    # Splitting into fewer than 2 files would be a no-op, hence the range.
    group.add_argument(
        '-number', type=int, metavar='<n>', required=True,
        choices=[phase_utils.RestrictedRange(2, None, True)],
        help='Number of output files to create.')
def get_parser():
    """
    Creates argparse.ArgumentParser with supported command line options.

    :return: Argument parser object
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        prog=task_utils.ONED_SCREEN,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(
        dest='task', metavar='<task>',
        help='The task to perform. For detailed help on a specific task, use '
        f'{task_utils.ONED_SCREEN} <task> -h. Multiple CPUs may be utilized '
        f'for tasks "{task_utils.TASK_CREATE}" and "{task_utils.TASK_RUN}".')
    # One subparser per task, each populated by its dedicated add_*_args
    # helper. Registration order determines the order in the help text.
    add_create_args(
        subparsers.add_parser(
            task_utils.TASK_CREATE,
            help='Create a 1D data file (.1dbin) from a set of structures.'))
    add_run_args(
        subparsers.add_parser(
            task_utils.TASK_RUN,
            help='Run a screen against one or more 1D data files.'))
    add_merge_args(
        subparsers.add_parser(
            task_utils.TASK_MERGE,
            help='Merge multiple 1D data files into a single file.'))
    add_split_args(
        subparsers.add_parser(
            task_utils.TASK_SPLIT,
            help='Split a 1D data file into multiple files.'))
    add_describe_args(
        subparsers.add_parser(
            task_utils.TASK_DESCRIBE,
            help='Describe the contents of a 1D data file.'))
    add_export_args(
        subparsers.add_parser(
            task_utils.TASK_EXPORT,
            help='Export rows of a 1D data file to a compressed CSV file.'))
    parser.add_argument(
        '-nocopy', action='store_true',
        help='Do not copy source or destination 1D data files between the '
        'local host and job host. 1D data file names must include an absolute '
        'path that exists on the job host and which is accessible to all '
        'compute nodes of that host. Note that this argument must precede all '
        'others, e.g., oned_screen -nocopy create -source ...')
    # Used to indicate that arguments have been set at startup time.
    parser.add_argument('-startup', action='store_true',
                        help=argparse.SUPPRESS)
    cmdline.add_jobcontrol_options(
        parser, options=[cmdline.HOST, cmdline.JOBNAME, cmdline.TMPDIR])
    return parser
def validate_absolute_source_path(source):
    """
    Raises a ValidationError if any 1D data file in source doesn't contain
    an absolute path. source is assumed to be a 1D data file or a .list
    file containing the names of 1D data files.

    :param source: 1D data file or .list file
    :type source: str

    :raise: ValidationError if validation fails
    """
    for filename in task_utils.get_oned_data_file_names(source):
        if not os.path.isabs(filename):
            # Bug fix: the original f-string contained no placeholder, so
            # the offending file name was never reported to the user.
            raise ValidationError(f'File path "{filename}" is not absolute')
def validate_oned_data_file(oned_data_file, must_exist=False):
    """
    Raises a ValidationError if oned_data_file has the wrong extension or
    if must_exist is True and oned_data_file doesn't exist.

    :param oned_data_file: The name of the 1D data file
    :type oned_data_file: str

    :param must_exist: Whether the file must exist
    :type must_exist: bool

    :raise: ValidationError if validation fails
    """
    if not task_utils.is_oned_data_file(oned_data_file):
        raise ValidationError(
            f'Illegal 1D data file extension: "{oned_data_file}"')
    if must_exist and not os.path.isfile(oned_data_file):
        raise ValidationError(f'1D data file "{oned_data_file}" not found')
def validate_oned_data_file_attributes(oned_data_files):
    """
    Raises a ValidationError unless all of the supplied 1D data files
    contain the same attributes.

    :param oned_data_files: The names of the 1D data files
    :type oned_data_files: list(str)

    :raise: ValidationError if validation fails
    """
    # Nothing to compare unless there are at least two files.
    if len(oned_data_files) < 2:
        return
    first = oned_data_files[0]
    ref_attr = task_utils.get_oned_data_file_attributes(first)
    # Feature definitions (attr[2]) only matter for the pharmacophore
    # treatment.
    pharm = ref_attr[1] == phase.ONED_TREATMENT_PHARM
    for other in oned_data_files[1:]:
        filenames = f'{first} and {other}'
        attr = task_utils.get_oned_data_file_attributes(other)
        if attr[0] != ref_attr[0]:
            raise ValidationError(f'Version numbers differ for {filenames}')
        if attr[1] != ref_attr[1]:
            raise ValidationError(
                f'Structure treatments differ for {filenames}')
        if pharm and attr[2] != ref_attr[2]:
            raise ValidationError(
                f'Feature definitions differ for {filenames}')
def validate_oned_data_file_source(source, must_exist=False):
    """
    Raises a ValidationError if the provided source is not a 1D data file
    or a list file containing the names of 1D data files. If must_exist is
    True, a RuntimeError is raised if any 1D data file doesn't exist.

    :param source: The name of the screening source file
    :type source: str

    :param must_exist: Whether 1D data files must exist
    :type must_exist: bool

    :raise: ValidationError if validation fails
    """
    source_format = phase.get_phase_file_format(source)
    if source_format not in (phase.PhpFileFormat_PHP_FORMAT_1DBIN,
                             phase.PhpFileFormat_PHP_FORMAT_LIST):
        raise ValidationError(f'Illegal screening source format: "{source}"')
    # A list file should exist irrespective of must_exist.
    if (source_format == phase.PhpFileFormat_PHP_FORMAT_LIST and
            not os.path.isfile(source)):
        raise ValidationError(f'Source file "{source}" not found')
    for filename in task_utils.get_oned_data_file_names(source):
        validate_oned_data_file(filename, must_exist)
def validate_property_names(property_names):
    """
    Raises a ValidationError if any members of the supplied list are not
    m2io-style properties.

    :param property_names: The property names to check
    :type property_names: list(str)

    :raise: ValidationError if validation fails
    """
    # A falsy value (None or an empty list) means there is nothing to check.
    if not property_names:
        return
    for prop in property_names:
        try:
            structure.PropertyName(prop)
        except Exception as err:
            # Chain the original exception so the underlying parse failure
            # is preserved in the traceback.
            raise ValidationError(str(err)) from err
def validate_structure_file(structure_file):
    """
    Raises a ValidationError if structure_file is not one of the supported
    types or if it doesn't exist.

    :param structure_file: The name of the structure file
    :type structure_file: str

    :raise: ValidationError if validation fails
    """
    fmt = fileutils.get_structure_file_format(structure_file)
    if fmt not in task_utils.LEGAL_STRUCTURE_FILE_FORMATS:
        legal = ', '.join(task_utils.LEGAL_STRUCTURE_FILE_TYPES)
        raise ValidationError(
            f'Illegal structure file format: "{structure_file}". '
            f'Must be one of the following: {legal}.')
    if not os.path.isfile(structure_file):
        raise ValidationError(f'Structure file "{structure_file}" not found')
def validate_create_args(args):
    """
    Raises a ValidationError if arguments for TASK_CREATE are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    validate_oned_data_file(args.dest, False)
    # With -nocopy the destination stays on the job host, so it must be an
    # absolute path.
    if args.nocopy and args.startup and not os.path.isabs(args.dest):
        raise ValidationError(f'File path "{args.dest}" is not absolute')
    validate_structure_file(args.source)
    if args.fd and not os.path.isfile(args.fd):
        raise ValidationError(f'Feature definition file "{args.fd}" not found')
    if args.props:
        # -props is a comma-separated list; csv handles quoted commas.
        validate_property_names(next(csv.reader([args.props])))
def validate_describe_args(args):
    """
    Raises a ValidationError if arguments for TASK_DESCRIBE are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    # The file need not exist locally only when -nocopy is in effect at
    # startup time (it lives on the job host in that case).
    must_exist = not (args.nocopy and args.startup)
    validate_oned_data_file(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
    if must_exist and args.fd:
        _, treatment, _ = task_utils.get_oned_data_file_attributes(args.source)
        if treatment != phase.ONED_TREATMENT_PHARM:
            raise ValidationError(
                f'{args.source} does not contain feature definitions')
def validate_export_args(args):
    """
    Raises a ValidationError if arguments for TASK_EXPORT are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    # The file need not exist locally only when -nocopy is in effect at
    # startup time.
    must_exist = not args.nocopy or not args.startup
    validate_oned_data_file(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
    if args.rows:
        try:
            # Parsing the row ranges is the validation; result discarded.
            task_utils.get_rows_to_export(args.rows)
        except ValueError as err:
            # Chain the original exception so the underlying parse failure
            # is preserved in the traceback.
            raise ValidationError(str(err)) from err
    if args.match and not os.path.isfile(args.match):
        msg = f'File of property values "{args.match}" not found'
        raise ValidationError(msg)
def validate_merge_args(args):
    """
    Raises a ValidationError if arguments for TASK_MERGE are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    if (phase.get_phase_file_format(args.source) !=
            phase.PhpFileFormat_PHP_FORMAT_LIST):
        raise ValidationError('Source must be a .list file')
    # The data files need not exist locally only when -nocopy is in effect
    # at startup time.
    must_exist = not (args.nocopy and args.startup)
    validate_oned_data_file_source(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
        if not os.path.isabs(args.dest):
            raise ValidationError(f'File path "{args.dest}" is not absolute')
    if not args.startup:
        # Prefer local copies of the data files unless -nocopy was given.
        oned_data_files = task_utils.get_oned_data_file_names(
            args.source, not args.nocopy)
        validate_oned_data_file_attributes(oned_data_files)
    validate_oned_data_file(args.dest, False)
def validate_run_args(args):
    """
    Raises a ValidationError if arguments for TASK_RUN are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    validate_structure_file(args.query)
    num_queries = structure.count_structures(args.query)
    if num_queries > args.maxq:
        raise ValidationError(
            f'Number of queries exceeds {args.maxq}. You may override this '
            'limit with -maxq <n>, but note that a separate file of hits is '
            'produced for each query.')
    # The data files need not exist locally only when -nocopy is in effect
    # at startup time.
    must_exist = not (args.nocopy and args.startup)
    validate_oned_data_file_source(args.screen, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.screen)
    if not args.startup and not args.subjob:
        # Prefer local copies of the data files unless -nocopy was given.
        oned_data_files = task_utils.get_oned_data_file_names(
            args.screen, not args.nocopy)
        validate_oned_data_file_attributes(oned_data_files)
def validate_split_args(args):
    """
    Raises a ValidationError if arguments for TASK_SPLIT are invalid.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :raise: ValidationError if validation fails
    """
    # The file need not exist locally only when -nocopy is in effect at
    # startup time.
    must_exist = not (args.nocopy and args.startup)
    validate_oned_data_file(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
        if not os.path.isabs(args.dest):
            msg = f'Output file prefix "{args.dest}" is not absolute'
            raise ValidationError(msg)
# Maps each supported task name to the function that validates its
# command line arguments (used by validate_args below).
VALIDATE_TASK_DICT = {
    task_utils.TASK_CREATE: validate_create_args,
    task_utils.TASK_DESCRIBE: validate_describe_args,
    task_utils.TASK_EXPORT: validate_export_args,
    task_utils.TASK_MERGE: validate_merge_args,
    task_utils.TASK_RUN: validate_run_args,
    task_utils.TASK_SPLIT: validate_split_args
}
def validate_args(args):
    """
    Checks the validity of command line arguments.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace

    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    # args.task is None when no subcommand was supplied (argparse does not
    # require one by default), so look the task up gracefully rather than
    # letting an uncaught KeyError escape.
    validate = VALIDATE_TASK_DICT.get(args.task)
    if validate is None:
        return False, f'Unrecognized task: "{args.task}"'
    try:
        validate(args)
    except ValidationError as err:
        return False, str(err)
    return True, ''