"""
Provides argument parsing and validation for the 1D similarity driver.
Copyright Schrodinger LLC, All Rights Reserved.
"""
import argparse
import csv
import os
import schrodinger.application.phase.packages.oned_task_utils as task_utils
from schrodinger import structure
from schrodinger.application.phase.packages import phase_utils
from schrodinger.application.phase.packages.phase_utils import ValidationError
from schrodinger.infra import phase
from schrodinger.utils import cmdline
from schrodinger.utils import fileutils
def add_create_args(parser):
    """
    Registers the command line options of TASK_CREATE on the given parser.

    The options are placed in a 'Required Arguments' group and a
    'Creation Options' group; a hidden -subjob option is added directly
    to the parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    source_help = (
        'File of structures from which to create the 1D data file. '
        'Supported formats are SMILES, SMILES-CSV, Maestro and SD. Creation '
        'of a series of smaller 1D data files rather than a single large file '
        'is strongly recommended for the fastest multi-CPU screens.')
    dest_help = (
        'Name of 1D data file to create. Must include an absolute path if '
        '-nocopy is supplied.')
    treatment_help = (
        'Treat each structure as a set of Phase pharmacophore features, '
        'as a set of atoms distinguished by elemental type, or as a set of '
        'atoms distinguished by Macromodel type (default: %(default)s).')
    fd_help = (
        'Use pharmacophore feature definitions in the supplied file, '
        'rather than the default definitions in the software distribution. '
        f'Applicable only when the treatment is {task_utils.ONED_PHARM}.')
    props_help = (
        'Comma-separated list of properties in the source file that '
        'should be stored in the 1D data file. Property names must be of the '
        'form <t>_<source>_<name>, where <t> is the property type ("b", "i", '
        '"r", "s"), <source> indicates the origin of the property ("phase", '
        '"user", etc.), and <name> is a descriptive name ("Fitness", "pIC50", '
        'etc.). By default, the 1D data file will contain the SMILES, name '
        'and a 1D encoding of each structure.')

    mandatory = parser.add_argument_group('Required Arguments')
    mandatory.add_argument('-source',
                           metavar='<input>',
                           required=True,
                           help=source_help)
    mandatory.add_argument('-dest',
                           metavar='<output>.1dbin',
                           required=True,
                           help=dest_help)

    creation = parser.add_argument_group('Creation Options')
    creation.add_argument('-treatment',
                          choices=task_utils.ONED_TREATMENTS,
                          default=task_utils.ONED_PHARM,
                          help=treatment_help)
    creation.add_argument('-fd', metavar='<fd_file>', help=fd_help)
    creation.add_argument('-props', metavar='<list>', help=props_help)

    # Hidden option that is set when running as a subjob.
    parser.add_argument('-subjob', help=argparse.SUPPRESS)
def add_describe_args(parser):
    """
    Registers the command line options of TASK_DESCRIBE on the given parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    source_help = (
        '1D data file to describe. Must include an absolute path if '
        '-nocopy is supplied.')
    stats_help = (
        'Report statistics of any numeric properties stored in the 1D '
        'data file.')
    fd_help = (
        'Save feature definitions to a file. Valid only when the 1D data '
        f'file was created with treatment {task_utils.ONED_PHARM}.')

    mandatory = parser.add_argument_group('Required Arguments')
    mandatory.add_argument('-source',
                           metavar='<input>.1dbin',
                           required=True,
                           help=source_help)

    description = parser.add_argument_group('Description Options')
    description.add_argument('-stats', action='store_true', help=stats_help)
    description.add_argument('-fd', metavar='<fd_file>', help=fd_help)
def add_export_args(parser):
    """
    Adds arguments for TASK_EXPORT.

    The -rows and -match options are mutually exclusive and both optional.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    required_args = parser.add_argument_group('Required Arguments')
    required_args.add_argument(
        '-source',
        metavar='<input>.1dbin',
        required=True,
        help='1D data file from which to export. Must include an absolute path '
        'if -nocopy is supplied. Exported rows are written to '
        '<jobname>.csv.gz.')

    export_args = parser.add_argument_group('Export Options')
    rows_or_match = export_args.add_mutually_exclusive_group(required=False)
    rows_or_match.add_argument(
        '-rows',
        metavar='<ranges>',
        # Help text previously read "specfied"; corrected spelling.
        help='Export a subset of rows specified as comma-separated ranges. For '
        'example, "-rows 1:100,200:300" means rows 1 through 100 plus rows 200 '
        'through 300.')
    rows_or_match.add_argument(
        '-match',
        metavar='<filename>',
        help='Match a list of property values. The first line in the supplied '
        'file must be the name of a property in the 1D data file (e.g., '
        '"s_m_title" or "s_sd_Vendor_ID"), and each subsequent line should '
        'contain a value for that property. All rows that match one of the '
        'supplied property values will be exported. Use with floating point '
        'properties is not recommended.')
def add_merge_args(parser):
    """
    Registers the command line options of TASK_MERGE on the given parser.

    Both options (-source and -dest) are required.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    source_help = (
        'Text file containing the names of the 1D data files to merge, '
        'with one name per line. Absolute paths must be provided if -nocopy is '
        'supplied.')
    dest_help = (
        'Destination of merged 1D data file. Must include an absolute '
        'path if -nocopy is supplied.')

    mandatory = parser.add_argument_group('Required Arguments')
    mandatory.add_argument('-source',
                           metavar='<input>.list',
                           required=True,
                           help=source_help)
    mandatory.add_argument('-dest',
                           metavar='<output>.1dbin',
                           required=True,
                           help=dest_help)
def add_run_args(parser):
    """
    Registers the command line options of TASK_RUN on the given parser.

    Besides the visible 'Required Arguments' and 'Output Options' groups,
    two hidden options (-maxq and -subjob) are added directly to the parser.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    query_help = (
        'File containing one or more query structures. Supported formats '
        'are SMILES, SMILES-CSV, Maestro and SD. If a single query is '
        'supplied, hits are returned in <jobname>-hits.csv.gz, where <jobname> '
        'is derived from <qfile>. If there are multiple queries, hits are '
        'returned in <jobname>_1-hits.csv.gz,...,<jobname>_<n>-hits.csv.gz, '
        'where <n> is number of queries.')
    screen_help = (
        '1D data file (.1dbin) to screen, or list file (.list) '
        'containing the names of the 1D data files to screen, with one name '
        'per line. Use of multiple smaller data files rather than a single '
        'large data file is strongly recommended for the fastest multi-CPU '
        'screens. Absolute paths must be provided if -nocopy is supplied.')
    nosort_help = (
        'Output hits in the order they are screened. The default is to '
        'sort hits by decreasing similarity to the query.')
    keep_help = 'Cap sorted hits at <maxhits> (default: %(default)d).'
    limit_help = (
        'Limit on the number of rows held in memory when sorting '
        '(default: %(default)d).')
    filter_help = (
        'Filter out molecules whose similarities fall below <min>. The '
        'default is to apply no filter.')

    mandatory = parser.add_argument_group('Required Arguments')
    mandatory.add_argument('-query',
                           metavar='<qfile>',
                           required=True,
                           help=query_help)
    mandatory.add_argument('-screen',
                           metavar='<source>',
                           required=True,
                           help=screen_help)

    output = parser.add_argument_group('Output Options')
    # -nosort and -keep cannot be combined.
    sort_choice = output.add_mutually_exclusive_group(required=False)
    sort_choice.add_argument('-nosort', action='store_true', help=nosort_help)
    sort_choice.add_argument(
        '-keep',
        type=int,
        metavar='<maxhits>',
        default=task_utils.ONED_MAX_HITS,
        choices=[phase_utils.RestrictedRange(1, None, True)],
        help=keep_help)
    output.add_argument(
        '-limit',
        type=int,
        metavar='<maxrows>',
        default=task_utils.ONED_MAX_ROWS,
        choices=[phase_utils.RestrictedRange(1, None, True)],
        help=limit_help)
    output.add_argument(
        '-filter',
        type=float,
        metavar='<min>',
        default=0.0,
        choices=[phase_utils.RestrictedRange(0.0, 1.0, True, False)],
        help=filter_help)

    # Hidden safety cap on the number of queries: a separate file of hits is
    # produced for each query, so an accidentally large query file would
    # generate many outputs. A warning tells the user how to override the
    # limit if it is exceeded.
    parser.add_argument('-maxq',
                        type=int,
                        default=task_utils.ONED_MAX_QUERIES,
                        help=argparse.SUPPRESS)

    # Hidden option that is set when running as a subjob.
    parser.add_argument('-subjob', help=argparse.SUPPRESS)
def add_split_args(parser):
    """
    Registers the command line options of TASK_SPLIT on the given parser.

    All three options (-source, -dest and -number) are required.

    :param parser: Argument parser object.
    :type parser: argparse.ArgumentParser
    """
    source_help = (
        '1D data file to be split. Must include an absolute path if '
        '-nocopy is supplied.')
    dest_help = (
        'Prefix of output files to create. File names will be '
        '<prefix>_1.1dbin, <prefix>_2.1dbin, etc. Must include an absolute '
        'path if -nocopy is supplied.')

    mandatory = parser.add_argument_group('Required Arguments')
    mandatory.add_argument('-source',
                           metavar='<input>.1dbin',
                           required=True,
                           help=source_help)
    mandatory.add_argument('-dest',
                           metavar='<prefix>',
                           required=True,
                           help=dest_help)
    mandatory.add_argument(
        '-number',
        type=int,
        metavar='<n>',
        required=True,
        choices=[phase_utils.RestrictedRange(2, None, True)],
        help='Number of output files to create.')
def get_parser():
    """
    Builds the top-level argument parser together with one subparser per
    supported task.

    :return: Argument parser object
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        prog=task_utils.ONED_SCREEN,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    task_help = (
        'The task to perform. For detailed help on a specific task, use '
        f'{task_utils.ONED_SCREEN} <task> -h. Multiple CPUs may be utilized '
        f'for tasks "{task_utils.TASK_CREATE}" and "{task_utils.TASK_RUN}".')
    subparsers = parser.add_subparsers(dest='task',
                                       metavar='<task>',
                                       help=task_help)

    # (task name, one-line help, function that adds the task's arguments),
    # listed in the order in which the subparsers are registered.
    task_table = (
        (task_utils.TASK_CREATE,
         'Create a 1D data file (.1dbin) from a set of structures.',
         add_create_args),
        (task_utils.TASK_RUN,
         'Run a screen against one or more 1D data files.',
         add_run_args),
        (task_utils.TASK_MERGE,
         'Merge multiple 1D data files into a single file.',
         add_merge_args),
        (task_utils.TASK_SPLIT,
         'Split a 1D data file into multiple files.',
         add_split_args),
        (task_utils.TASK_DESCRIBE,
         'Describe the contents of a 1D data file.',
         add_describe_args),
        (task_utils.TASK_EXPORT,
         'Export rows of a 1D data file to a compressed CSV file.',
         add_export_args),
    )
    for task_name, summary, add_task_args in task_table:
        add_task_args(subparsers.add_parser(task_name, help=summary))

    nocopy_help = (
        'Do not copy source or destination 1D data files between the '
        'local host and job host. 1D data file names must include an absolute '
        'path that exists on the job host and which is accessible to all '
        'compute nodes of that host. Note that this argument must precede all '
        'others, e.g., oned_screen -nocopy create -source ...')
    parser.add_argument('-nocopy', action='store_true', help=nocopy_help)

    # Hidden flag indicating that arguments have been set at startup time.
    parser.add_argument('-startup',
                        action='store_true',
                        help=argparse.SUPPRESS)

    cmdline.add_jobcontrol_options(
        parser, options=[cmdline.HOST, cmdline.JOBNAME, cmdline.TMPDIR])
    return parser
def validate_absolute_source_path(source):
    """
    Checks that every 1D data file named by source has an absolute path.

    The file names are obtained from task_utils.get_oned_data_file_names,
    so source is assumed to be a 1D data file or a .list file containing
    the names of 1D data files.

    :param source: 1D data file or .list file
    :type source: str
    :raise: ValidationError on the first file name that is not absolute
    """
    for path in task_utils.get_oned_data_file_names(source):
        if os.path.isabs(path):
            continue
        raise ValidationError(f'File path "{path}" is not absolute')
def validate_oned_data_file(oned_data_file, must_exist=False):
    """
    Checks the extension of a 1D data file and, optionally, its existence.

    :param oned_data_file: The name of the 1D data file
    :type oned_data_file: str
    :param must_exist: Whether the file must exist
    :type must_exist: bool
    :raise: ValidationError if the extension is wrong, or if must_exist is
        True and the file is not found
    """
    if not task_utils.is_oned_data_file(oned_data_file):
        raise ValidationError(
            f'Illegal 1D data file extension: "{oned_data_file}"')
    if not must_exist:
        return
    if not os.path.isfile(oned_data_file):
        raise ValidationError(f'1D data file "{oned_data_file}" not found')
def validate_oned_data_file_attributes(oned_data_files):
    """
    Checks that all supplied 1D data files share the same attributes.

    Every file after the first is compared against the first one: version
    number, structure treatment and, when the first file uses the
    pharmacophore treatment, feature definitions. Nothing is checked when
    fewer than two files are supplied.

    :param oned_data_files: The names of the 1D data files
    :type oned_data_files: list(str)
    :raise: ValidationError on the first mismatch found
    """
    if len(oned_data_files) < 2:
        return

    reference_file = oned_data_files[0]
    ref_attr = task_utils.get_oned_data_file_attributes(reference_file)
    # Feature definitions are compared only for the pharmacophore treatment.
    compare_fd = ref_attr[1] == phase.ONED_TREATMENT_PHARM

    for other_file in oned_data_files[1:]:
        filenames = f'{reference_file} and {other_file}'
        attr = task_utils.get_oned_data_file_attributes(other_file)
        if attr[0] != ref_attr[0]:
            raise ValidationError(f'Version numbers differ for {filenames}')
        if attr[1] != ref_attr[1]:
            raise ValidationError(
                f'Structure treatments differ for {filenames}')
        if compare_fd and attr[2] != ref_attr[2]:
            raise ValidationError(
                f'Feature definitions differ for {filenames}')
def validate_oned_data_file_source(source, must_exist=False):
    """
    Checks that source is a 1D data file or a list file naming 1D data files.

    A list file must itself exist regardless of must_exist. Each file name
    obtained from the source is then checked with validate_oned_data_file,
    which raises a ValidationError if must_exist is True and the file is
    not found.

    :param source: The name of the screening source file
    :type source: str
    :param must_exist: Whether 1D data files must exist
    :type must_exist: bool
    :raise: ValidationError if validation fails
    """
    list_format = phase.PhpFileFormat_PHP_FORMAT_LIST
    source_format = phase.get_phase_file_format(source)
    if source_format not in (phase.PhpFileFormat_PHP_FORMAT_1DBIN,
                             list_format):
        raise ValidationError(f'Illegal screening source format: "{source}"')

    # A list file should exist irrespective of must_exist.
    if source_format == list_format and not os.path.isfile(source):
        raise ValidationError(f'Source file "{source}" not found')

    for data_file in task_utils.get_oned_data_file_names(source):
        validate_oned_data_file(data_file, must_exist)
def validate_property_names(property_names):
    """
    Raises a ValidationError if any members of the supplied list are not
    m2io-style properties.

    Each name is checked by constructing a structure.PropertyName from it;
    any exception from that constructor is converted to a ValidationError
    carrying the same message, with the original exception chained as its
    cause. An empty list or None passes without any check.

    :param property_names: The property names to check
    :type property_names: list(str)
    :raise: ValidationError if validation fails
    """
    if not property_names:
        return
    for prop in property_names:
        try:
            structure.PropertyName(prop)
        except Exception as err:
            # Chain the cause so the original traceback is not lost.
            raise ValidationError(str(err)) from err
def validate_structure_file(structure_file):
    """
    Checks that structure_file has a supported format and exists.

    The format is checked first, so an unsupported file is reported as such
    even when it is also missing.

    :param structure_file: The name of the structure file
    :type structure_file: str
    :raise: ValidationError if validation fails
    """
    detected_format = fileutils.get_structure_file_format(structure_file)
    if detected_format not in task_utils.LEGAL_STRUCTURE_FILE_FORMATS:
        legal_types = ', '.join(task_utils.LEGAL_STRUCTURE_FILE_TYPES)
        raise ValidationError(
            f'Illegal structure file format: "{structure_file}". '
            f'Must be one of the following: {legal_types}.')
    if not os.path.isfile(structure_file):
        raise ValidationError(f'Structure file "{structure_file}" not found')
def validate_create_args(args):
    """
    Validates the command line arguments of TASK_CREATE.

    Checks, in order: the destination file extension, that the destination
    is absolute when both -nocopy and -startup are set, the source
    structure file, the optional feature definition file and the optional
    property names.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace
    :raise: ValidationError if validation fails
    """
    validate_oned_data_file(args.dest, False)
    if args.nocopy and args.startup and not os.path.isabs(args.dest):
        raise ValidationError(f'File path "{args.dest}" is not absolute')

    validate_structure_file(args.source)

    if args.fd and not os.path.isfile(args.fd):
        raise ValidationError(f'Feature definition file "{args.fd}" not found')

    if args.props:
        # The csv reader splits the comma-separated list into names.
        requested_props = next(csv.reader([args.props]))
        validate_property_names(requested_props)
def validate_describe_args(args):
    """
    Validates the command line arguments of TASK_DESCRIBE.

    The source file is required to exist unless both -nocopy and -startup
    are set, in which case its path must be absolute instead. When the file
    must exist and -fd was given, the file has to have been created with
    the pharmacophore treatment.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace
    :raise: ValidationError if validation fails
    """
    nocopy_at_startup = args.nocopy and args.startup
    must_exist = not nocopy_at_startup

    validate_oned_data_file(args.source, must_exist)
    if nocopy_at_startup:
        validate_absolute_source_path(args.source)

    if must_exist and args.fd:
        _, treatment, _ = task_utils.get_oned_data_file_attributes(args.source)
        if treatment != phase.ONED_TREATMENT_PHARM:
            raise ValidationError(
                f'{args.source} does not contain feature definitions')
def validate_export_args(args):
    """
    Raises a ValidationError if arguments for TASK_EXPORT are invalid.

    The source file is required to exist unless both -nocopy and -startup
    are set, in which case its path must be absolute instead. A -rows
    specification is checked by parsing it with
    task_utils.get_rows_to_export, and a -match file must exist.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace
    :raise: ValidationError if validation fails
    """
    must_exist = not args.nocopy or not args.startup
    validate_oned_data_file(args.source, must_exist)
    if args.nocopy and args.startup:
        validate_absolute_source_path(args.source)
    if args.rows:
        try:
            task_utils.get_rows_to_export(args.rows)
        except ValueError as err:
            # Chain the cause so the original traceback is not lost.
            raise ValidationError(str(err)) from err
    if args.match and not os.path.isfile(args.match):
        msg = f'File of property values "{args.match}" not found'
        raise ValidationError(msg)
def validate_merge_args(args):
    """
    Validates the command line arguments of TASK_MERGE.

    The source must be a .list file. The 1D data files it names are
    required to exist unless both -nocopy and -startup are set, in which
    case source and destination paths must be absolute instead. When
    -startup is not set, the attributes of the listed files are also
    compared. The destination extension is checked last.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace
    :raise: ValidationError if validation fails
    """
    if (phase.get_phase_file_format(args.source) !=
            phase.PhpFileFormat_PHP_FORMAT_LIST):
        raise ValidationError('Source must be a .list file')

    nocopy_at_startup = args.nocopy and args.startup
    validate_oned_data_file_source(args.source, not nocopy_at_startup)

    if nocopy_at_startup:
        validate_absolute_source_path(args.source)
        if not os.path.isabs(args.dest):
            raise ValidationError(f'File path "{args.dest}" is not absolute')

    if not args.startup:
        # Second argument is prefer_cwd: enabled unless -nocopy was given.
        listed_files = task_utils.get_oned_data_file_names(
            args.source, not args.nocopy)
        validate_oned_data_file_attributes(listed_files)

    validate_oned_data_file(args.dest, False)
def validate_run_args(args):
    """
    Validates the command line arguments of TASK_RUN.

    Checks the query file and the number of queries it holds against
    -maxq, then the screening source. The 1D data files are required to
    exist unless both -nocopy and -startup are set, in which case their
    paths must be absolute instead. When neither -startup nor -subjob is
    set, the attributes of the 1D data files are also compared.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace
    :raise: ValidationError if validation fails
    """
    validate_structure_file(args.query)
    if structure.count_structures(args.query) > args.maxq:
        raise ValidationError(
            f'Number of queries exceeds {args.maxq}. You may override this '
            'limit with -maxq <n>, but note that a separate file of hits is '
            'produced for each query.')

    nocopy_at_startup = args.nocopy and args.startup
    validate_oned_data_file_source(args.screen, not nocopy_at_startup)
    if nocopy_at_startup:
        validate_absolute_source_path(args.screen)

    if not (args.startup or args.subjob):
        # Second argument is prefer_cwd: enabled unless -nocopy was given.
        screened_files = task_utils.get_oned_data_file_names(
            args.screen, not args.nocopy)
        validate_oned_data_file_attributes(screened_files)
def validate_split_args(args):
    """
    Validates the command line arguments of TASK_SPLIT.

    The source file is required to exist unless both -nocopy and -startup
    are set, in which case the source path and the output prefix must be
    absolute instead.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace
    :raise: ValidationError if validation fails
    """
    nocopy_at_startup = args.nocopy and args.startup
    validate_oned_data_file(args.source, not nocopy_at_startup)
    if not nocopy_at_startup:
        return

    validate_absolute_source_path(args.source)
    if not os.path.isabs(args.dest):
        raise ValidationError(
            f'Output file prefix "{args.dest}" is not absolute')
# Dispatch table mapping each task name to the function that validates its
# command line arguments; consulted by validate_args.
VALIDATE_TASK_DICT = {
    task_utils.TASK_CREATE: validate_create_args,
    task_utils.TASK_DESCRIBE: validate_describe_args,
    task_utils.TASK_EXPORT: validate_export_args,
    task_utils.TASK_MERGE: validate_merge_args,
    task_utils.TASK_RUN: validate_run_args,
    task_utils.TASK_SPLIT: validate_split_args,
}
def validate_args(args):
    """
    Checks the validity of command line arguments.

    The validator for args.task is looked up in VALIDATE_TASK_DICT. A
    missing task (args.task is None, which argparse produces when no
    subcommand is given to an optional subparser) or an unrecognized task
    is reported as a validation failure rather than raising KeyError.

    :param args: argparser.Namespace with command line arguments
    :type args: argparser.Namespace
    :return: tuple of validity and non-empty error message if not valid
    :rtype: bool, str
    """
    validator = VALIDATE_TASK_DICT.get(args.task)
    if validator is None:
        if args.task is None:
            return False, 'No task specified'
        return False, f'Unrecognized task "{args.task}"'
    try:
        validator(args)
    except ValidationError as err:
        return False, str(err)
    return True, ''