Source code for schrodinger.test.stu.joberrors
import os
import pathlib
import zipfile
import sys
from schrodinger.infra.mmjob import mmjob_is_job_server_job
from schrodinger.job import jobcontrol
from schrodinger.job import queue
from schrodinger.job import remote_command
from schrodinger.utils import mmutil
from schrodinger.utils import subprocess
from . import common
# Shared logger taken from the sibling ``common`` module; used for the
# warnings emitted by the functions below.
logger = common.logger
# The literal value is \\?\ (backslash, backslash, question mark, backslash).
# unzip_postmortem prepends it to the extraction directory when
# sys.platform == "win32"; per the name, this is the Windows long-path
# (extended-length) prefix.
WINDOWS_LONG_PATH_PREFIX = "\\\\?\\"
def run_postmortem(job, product_name):
    """
    Run the postmortem tool for a job and gather its diagnostics.

    The combined stdout/stderr of the postmortem command is captured in
    ``postmortem.log`` inside the job's command directory. When the job has
    a job id, the archive named in that log is unpacked afterwards. Queue
    statistics collection is attempted whenever the postmortem command was
    run.

    :param job: any queue.BaseJob. If it has a job id, postmortem is run for
        that particular job. Otherwise postmortem is run for the job
        database only, unless the JOB_SERVER feature flag is enabled, in
        which case nothing is run.
    :param product_name: not used by this function.
    """
    job.infoStatus('running postmortem')

    _, jobid, _ = job.getStatusStrings()
    # A "[none]" placeholder in the status strings is treated as "no job id".
    if jobid and jobid.strip() == "[none]":
        jobid = None

    job_directory = job.getCommandDir()
    if not job_directory:
        # Without a command directory there is nowhere to run or log.
        return

    if jobid:
        postmortem_args = [jobid]
    elif mmutil.feature_flag_is_enabled(mmutil.JOB_SERVER):
        return
    else:
        postmortem_args = ["-jobdbonly"]

    postmortem_log_path = pathlib.Path(job_directory) / 'postmortem.log'
    with open(postmortem_log_path, 'w') as logf:
        # Job server jobs are inspected through "jsc postmortem"; all other
        # jobs go through the plain "postmortem" executable.
        if mmjob_is_job_server_job(jobid):
            base_command = [
                "jsc",
                "postmortem",
                "--with-subjobs",
                "--without-redaction",
            ]
        else:
            base_command = ["postmortem"]

        proc = subprocess.run(base_command + postmortem_args,
                              cwd=job_directory,
                              stderr=subprocess.STDOUT,
                              stdout=logf)
        if proc.returncode:
            failure = f'postmortem failed with return code: {proc.returncode}'
            # Record the failure in the log itself as well as in the logger.
            logf.write(failure + '\n')
            logger.warning(failure + f' while investigating {job}')

    # Only a job-specific postmortem is unpacked.
    if jobid and os.path.isfile(postmortem_log_path):
        unzip_postmortem(postmortem_log_path)

    collect_queue_stats(job)
def unzip_postmortem(postmortem_log_path: "str | os.PathLike"):
    """
    Find the postmortem archive named in the log file, then extract it.

    The log is scanned for lines containing ``.zip``; the last such line
    wins, and the text after its final ``': '`` separator (with surrounding
    double quotes removed) is taken as the archive path. The archive is
    extracted into the directory that contains it, after which the archive
    and its sibling ``.files`` manifest (if present) are deleted. If no line
    mentions a ``.zip`` file, a warning is logged and nothing is extracted.

    :param postmortem_log_path: path (str or path-like) to the captured
        output of the postmortem command. A relative archive path found in
        the log is resolved against this file's directory.
    """
    # Accept plain strings as well as Path objects: ``.parent`` is needed
    # below and does not exist on str.
    postmortem_log_path = pathlib.Path(postmortem_log_path)

    # Absolute path to postmortem zip should be at the end of the log. Take
    # the last line from the log containing a .zip file
    postmortem_zip_path = None
    # The log is raw tool output; do not let undecodable bytes abort the scan.
    with open(postmortem_log_path, errors='replace') as f:
        for line in f:
            if '.zip' not in line:
                continue
            candidate = line.strip().strip('"').split(': ')[-1]
            # Strip again after splitting so that `label: "x.zip"` does not
            # leave a stray leading quote in the path.
            postmortem_zip_path = pathlib.Path(candidate.strip().strip('"'))

    if postmortem_zip_path is None:
        logger.warning(
            f'Postmortem file not found. See log at {postmortem_log_path}')
        return

    if not postmortem_zip_path.is_absolute():
        postmortem_zip_path = postmortem_log_path.parent.joinpath(
            postmortem_zip_path)

    postmortem_extraction_dir = postmortem_zip_path.parent
    if sys.platform == "win32":
        # The long-path prefix is only meaningful on a fully qualified,
        # backslash-separated path, so make the directory absolute first.
        postmortem_extraction_dir = WINDOWS_LONG_PATH_PREFIX + os.path.abspath(
            postmortem_extraction_dir)

    with zipfile.ZipFile(postmortem_zip_path) as zf:
        zf.extractall(postmortem_extraction_dir)

    # Remove zip archive and file manifest
    os.remove(postmortem_zip_path)
    postmortem_files = postmortem_zip_path.with_suffix('.files')
    if postmortem_files.exists():
        os.remove(postmortem_files)
def collect_queue_stats(job):
    """
    Invoke remote queue commands to collect queue stats for debugging a
    killed queue job.

    Nothing is done unless is_killed_queue_job(job) is true. Otherwise two
    commands are run, each wrapped in ``bash --login -c`` and prefixed with
    the remote-shell command built from the host and user of the job's host
    entry. Their stdout is written to ``qstat.log`` and ``clusutil.log`` in
    the job's command directory. A warning is logged for each command that
    returns a non-zero exit code.

    :type job: schrodinger.job.queue.JobControlJob
    :param job: the job whose host entry selects where the commands are run.
    """
    if not is_killed_queue_job(job):
        return

    backend_job = job.getJob()
    job_directory = job.getCommandDir()
    shell_template = "bash --login -c '{}'"

    remote_host = jobcontrol.get_host(backend_job.HostEntry).host
    remote_user = jobcontrol.get_host(backend_job.HostEntry).user
    remote_prefix = remote_command._rsh_cmd(remote_host,
                                            remoteuser=remote_user)

    clusutil_cmd = 'perl /nfs/working/sysmgr/sysmgr-repo/scripts/clustutil.pl -u -a'
    # (log file name, shell snippet to run remotely), in execution order.
    probes = (
        ('qstat.log', 'date && echo Running Cmd: qstat && qstat'),
        ('clusutil.log',
         'date && echo Running Cmd: {} && {}'.format(clusutil_cmd,
                                                     clusutil_cmd)),
    )

    for log_name, snippet in probes:
        bash_command = shell_template.format(snippet)
        with open(os.path.join(job_directory, log_name), 'w') as log_file:
            proc = subprocess.run(remote_prefix + [bash_command],
                                  stdout=log_file)
        if proc.returncode != 0:
            logger.warning(f"{bash_command} exited abnormally")
def is_killed_queue_job(job):
    """
    Tell whether the job counts as killed on the queueing system.

    False is returned when ``job`` is not a queue.JobControlJob, when it has
    no underlying job object, or when that job object is not queued. In all
    other cases the job's ``canceled_by_timeout`` attribute is returned.

    :type job: schrodinger.job.queue.JobControlJob
    :param job: the job to inspect.
    """
    # Only JobControlJob instances qualify (subprocess jobs don't submit to
    # the queue).
    if isinstance(job, queue.JobControlJob):
        backend_job = job.getJob()
        # A falsy job object means the job failed to launch; a job that is
        # not queued never reached the queueing system.
        if backend_job and backend_job.isQueued():
            return job.canceled_by_timeout
    return False