import ftplib
import json
import math
import os
import random

import requests.exceptions

import schrodinger
from schrodinger.protein import getpdb

# File with the list of all PDB IDs. Created if it does not already exist.
CACHE_FILE = 'pdb_id.json'
# These structures are known to be badly flawed, with many errors. They are
# also very large, so diagnosing what went wrong in them is slow. Skip them
# for now so that new problems are easier to spot. See SHARED-3507 for
# discussion.
PDBIDS_TO_SKIP = [
'4BTS', '4L47', '4L71', '4LEL', '4LFZ', '4LNT', '4LSK', '4LT8', '4QYK',
'4TVX', '4U1U', '4U1V', '4U20', '4U24', '4U25', '4U26', '4U27', '4UBV',
'4V42', '4V4B', '4V4H', '4V4I', '4V4J', '4V4M', '4V4O', '4V4P', '4V4Q',
'4V4R', '4V4S', '4V4T', '4V4X', '4V4Y', '4V4Z', '4V50', '4V52', '4V53',
'4V54', '4V55', '4V56', '4V57', '4V5B', '4V5F', '4V5G', '4V5I', '4V5L',
'4V5O', '4V5P', '4V5Q', '4V5R', '4V5S', '4V5Y', '4V64', '4V68', '4V6B',
'4V6C', '4V6D', '4V6E', '4V6K', '4V6T', '4V6Y', '4V6Z', '4V70', '4V71',
'4V72', '4V73', '4V74', '4V75', '4V76', '4V77', '4V78', '4V79', '4V7A',
'4V7B', '4V7C', '4V7D', '4V7G', '4V7H', '4V7N', '4V7P', '4V7S', '4V7T',
'4V7U', '4V7V', '4V80', '4V81', '4V85', '4V89', '4V8A', '4V8N', '4V8O',
'4V8Q', '4V8R', '4V8S', '4V8U', '4V90', '4V91', '4V93', '4V94', '4V98',
'4V9C', '4V9D', '4V9F', '4V9H', '4V9I', '4V9J', '4V9K', '4V9L', '4V9M',
'4W29', '4WF1', '4WZJ'
]
# This structure is known to be invalid. See the analysis in CONV-908.
PDBIDS_TO_SKIP.append('1VVJ')


def write_pdb_cache(pdb_ids):
    """Write the list of PDB IDs to CACHE_FILE (dev environments only)."""
    if schrodinger.in_dev_env():
        with open(CACHE_FILE, 'w') as fh:
            json.dump(pdb_ids, fh)


def _download_pdb_list():
    """
    Download the list of all PDB IDs from the wwPDB FTP server.

    In the dev environment, the downloaded list is also cached to CACHE_FILE.

    :return: sorted list of PDB IDs
    """
# See list of potential indices here:
# https://www.rcsb.org/pages/general/summaries
server = 'ftp.wwpdb.org'
pdb_index_file = '/pub/pdb/derived_data/index/resolu.idx'
ftp = ftplib.FTP(server)
ftp.login()
pdb_ids = []
def store(line):
# e.g. "100D\t;\t1.9"
if len(line) > 5 and line[5] == ';':
pdb_ids.append(line[:4])
# This is slow (2-10s):
ftp.retrlines(f'RETR {pdb_index_file}', store)
ftp.close()
pdb_ids = sorted(pdb_ids)
    if not pdb_ids:
        # Guard against a silent change in the index file format.
        raise RuntimeError(f'No PDB IDs found at {server}{pdb_index_file}')
write_pdb_cache(pdb_ids)
return pdb_ids
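

# Illustrative sketch (not used by anything above): how the `store` callback in
# _download_pdb_list() separates resolu.idx data lines from header and
# separator lines. The sample lines are hypothetical, modeled on the
# "100D\t;\t1.9" example in that function.
def _example_store_filter():
    lines = ['IDCODE  RESOLUTION', '------  ----------', '100D\t;\t1.9']
    # Keep only lines with ';' at index 5 and take the four-character ID.
    pdb_ids = [line[:4] for line in lines if len(line) > 5 and line[5] == ';']
    return pdb_ids  # ['100D']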


def get_pdb_list():
    """
    Get all PDB IDs, either from the local cache file or from the wwPDB
    FTP server.

    :return: list of PDB IDs
    """
if schrodinger.in_dev_env():
try:
with open(CACHE_FILE) as fh:
pdbs = json.load(fh)
return pdbs
except OSError:
pass
return _download_pdb_list()
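

# Usage sketch (hypothetical helper, not called in this module): in a dev
# environment the first call downloads the index and writes CACHE_FILE via
# write_pdb_cache(); later calls read the cache instead of hitting the server.
def _example_count_pdb_ids():
    pdb_ids = get_pdb_list()
    print(f'{len(pdb_ids)} PDB IDs available')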


def get_pdb(pdb_id):
    """
    Download a PDB structure if it is not already in the local repository.

    :param pdb_id: PDB ID of the structure to fetch
    :return: Filename of the PDB structure file.
    """
local_filename = getpdb.retrieve_pdb(pdb_id)
if local_filename is None:
return getpdb.download_pdb(pdb_id)
return local_filename


def sample_pdbs(fraction=1,
                number=None,
                min_value=0,
                max_value=100,
                allow_download_failure=True):
    """
    Iterate over a subset of all available structures in the PDB. For each
    one, download the structure and yield the filename of the structure file.

    :param fraction: Fraction of all PDB IDs that should be sampled.
    :param number: If given, sample exactly this many PDB IDs instead of a
        fraction.
    :param min_value: Lower bound of the sample to use, as a percentage.
    :param max_value: Upper bound of the sample to use, as a percentage.
    :param allow_download_failure: If True, skip structures that fail to
        download instead of raising; 404 errors are always skipped.
    :yield: filenames of PDB structure files.
    """
all_pdbs = get_pdb_list()
if number is not None:
sample = random.sample(all_pdbs, number)
        print(f'{len(all_pdbs)} PDBs found, {len(sample)} sampled')
elif fraction != 1:
sample = random.sample(all_pdbs, int(fraction * len(all_pdbs)))
        print(f'{len(all_pdbs)} PDBs found, {len(sample)} sampled')
else:
sample = all_pdbs
if min_value != 0:
slice_start = int(math.floor((min_value / 100.0) * len(sample)))
else:
slice_start = 0
slice_end = int(math.ceil((max_value / 100.0) * len(sample)))
sample = sample[slice_start:slice_end]
    # Drop the reference to the full 90K-element list of PDB IDs.
del all_pdbs
for pdb_id in sample:
if pdb_id.upper() in PDBIDS_TO_SKIP:
continue
try:
# NOTE: This will download the file as *.pdb or *.cif:
yield get_pdb(pdb_id)
except (RuntimeError, requests.exceptions.HTTPError) as err:
if '404' in str(err):
print(f'{pdb_id} is missing from the PDB web server.')
continue
elif allow_download_failure:
                print(f'Failed to download {pdb_id} from RCSB: {err}')
continue
else:
raise
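

# Usage sketch (hypothetical, not called in this module): split a full sweep of
# the PDB across four jobs using the min_value/max_value percentage window of
# sample_pdbs(). The job_index argument and the print format are illustrative
# choices, not project conventions.
def _example_sweep_quarter(job_index):
    low = job_index * 25
    high = (job_index + 1) * 25
    for filename in sample_pdbs(min_value=low, max_value=high):
        print(filename)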


def require_local_pdb():
    """
    Ensure that a local mirror of the PDB is available and active for use
    with getpdb.

    This avoids hitting the PDB web server unnecessarily.
    """
# Attempt to retrieve a PDB file from the local mirror
filename = getpdb.retrieve_pdb('2DAN')
if filename:
try:
if os.path.getsize(filename) > 0:
os.unlink(filename)
return
except FileNotFoundError:
pass
msg = ('Local PDB repository not available. Install the PDB in '
'$SCHRODINGER/thirdparty/database or set the SCHRODINGER_PDB '
'environment variable (probably to '
'/builds/thirdparty/current/database/pdb)')
raise RuntimeError(msg)
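

# Usage sketch (hypothetical, assumes a test-setup style of use): insist on the
# local mirror before sampling, so only structures missing from the mirror are
# fetched from the RCSB web server. The 0.001 fraction is an arbitrary value.
def _example_setup_and_sample():
    require_local_pdb()
    for filename in sample_pdbs(fraction=0.001):
        print(filename)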