# Source code for schrodinger.test.stu.get_pdbs

import ftplib
import json
import math
import os
import random

import requests.exceptions

import schrodinger
from schrodinger.protein import getpdb

# Name of the JSON file that caches the full list of PDB IDs. It is created
# on demand when it is not present.
CACHE_FILE = 'pdb_id.json'

# PDB IDs that are excluded from sampling.
#
# Most of these entries are very bad structures with lots of errors. They are
# also very large, so working out what went wrong takes a long time. They are
# skipped for now so that new problems remain visible. See SHARED-3507 for
# discussion.
PDBIDS_TO_SKIP = [
    '4BTS', '4L47', '4L71', '4LEL', '4LFZ', '4LNT', '4LSK', '4LT8', '4QYK',
    '4TVX', '4U1U', '4U1V', '4U20', '4U24', '4U25', '4U26', '4U27', '4UBV',
    '4V42', '4V4B', '4V4H', '4V4I', '4V4J', '4V4M', '4V4O', '4V4P', '4V4Q',
    '4V4R', '4V4S', '4V4T', '4V4X', '4V4Y', '4V4Z', '4V50', '4V52', '4V53',
    '4V54', '4V55', '4V56', '4V57', '4V5B', '4V5F', '4V5G', '4V5I', '4V5L',
    '4V5O', '4V5P', '4V5Q', '4V5R', '4V5S', '4V5Y', '4V64', '4V68', '4V6B',
    '4V6C', '4V6D', '4V6E', '4V6K', '4V6T', '4V6Y', '4V6Z', '4V70', '4V71',
    '4V72', '4V73', '4V74', '4V75', '4V76', '4V77', '4V78', '4V79', '4V7A',
    '4V7B', '4V7C', '4V7D', '4V7G', '4V7H', '4V7N', '4V7P', '4V7S', '4V7T',
    '4V7U', '4V7V', '4V80', '4V81', '4V85', '4V89', '4V8A', '4V8N', '4V8O',
    '4V8Q', '4V8R', '4V8S', '4V8U', '4V90', '4V91', '4V93', '4V94', '4V98',
    '4V9C', '4V9D', '4V9F', '4V9H', '4V9I', '4V9J', '4V9K', '4V9L', '4V9M',
    '4W29', '4WF1', '4WZJ',
    # This structure is known to be invalid. See analysis in CONV-908
    '1VVJ',
]


def write_pdb_cache(pdb_ids):
    """
    Serialize the given PDB IDs as JSON to the cache file.

    The file (`CACHE_FILE`, a relative path) is only written when
    `schrodinger.in_dev_env()` is true; otherwise this is a no-op.

    :param pdb_ids: PDB IDs to store. Must be JSON-serializable.
    """
    if schrodinger.in_dev_env():
        with open(CACHE_FILE, 'w') as fh:
            json.dump(pdb_ids, fh)


def _download_pdb_list():
    """
    Download the list of PDB ids.

    In the dev environment, the list is serialized.

    :return: Sorted list of the PDB IDs parsed from the index file.
    :raise RuntimeError: If no PDB ID could be parsed from the index file.
    """
    # See list of potential indices here:
    # https://www.rcsb.org/pages/general/summaries

    server = 'ftp.wwpdb.org'
    pdb_index_file = '/pub/pdb/derived_data/index/resolu.idx'

    pdb_ids = []

    def store(line):
        # Only data rows are kept; they look like "100D\t;\t1.9", i.e. a
        # four character ID followed by a ';' at index 5.
        if len(line) > 5 and line[5] == ';':
            pdb_ids.append(line[:4])

    # The context manager guarantees the connection is shut down even when
    # the login or the transfer raises.
    with ftplib.FTP(server) as ftp:
        ftp.login()
        # This is slow (2-10s):
        ftp.retrlines(f'RETR {pdb_index_file}', store)

    pdb_ids.sort()
    if not pdb_ids:
        # in case format changes or something.
        raise RuntimeError(f'No PDB IDs found at {server}{pdb_index_file}')

    write_pdb_cache(pdb_ids)

    return pdb_ids


def get_pdb_list():
    """
    Get all PDB IDs either from a serialized file or the PDB website.

    In the dev environment the cache file is tried first. If it cannot be
    opened or does not contain valid JSON, the list is downloaded instead.
    Outside the dev environment the list is always downloaded.

    :return: list of PDB ids
    """
    if schrodinger.in_dev_env():
        try:
            with open(CACHE_FILE) as fh:
                return json.load(fh)
        except (OSError, ValueError):
            # OSError: the cache is missing or unreadable.
            # ValueError: the cache is corrupt (json.JSONDecodeError is a
            # ValueError subclass). Either way, fall back to downloading.
            pass
    return _download_pdb_list()


def get_pdb(pdb_id):
    """
    Download a PDB if it is not already in the local repository

    :param pdb_id: ID of the PDB entry to fetch.

    :return: Filename of PDB structure file.
    """
    local_filename = getpdb.retrieve_pdb(pdb_id)
    if local_filename is not None:
        return local_filename

    # Nothing came back from the local lookup, so fetch it remotely.
    return getpdb.download_pdb(pdb_id)


def sample_pdbs(fraction=1,
                number=None,
                min_value=0,
                max_value=100,
                allow_download_failure=True):
    """
    Iterate over a fraction of all available structures in the PDB. For each one
    download the structure and yield the filename of the structure file.

    IDs listed in `PDBIDS_TO_SKIP` are never yielded.

    :param fraction: Fraction of all PDBs that should be downloaded. Ignored
        when `number` is given. With the default of 1 no random sampling is
        done and the full list is used.

    :param number: If not None, the exact number of PDB IDs to draw at random
        from the full list.

    :param min_value: Lower bound, as a percentage of the sample length, of
        the window of the sample that is iterated (rounded down).

    :param max_value: Upper bound, as a percentage of the sample length, of
        the window of the sample that is iterated (rounded up).

    :param allow_download_failure: If true, a failed download is reported and
        skipped. If false, the error is re-raised. Errors whose message
        contains '404' are always reported and skipped.

    :return: Generator yielding filenames of PDB structure files.
    """

    all_pdbs = get_pdb_list()
    if number is not None:
        sample = random.sample(all_pdbs, number)
        print('%i PDBs found, %i sampled' % (len(all_pdbs), len(sample)))
    elif fraction != 1:
        sample = random.sample(all_pdbs, int(fraction * len(all_pdbs)))
        print('%i PDBs found, %i sampled' % (len(all_pdbs), len(sample)))
    else:
        sample = all_pdbs

    # Restrict to the [min_value%, max_value%) window of the sample.
    sample_size = len(sample)
    slice_start = math.floor((min_value / 100.0) * sample_size)
    slice_end = math.ceil((max_value / 100.0) * sample_size)
    sample = sample[slice_start:slice_end]

    # Drop our reference to the full (large) list; only the slice is needed.
    del all_pdbs

    # Build the lookup once so each membership test is O(1).
    skipped_ids = frozenset(PDBIDS_TO_SKIP)

    for pdb_id in sample:
        if pdb_id.upper() in skipped_ids:
            continue
        try:
            # NOTE: This will download the file as *.pdb or *.cif:
            filename = get_pdb(pdb_id)
        except (RuntimeError, requests.exceptions.HTTPError) as err:
            if '404' in str(err):
                print(f'{pdb_id} is missing from the PDB web server.')
            elif allow_download_failure:
                print('missed connection to RCSB')
            else:
                raise
            continue
        yield filename


def require_local_pdb():
    """
    Ensure that a local mirror of the PDB is available and active for use
    with getpdb.

    This limits hitting the pdb server.

    :raise RuntimeError: If the probe structure could not be retrieved, or
        the retrieved file is missing or empty.
    """

    # Attempt to retrieve a PDB file from the local mirror
    filename = getpdb.retrieve_pdb('2DAN')
    if filename:
        try:
            if os.path.getsize(filename) > 0:
                # The probe succeeded; remove the retrieved file again.
                # NOTE(review): presumably retrieve_pdb writes a copy rather
                # than returning a path inside the mirror - confirm.
                os.unlink(filename)
                return
        except FileNotFoundError:
            # The reported file does not exist; treat as "not available".
            pass

    msg = ('Local PDB repository not available. Install the PDB in '
           '$SCHRODINGER/thirdparty/database or set the SCHRODINGER_PDB '
           'environment variable (probably to '
           '/builds/thirdparty/current/database/pdb)')
    raise RuntimeError(msg)