"""
This script downloads sequence and residue data from the GPCR DB and stores it
in a sqlite database.
Copyright Schrodinger, LLC. All rights reserved.
"""
import contextlib
import itertools
import json
from typing import Iterable
import uuid
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from schrodinger.utils import fileutils
from schrodinger.utils import subprocess
from . import gpcrdb
from . import sql
def create_entry_database():
    """
    Create a sqlite database of sequences and residues from the GPCR DB.

    The sqlite database is created using a random filename and must be moved
    to the appropriate location for use.

    Inserted rows are committed periodically while the entries are being
    processed and once more after the last entry, so that the final partial
    batch is persisted before the connection is closed.

    :return: Path to sqlite database
    :rtype: str
    """
    db_filename = f"{uuid.uuid4()}.sqlite"
    conn = _init_database(db_filename)
    # Everything that can fail after the connection is opened happens inside
    # the `with` so the connection is always closed.
    with contextlib.closing(conn):
        row_gen = gpcrdb.download_all_entry_data()
        cur = conn.cursor()
        # Maps residue row data -> primary key so identical residues are
        # stored only once across all entries.
        all_residues = {}
        next_residue_pk, next_entry_pk = 1, 1
        for i, row in enumerate(row_gen):
            next_residue_pk, next_entry_pk = _insert_row(
                cur,
                row,
                all_residues=all_residues,
                next_residue_pk=next_residue_pk,
                next_entry_pk=next_entry_pk)
            # Commit in batches rather than per entry.
            if i % 100 == 0:
                conn.commit()
        # Closing a sqlite3 connection does not commit; without this the rows
        # inserted since the last batch commit would be discarded.
        conn.commit()
    return db_filename
def _insert_row(cur,
                row_data,
                all_residues,
                next_residue_pk=1,
                next_entry_pk=1):
    """
    Write one GPCR DB entry, its residues, and the entry-residue links.

    :param cur: Database cursor
    :type cur: sqlite3.Cursor

    :param row_data: Entry data, unpacked as (entry name, residue numbering
        scheme, sequence, families, residues)
    :type row_data: tuple

    :param all_residues: Dict of residue primary keys, keyed by the residue
        row data. Updated in place with any residue inserted here so that
        identical residues are stored only once.
    :type all_residues: dict

    :param next_residue_pk: Next primary key to use in the residue table
    :type next_residue_pk: int

    :param next_entry_pk: Primary key to use for this entry in the entry
        table
    :type next_entry_pk: int

    :return: The next unused residue primary key and the next unused entry
        primary key
    :rtype: tuple(int, int)
    """
    entry_name, res_number_scheme, sequence, families, residues = row_data

    # Resolve every residue to a primary key, inserting the ones not seen yet
    link_rows = []
    for residue in residues:
        residue_values = tuple(residue[key] for key in sql.RESIDUES_KEYS)
        residue_pk = all_residues.get(residue_values)
        if residue_pk is None:
            residue_pk = next_residue_pk
            cur.execute(sql.INSERT_RESIDUE_SQL, [residue_pk, *residue_values])
            all_residues[residue_values] = residue_pk
            next_residue_pk += 1
        link_rows.append((next_entry_pk, residue_pk))

    # The families are stored as a JSON string in the entry row
    entry_values = [
        next_entry_pk, entry_name, res_number_scheme, sequence,
        json.dumps(families)
    ]
    cur.execute(sql.INSERT_ENTRY_SQL, entry_values)

    # Link the entry to each of its residues
    cur.executemany(sql.INSERT_ENTRY_RESIDUES_SQL, link_rows)
    return next_residue_pk, next_entry_pk + 1
def _init_database(filename):
    """
    Open the database at the given path and create its tables.

    :param filename: Path of the sqlite database file to open

    :return: Open connection on which the table creation script has been
        executed and committed
    :rtype: sqlite3.Connection
    """
    connection = sql.open_database(filename)
    # Run the schema script through a cursor, then commit before handing the
    # connection back to the caller.
    connection.cursor().executescript(sql.CREATE_SQL)
    connection.commit()
    return connection
def _get_seqs(gpcr_db_filename: str) -> Iterable[SeqRecord]:
    """
    Yield a sequence record for every row of the ``entries`` table.

    The database is opened when iteration starts and is closed once the
    generator is exhausted or closed.

    :param gpcr_db_filename: Path to database with sequences from GPCR DB

    :return: Records with id ``pdb|<entry_name>|G`` and an empty description
    """
    query = "SELECT entry_name, sequence FROM entries"
    with contextlib.closing(sql.open_database(gpcr_db_filename)) as db:
        for name, residues in db.execute(query):
            record_id = f"pdb|{name}|G"
            yield SeqRecord(Seq(residues), id=record_id, description="")
def create_blast_db(gpcr_db_filename: str):
    """
    Build a BLAST protein database named ``gpcrdb`` from the GPCR DB
    sequences.

    The sequences are written to a randomly named FASTA file (relative path),
    which is then passed to ``makeblastdb``. The FASTA file is removed only
    after ``makeblastdb`` succeeds.

    :param gpcr_db_filename: Path to database with sequences from GPCR DB
    """
    fasta_path = f"{uuid.uuid4()}.fasta"

    # Dump every sequence to the temporary FASTA file
    with open(fasta_path, 'w') as fasta_file:
        SeqIO.write(_get_seqs(gpcr_db_filename), fasta_file, "fasta")

    # Run makeblastdb on the FASTA file
    makeblastdb_args = [
        '-in', fasta_path, '-out', 'gpcrdb', '-dbtype', 'prot', '-title',
        'gpcrdb', '-parse_seqids'
    ]
    cmd = ['run', '-FROM', 'psp', 'makeblastdb', *makeblastdb_args]
    # check_returncode() is expected to raise on a non-zero exit, which skips
    # the cleanup below and leaves the FASTA file behind for debugging.
    subprocess.run(cmd).check_returncode()

    fileutils.force_remove(fasta_path)
def main():
    """
    Create the GPCR DB entry database and then a BLAST database from it.
    """
    entry_db_path = create_entry_database()
    # TODO move database file to the appropriate location (TBD)
    create_blast_db(entry_db_path)
    # TODO move blast database to the appropriate location (TBD)


# Only run the build when executed as a script, not when imported.
if __name__ == "__main__":
    main()