Source code for schrodinger.protein.gpcr.tasks
import copy
from typing import Tuple
import numpy as np
from schrodinger.application.msv.gui import gui_alignment
from schrodinger.models import parameters
from schrodinger.protein import align
from schrodinger.protein import sequence
from schrodinger.protein.gpcr import annotate
from schrodinger.protein.gpcr import sql
from schrodinger.protein.tasks import blast
from schrodinger.tasks import tasks
[docs]class GPCRTask(tasks.ComboSubprocessTask):
"""
Task to run 'protein.tasks.blast.BlastTask' task against a custom database.
"""
output: gui_alignment.GuiProteinAlignment
[docs] def mainFunction(self):
blast_task = blast.BlastTask()
inp = blast_task.input
inp.query_sequence = self.input.query_sequence
inp.settings.location = blast.LOCAL
inp.settings.custom_database_path = self.input.custom_database_path
inp.settings.database_name = blast.BlastDatabase.CUSTOM
inp.settings.num_alignments = 100
blast_task.start()
blast_task.wait()
self.output = self._getBestAlignment(blast_task.output)
self.transferrAnnotationToQuerySeq()
def _getBestAlignment(self, hits) -> str:
"""
Get the alignment that consists of query sequence (as ref seq) and the
sequence that has highest identity with the reference sequence.
Raises ValueError when there is no sequence with 90% or more identity.
"""
aligner = align.BiopythonPairwiseAligner(penalize_end_gaps=True)
best_identity = 0.9
best_aln = None
best_alignment_score = 0.0
for hit in hits:
seq = sequence.ProteinSequence(elements=hit['sequence'],
name=hit['name'])
aln, identity, alignment_score = self._getHitScore(aligner, seq)
if identity > best_identity:
best_aln = aln
elif np.isclose(identity, best_identity):
if alignment_score > best_alignment_score:
best_aln = aln
if best_aln is not None:
return best_aln
raise ValueError("Could not find sequence with identity > 90% ")
def _getHitScore(
self, aligner: align.BiopythonPairwiseAligner,
seq: sequence.ProteinSequence
) -> Tuple[gui_alignment.GuiProteinAlignment, float, float]:
"""
Align the given sequence to the reference sequence using the aligner and
find out the identity and alignment score. A copy of the reference seq
is used for the alignment to avoid preserving gap from the previous
alignment.
"""
ref_seq = copy.deepcopy(self.input.query_sequence)
aln = gui_alignment.GuiProteinAlignment([ref_seq, seq])
aligner.run(aln)
alignment_score = aligner.getAlignmentScore()
identity = ref_seq.getIdentity(seq)
return aln, identity, alignment_score
[docs] def transferrAnnotationToQuerySeq(self):
"""
Transfer the GPCR annotation from the blast hit to the query sequence.
"""
gpcr_anno = self._getGPCRAnnotations()
gpcr_anno_map = self.getGPCRAnnotationMap(gpcr_anno)
if gpcr_anno_map:
self.input.query_sequence.setGPCRAnnotations(gpcr_anno_map)
def _getGPCRAnnotations(self):
"""
Get the gpcr annotation of the blast result sequence from the sqlite
database.
"""
out_put_seq = self.output[1]
entry_name = out_put_seq.name.strip('_G')
if self.input.sqlite_path is None:
return []
conn = sql.open_database(filename=self.input.sqlite_path)
gpcr_anno = annotate.get_residue_annotations(entry_name, conn=conn)
return gpcr_anno
[docs] def getGPCRAnnotationMap(self, gpcr_anno):
"""
Generate a map of residues in input sequence to the corresponding gpcr
annotation that is transferred from the blast hit.
"""
# Since we make copy of the input.query_sequence before alignment, the
# aligned query sequence may have gaps at different indices than in the
# input.query_sequence.
original_seq = self.input.query_sequence
gpcr_annotation_map = dict()
query_seq = self.output[0]
hit_seq = self.output[1]
orginal_sequence_map = {
res.gapless_idx_in_seq: res for res in original_seq.residues()
}
for res in hit_seq.residues():
query_seq_res = query_seq[res.idx_in_seq]
if query_seq_res.is_gap:
continue
orig_res = orginal_sequence_map[query_seq_res.gapless_idx_in_seq]
annotation = gpcr_anno[res.gapless_idx_in_seq]
gpcr_annotation_map[orig_res] = annotation
return gpcr_annotation_map