Source code for schrodinger.protein.gpcr.tasks

import copy
from typing import Tuple

import numpy as np

from schrodinger.application.msv.gui import gui_alignment
from schrodinger.models import parameters
from schrodinger.protein import align
from schrodinger.protein import sequence
from schrodinger.protein.gpcr import annotate
from schrodinger.protein.gpcr import sql
from schrodinger.protein.tasks import blast
from schrodinger.tasks import tasks


[docs]class GPCRTask(tasks.ComboSubprocessTask):
    """
    Task to run 'protein.tasks.blast.BlastTask' task against a custom database.
    """
    output: gui_alignment.GuiProteinAlignment

[docs]    class Input(parameters.CompoundParam):
        query_sequence: sequence.ProteinSequence = None
        custom_database_path: str
        sqlite_path: str = None

[docs]    def mainFunction(self):
        blast_task = blast.BlastTask()
        inp = blast_task.input
        inp.query_sequence = self.input.query_sequence
        inp.settings.location = blast.LOCAL
        inp.settings.custom_database_path = self.input.custom_database_path
        inp.settings.database_name = blast.BlastDatabase.CUSTOM
        inp.settings.num_alignments = 100
        blast_task.start()
        blast_task.wait()
        self.output = self._getBestAlignment(blast_task.output)
        self.transferrAnnotationToQuerySeq()

    def _getBestAlignment(self, hits) -> str:
        """
        Get the alignment that consists of query sequence (as ref seq) and the
        sequence that has highest identity with the reference sequence.
        Raises ValueError when there is no sequence with 90% or more identity.
        """
        aligner = align.BiopythonPairwiseAligner(penalize_end_gaps=True)
        best_identity = 0.9
        best_aln = None
        best_alignment_score = 0.0
        for hit in hits:
            seq = sequence.ProteinSequence(elements=hit['sequence'],
                                           name=hit['name'])
            aln, identity, alignment_score = self._getHitScore(aligner, seq)
            if identity > best_identity:
                best_aln = aln
            elif np.isclose(identity, best_identity):
                if alignment_score > best_alignment_score:
                    best_aln = aln
        if best_aln is not None:
            return best_aln
        raise ValueError("Could not find sequence with identity > 90% ")

    def _getHitScore(
        self, aligner: align.BiopythonPairwiseAligner,
        seq: sequence.ProteinSequence
    ) -> Tuple[gui_alignment.GuiProteinAlignment, float, float]:
        """
        Align the given sequence to the reference sequence using the aligner and
        find out the identity and alignment score. A copy of the reference seq
        is used for the alignment to avoid preserving gap from the previous
        alignment.
        """
        ref_seq = copy.deepcopy(self.input.query_sequence)
        aln = gui_alignment.GuiProteinAlignment([ref_seq, seq])
        aligner.run(aln)
        alignment_score = aligner.getAlignmentScore()
        identity = ref_seq.getIdentity(seq)
        return aln, identity, alignment_score

[docs]    def transferrAnnotationToQuerySeq(self):
        """
        Transfer the GPCR annotation from the blast hit to the query sequence.
        """
        gpcr_anno = self._getGPCRAnnotations()
        gpcr_anno_map = self.getGPCRAnnotationMap(gpcr_anno)
        if gpcr_anno_map:
            self.input.query_sequence.setGPCRAnnotations(gpcr_anno_map)

    def _getGPCRAnnotations(self):
        """
        Get the gpcr annotation of the blast result sequence from the sqlite
        database.
        """
        out_put_seq = self.output[1]
        entry_name = out_put_seq.name.strip('_G')
        if self.input.sqlite_path is None:
            return []
        conn = sql.open_database(filename=self.input.sqlite_path)
        gpcr_anno = annotate.get_residue_annotations(entry_name, conn=conn)
        return gpcr_anno

[docs]    def getGPCRAnnotationMap(self, gpcr_anno):
        """
        Generate a map of residues in input sequence to the corresponding gpcr
        annotation that is transferred from the blast hit.
        """
        # Since we make copy of the input.query_sequence before alignment, the
        # aligned query sequence may have gaps at different indices than in the
        # input.query_sequence.
        original_seq = self.input.query_sequence
        gpcr_annotation_map = dict()
        query_seq = self.output[0]
        hit_seq = self.output[1]
        orginal_sequence_map = {
            res.gapless_idx_in_seq: res for res in original_seq.residues()
        }

        for res in hit_seq.residues():
            query_seq_res = query_seq[res.idx_in_seq]
            if query_seq_res.is_gap:
                continue
            orig_res = orginal_sequence_map[query_seq_res.gapless_idx_in_seq]
            annotation = gpcr_anno[res.gapless_idx_in_seq]
            gpcr_annotation_map[orig_res] = annotation
        return gpcr_annotation_map