import itertools
import string
from hypothesis import strategies
from schrodinger import structure
from schrodinger.protein import alignment
from schrodinger.protein import annotation
from schrodinger.protein import predictors
from schrodinger.protein import residue
from schrodinger.protein import sequence
from schrodinger.test.hypothesis.strategies import indices as indices_strats
# Alphabet used for generated names and chain identifiers: ASCII letters
# followed by digits.
_LETTERS_AND_NUMBERS = list(string.ascii_letters + string.digits)
_PROT_ANNOTATIONS = list(
    annotation.ProteinAlignmentAnnotations.ANNOTATION_TYPES)
# Every combination of alignment annotation types, from the empty combination
# up to and including the one that contains all of them. `range` excludes its
# stop value, so the bound must be `len(...) + 1`; without the `+ 1` the
# all-annotations combination would never be generated.
_PROTEIN_ANN_COMBINATIONS = list(
    itertools.chain.from_iterable(
        itertools.combinations(_PROT_ANNOTATIONS, size)
        for size in range(len(_PROT_ANNOTATIONS) + 1)))
# Members of the protein alphabet that are not in `residue.STD_AMINO_ACIDS`.
_NON_STD_AMINO_ACIDS = list(
    set(residue.get_protein_alphabet().values()).difference(
        residue.STD_AMINO_ACIDS))
# Sequence annotation types are split into the residue-propensity annotations
# and all remaining ones; alignment annotation types are kept separately.
RES_PROP_ANNOS = set(
    annotation.ProteinSequenceAnnotations.RES_PROPENSITY_ANNOTATIONS)
SEQ_ANNO_TYPES = list(
    set(annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES) -
    RES_PROP_ANNOS)
ALN_ANNO_TYPES = list(
    set(annotation.ProteinAlignmentAnnotations.ANNOTATION_TYPES))
# Listify the annotation types so hypothesis can shrink them.
RES_PROP_ANNOS = list(RES_PROP_ANNOS)
for _excluded_type in (
        # These annotation types can't currently be enabled through the GUI.
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.rescode,
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.
        isoelectric_point,
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.hydrophobicity,
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.sasa,
        # Domain annotation fetches from a remote server.
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.domains,
):
    # `list.remove` raises ValueError if the type is not present.
    SEQ_ANNO_TYPES.remove(_excluded_type)
# Do not leave the loop variable behind in the module namespace.
del _excluded_type
@strategies.composite
def _unique_lists_made_of(draw, elements):
    """
    Draw a list of distinct items sampled from `elements`.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :param elements: The collection to sample from. Must not be a set.
    :return: A list with no repeated items, each taken from `elements`
    :rtype: list
    """
    assert not isinstance(elements, set)
    item_strategy = strategies.sampled_from(elements)
    unique_lists = strategies.lists(item_strategy, unique=True)
    return draw(unique_lists)
@strategies.composite
def generated_aln_annotations(draw):
    """
    Draw a list of distinct annotation types taken from `ALN_ANNO_TYPES`.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :rtype: list
    """
    aln_annotation_lists = _unique_lists_made_of(ALN_ANNO_TYPES)
    return draw(aln_annotation_lists)
@strategies.composite
def generated_seq_annotations(draw):
    """
    Draw a list of distinct annotation types taken from `SEQ_ANNO_TYPES`.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :rtype: list
    """
    seq_annotation_lists = _unique_lists_made_of(SEQ_ANNO_TYPES)
    return draw(seq_annotation_lists)
@strategies.composite
def generated_res_prop_annotations(draw):
    """
    Draw a list of distinct annotation types taken from `RES_PROP_ANNOS`.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :rtype: list
    """
    res_prop_annotation_lists = _unique_lists_made_of(RES_PROP_ANNOS)
    return draw(res_prop_annotation_lists)
@strategies.composite
def generated_annotation_lists(draw):
    """
    Draw one of the precomputed combinations of alignment annotation types.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :return: One entry of `_PROTEIN_ANN_COMBINATIONS`; each entry is a tuple
        produced by `itertools.combinations`
    :rtype: tuple
    """
    combination_strategy = strategies.sampled_from(_PROTEIN_ANN_COMBINATIONS)
    return draw(combination_strategy)
class AlignmentInfo:

    def __init__(self, seqs=(), cysteines_to_bond=(), anchor_residues=()):
        """
        A bundle of test data from which an alignment can be built

        Tests receive this object rather than a ready-made alignment so that
        each test can construct the kind of alignment it needs (e.g. a regular
        or an undoable alignment) while still getting fixtures that match the
        generated sequences.

        :param seqs: Sequences to be used in constructing an alignment. Any
            falsy value (such as the default empty tuple) is stored as a new
            empty list.
        :type seqs: list
        :param cysteines_to_bond: Stored unchanged on the instance
        :param anchor_residues: Stored unchanged on the instance
        """
        self.seqs = seqs if seqs else []
        self.cysteines_to_bond = cysteines_to_bond
        self.anchor_residues = anchor_residues

    def __repr__(self):
        """
        Summarize the alignment that `self.seqs` would produce.

        This is not a strict repr. It builds a `ProteinAlignment` from the
        stored sequences and reports both its `str` and its `repr`, which is
        convenient in hypothesis failure output; the alignment repr can be
        pasted into a repl for interactive debugging. `cysteines_to_bond` and
        `anchor_residues` are not part of the output.
        """
        aln = alignment.ProteinAlignment(self.seqs)
        return "\n".join((
            "\n",
            "AlignmentInfo object used to construct:",
            str(aln),
            "and containing the additional information:",
            "\n",
            "\nA repl-ready representation of the alignment: \n\n",
            repr(aln),
        ))
# Secondary structure values that generated residues may be assigned.
POSSIBLE_SECONDARY_STRUCTURES = (
    structure.SS_NONE,
    structure.SS_LOOP,
    structure.SS_HELIX,
    structure.SS_STRAND,
    structure.SS_TURN,
)
@strategies.composite
def generated_residues(draw, residue_types=None, gaps=True):
    """
    Draw a single alignment element: either a gap or a populated residue.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :param residue_types: Alphabet to use for residues. A `None` entry in the
        alphabet produces a gap. By default an integer in [0, 20] is drawn:
        0 selects a gap (only if `gaps` is true), 1 selects the nonstandard
        amino acids, and every other value selects the standard amino acids.
    :type residue_types: list(residue.ElementType)
    :param gaps: Whether to include gaps in default residue types
    :type gaps: bool
    :return: A residue suitable for testing
    :rtype: schrodinger.protein.residue.Residue
    """
    if residue_types is None:
        # A single roll decides the alphabet so that standard residues
        # dominate the generated data.
        roll = draw(strategies.integers(min_value=0, max_value=20))
        if roll == 0 and gaps:
            residue_types = [None]
        elif roll == 1:
            residue_types = _NON_STD_AMINO_ACIDS
        else:
            residue_types = residue.STD_AMINO_ACIDS
    chosen_type = draw(strategies.sampled_from(residue_types))
    if chosen_type is None:
        return residue.Gap()

    res = residue.Residue(chosen_type)
    secondary_structures = strategies.sampled_from(
        POSSIBLE_SECONDARY_STRUCTURES)
    # Attributes are drawn and assigned in exactly this order.
    attribute_strategies = (
        ("temperature_factor", strategies.floats(0, 99)),
        ("secondary_structure", secondary_structures),
        ("pred_secondary_structure", secondary_structures),
        # Each prediction may be any member of its enum or None.
        ("pred_accessibility",
         strategies.sampled_from([*predictors.SolventAccessibility, None])),
        ("pred_disordered",
         strategies.sampled_from([*predictors.Disordered, None])),
        ("pred_domain_arr",
         strategies.sampled_from([*predictors.DomainArrangement, None])),
        # Residue numbers can be negative
        ("resnum", strategies.integers(-999, 9999)),
        # Insertion code: a blank or any ASCII letter
        ("inscode", strategies.sampled_from(' ' + string.ascii_letters)),
    )
    for attr_name, attr_strategy in attribute_strategies:
        setattr(res, attr_name, draw(attr_strategy))
    return res
# For performance reasons, we use `simple_sequences` when generating
# alignments and `generated_sequences` for tests that specifically exercise
# `sequence` functionality. We hope to fix this in MSV-1537.
simple_sequences = strategies.builds(
    sequence.ProteinSequence,
    elements=strategies.lists(generated_residues()),
    name=strategies.text(_LETTERS_AND_NUMBERS),
    # Entry ids are non-empty digit strings.
    entry_id=strategies.text(string.digits, min_size=1),
    # Chain names are empty or a single ASCII letter.
    chain=strategies.text(string.ascii_letters, max_size=1),
    long_name=strategies.text(_LETTERS_AND_NUMBERS),
)
@strategies.composite
def cysteine_pair_lists(draw, residues):
    """
    Randomly pair up some of the cysteines found in `residues`.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :param residues: An iterable of residues
    :type residues: iterable
    :rtype: list(tuple)
    :return: A list of 2-tuples of cysteine residues; no residue appears in
        more than one pair
    """
    candidates = []
    for elem in residues:
        # Only real residues whose type is named "Cysteine" qualify.
        if elem.is_res and elem.type.name == "Cysteine":
            candidates.append(elem)
    # At most every cysteine is paired off, i.e. len // 2 pairs.
    max_pairs = len(candidates) // 2
    pair_count = draw(strategies.integers(min_value=0, max_value=max_pairs))
    # Shuffle with a hypothesis-controlled random source.
    rng = draw(strategies.randoms())
    rng.shuffle(candidates)
    # Tuple elements are evaluated left to right, so each pair takes the last
    # remaining candidate first and the one before it second.
    return [(candidates.pop(), candidates.pop()) for _ in range(pair_count)]
@strategies.composite
def generated_sequences(draw,
                        min_size=0,
                        max_size=None,
                        residue_types=None,
                        include_gaps=True,
                        add_cysteine_bonds=True):
    """
    Draw a single protein sequence with randomized metadata.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :param min_size: Minimum length for sequences
    :type min_size: int
    :param max_size: Maximum number of residues to include in the sequence
    :type max_size: int
    :param residue_types: Alphabet to use for residues
    :type residue_types: list(residue.ElementType)
    :param include_gaps: Whether to generate gaps in the sequence
    :type include_gaps: bool
    :param add_cysteine_bonds: Whether to add disulfide bonds between
        randomly paired cysteines of the sequence
    :type add_cysteine_bonds: bool
    :return: A protein sequence suitable for testing
    :rtype: schrodinger.protein.sequence.ProteinSequence
    """
    residue_strategy = generated_residues(residue_types=residue_types,
                                          gaps=include_gaps)
    elements = draw(
        strategies.lists(elements=residue_strategy,
                         min_size=min_size,
                         max_size=max_size))
    seq_name = draw(strategies.text(alphabet=_LETTERS_AND_NUMBERS))
    entry_number = draw(strategies.integers(min_value=1, max_value=1000))
    chain_name = draw(
        strategies.text(alphabet=string.ascii_letters, max_size=1))
    # The same chain name is used for both the chain and the structure chain.
    seq = sequence.ProteinSequence(elements,
                                   name=seq_name,
                                   entry_id=str(entry_number),
                                   chain=chain_name,
                                   structure_chain=chain_name)
    # Flag a drawn subset of positions as seqres-only.
    for seqres_idx in draw(indices_strats.index_lists(seq)):
        seq[seqres_idx].seqres_only = True
    if not add_cysteine_bonds:
        return seq
    for cys_a, cys_b in draw(cysteine_pair_lists(seq)):
        is_known = draw(strategies.booleans())
        residue.add_disulfide_bond(cys_a, cys_b, known=is_known)
    return seq
@strategies.composite
def generated_multichain_sequences(draw,
                                   min_chain_size=0,
                                   max_chain_size=None,
                                   min_num_chains=1,
                                   max_num_chains=None,
                                   residue_types=None,
                                   include_gaps=True):
    """
    Draw several sequences that represent the chains of one protein.

    All returned sequences share the same name and entry id and differ in
    their (unique, single-character) chain name.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :param min_chain_size: The minimum length of each chain's sequence
    :type min_chain_size: int
    :param max_chain_size: The maximum length of each chain's sequence
    :type max_chain_size: int
    :param min_num_chains: The minimum number of chains in the protein. Must be
        positive.
    :type min_num_chains: int
    :param max_num_chains: The maximum number of chains in the protein. Must be
        less than or equal to 62 since each chain needs a unique
        single-character name.
    :type max_num_chains: int
    :param residue_types: Alphabet to use for residues
    :type residue_types: list(residue.ElementType)
    :param include_gaps: Whether to generate gaps in the sequence
    :type include_gaps: bool
    :return: The generated sequences
    :rtype: list[schrodinger.protein.sequence.ProteinSequence]
    """
    shared_name = draw(strategies.text(alphabet=_LETTERS_AND_NUMBERS))
    shared_entry_id = str(
        draw(strategies.integers(min_value=1, max_value=1000)))
    assert min_num_chains >= 1
    # Chain names are single characters, so the alphabet caps the chain count.
    chain_name_limit = len(_LETTERS_AND_NUMBERS)
    if max_num_chains is None:
        max_num_chains = chain_name_limit
    assert max_num_chains <= chain_name_limit
    chain_name_strategy = strategies.lists(
        elements=strategies.sampled_from(_LETTERS_AND_NUMBERS),
        min_size=min_num_chains,
        max_size=max_num_chains,
        unique=True)
    chain_element_strategy = strategies.lists(
        elements=generated_residues(residue_types=residue_types,
                                    gaps=include_gaps),
        min_size=min_chain_size,
        max_size=max_chain_size)
    chains = []
    for chain_name in draw(chain_name_strategy):
        chain_seq = sequence.ProteinSequence(draw(chain_element_strategy),
                                             name=shared_name,
                                             entry_id=shared_entry_id,
                                             chain=chain_name)
        # Flag a drawn subset of positions as seqres-only.
        seqres_indices = draw(indices_strats.index_lists(chain_seq))
        for seqres_idx in seqres_indices:
            chain_seq[seqres_idx].seqres_only = True
        chains.append(chain_seq)
    return chains
@strategies.composite
def alignment_infos(draw,
                    include_interseq_ss_bonds=True,
                    include_anchor_residues=False,
                    **kwargs):
    """
    Draw everything needed to create an alignment.

    An `AlignmentInfo` is returned instead of an alignment so that tests can
    build different kinds of alignments from it and also receive additional
    test data tailored to the alignment they will create.

    :param draw: A function supplied by hypothesis
    :type draw: function
    :param include_interseq_ss_bonds: Whether to generate cysteine pairs to
        bond, chosen from the cysteines of all generated sequences
    :type include_interseq_ss_bonds: bool
    :param include_anchor_residues: Whether to generate anchor residues.
        Candidates are the non-gap elements of the non-reference sequences
        that are paired with a non-gap element of the first sequence.
    :type include_anchor_residues: bool
    :param kwargs: Passed through to `strategies.lists()` when generating the
        list of sequences
    :return: A test fixture for alignment tests
    :rtype: AlignmentInfo
    """
    seqs = draw(strategies.lists(simple_sequences, **kwargs))

    anchors = []
    if include_anchor_residues and seqs:
        reference = seqs[0]
        candidates = []
        for other_seq in seqs[1:]:
            # zip stops at the shorter of the two sequences.
            for elem, ref_elem in zip(other_seq, reference):
                if elem.is_gap or ref_elem.is_gap:
                    continue
                candidates.append(elem)
        if candidates:
            anchors = draw(indices_strats.sublists(candidates))

    bond_pairs = []
    if include_interseq_ss_bonds:
        all_elements = itertools.chain.from_iterable(seqs)
        bond_pairs = draw(cysteine_pair_lists(all_elements))

    return AlignmentInfo(seqs=seqs,
                         cysteines_to_bond=bond_pairs,
                         anchor_residues=anchors)