Source code for schrodinger.application.livedesign.import_controller

import copy
from collections import OrderedDict
from collections import defaultdict

import requests

from schrodinger import structure
from schrodinger.utils import fileutils

from . import constants
from . import ld_utils
from . import login

# Keys and values of the "source" dictionaries that are passed to
# `LDClient.export_xtals()` (built in `ImportController._makeXtalSourceDicts`)
TARGET = 'TARGET'
LIGAND = 'LIGAND'
POSE_ID = 'pose_id'
CORPORATE_ID = 'corporate_id'
FILE_NAME = 'file_name'
MODEL_NAME = 'model_name'
STRUCTURE_TYPE = 'structure_type'
# Format requested from `export_xtals()`; also used as the suffix of the
# temporary file the downloaded result is written to before it is read back
POSE_EXT = 'mol2'

# LiveDesign column properties. `VALUE_TYPE_3D` is compared against a column's
# `value_type` to select the columns that hold 3D data.
COMPOUND_STRUCTURE = 'Compound Structure'
VALUE_TYPE_3D = '3D'
FREEFORM_COLUMN = 'freeform'

# Structure properties. `SD_ID` is read from the imported 2D structures as the
# compound's display / corporate ID.
SD_ID = 's_sd_ID'
SD_ALL_IDS = 's_sd_All_IDs'

# Maestro Grouping: key under which `ImportController.import3DColumns` stores
# the 2D tabular data in its result
GROUPNAME_2D_DATA = '2D Compounds'

# LiveDesign metadata keys: `ROW_INFO_KEY`, `DISPLAY_ID_KEY` and `ENTITY_ID_KEY`
# index the LiveReport results metadata; the remaining keys index the pose
# dictionaries returned by `LDClient.pose_search()`
ROW_INFO_KEY = 'row_infos'
DISPLAY_ID_KEY = 'display_id'
ENTITY_ID_KEY = 'entity_id'
STRUCTURE_ATTACHMENTS = 'structure_attachments'
STRUCTURE_TRANSFORMATION = 'structure_transformation'
FILE_UPLOAD = 'file_upload'

# When retrieving structures from `LDClient.export_xtals_result()`, the title
# property of those structures will be a period-delimited string containing
# multiple pieces of information, of the form
# `<col title>.<corporate_id>.pose_<pose_id>.<...>`. `NUM_TITLE_PROPERTIES`
# records the number of properties we expect the title to store so that if the
# parsed string produces more than this number of terms, we know that the first
# element (column title) contains periods.
NUM_TITLE_PROPERTIES = 5


class ImportController(object):
    """
    Downloads LiveReport data from a LiveDesign server through an `LDClient`:
    the tabular (2D) data of a LiveReport, and the ligand / protein structures
    stored in its 3D columns.
    """

    def __init__(self, ld_client):
        """
        :param ld_client: the client used for every request to the LiveDesign
            server
        :type ld_client: LDClient
        """
        self.ld_client = ld_client
        self._version = login.get_LD_version(self.ld_client)
        # Maps each imported 2D structure to its entity (corporate) ID.
        # Populated by `importTabularData()`.
        self._ld_id_dict = {}

    def importTabularData(self, lr_id, lr_col_ids):
        """
        Import the tabular data from a Live Report given the live report id.
        This data will include identifiers for data in attachment columns,
        which can be imported with further calls to ImportController.

        :param lr_id: the live report id
        :type lr_id: str

        :param lr_col_ids: ids of the desired columns from the live report. If
            `None`, all column data will be downloaded.
        :type lr_col_ids: `None`, or `list` of `int`

        :return: structures containing all the tabular data
        """
        res_str = self.ld_client.export_live_report(lr_id,
                                                    projection=lr_col_ids)
        res_str = str(res_str, encoding='utf-8')
        with structure.StructureReader.fromString(
                res_str, format=structure.SD) as reader:
            tabular_data_sts = list(reader)

        # Store the corporate ID (AKA the entity ID) as a structure property
        # for each imported structure.
        self._ld_id_dict = get_st_entity_id_map(self.ld_client,
                                                tabular_data_sts, lr_id)
        cache_entity_IDs(tabular_data_sts, self._ld_id_dict)
        return tabular_data_sts

    def import3DColumns(self,
                        lr_id,
                        lr_columns,
                        tabular_data_sts,
                        lr_rows,
                        callback=lambda col_name: None):
        """
        Downloads 3D data for any columns in `lr_columns` with a 3D value
        type, and creates new structures from the 3D data if there is any,
        copying properties corresponding to columns in `lr_columns` from
        `tabular_data_sts`.

        Combines the 2D tabular data, which is always included as the first
        member of the resulting ordered dictionary. Then any 3D data columns
        are appended in order.

        The output is structured as an ordered dictionary with fields
        representing columns, including the 2D data. Each column consists of an
        ordered dictionary of {pose_id: ligand} mappings and an ordered
        dictionary of {protein: pose_id} mappings.

        :param lr_id: The live report id
        :type lr_id: str

        :param lr_columns: the desired columns from the live report
        :type lr_columns: [ld_models.Column]

        :param tabular_data_sts: The tabular data imported from LiveDesign
        :type tabular_data_sts: [structure.Structure]

        :param lr_rows: a dictionary containing row information downloaded from
            the LiveDesign server
        :type lr_rows: `dict`

        :param callback: a callback function that takes the name of the
            current column as its only argument, ie callback(str)
        :type callback: callable

        :return: An ordered dictionary by column name of pairs or ordered
            dictionaries containing the pose_id: ligand and protein: pose_id
            mappings.
        :rtype: {str: ({int: structure.Structure}, {structure.Structure: int})}
        """
        imported_sts = OrderedDict()

        # The 2D data has no proteins: every dummy index is filed under the
        # `None` protein key.
        combined_sts = self._combineTabularData(tabular_data_sts)
        empty_protein_dict = {None: list(combined_sts)}
        imported_sts[GROUPNAME_2D_DATA] = (combined_sts, empty_protein_dict)

        cols_3d = [col for col in lr_columns if col.value_type == VALUE_TYPE_3D]
        for col_3d in cols_3d:
            callback(col_3d.name)
            combined_sts, protein_dict = self._import3DCol(
                lr_id, col_3d, tabular_data_sts, lr_rows)
            imported_sts[col_3d.name] = combined_sts, protein_dict
        return imported_sts

    def _import3DCol(self, lr_id, col_3d, bare_sts, lr_rows):
        """
        Import xtal data from a single column for a list of structures. Each
        structure can have 0 to many poses in a column. Returns an ordered
        dictionary of {pose_id: ligand} mappings and an ordered dictionary of
        {protein: pose_id} mappings.

        :param lr_id: The live report id
        :type lr_id: str

        :param col_3d: a livedesign models Column object representing a 3D
            data column.
        :type col_3d: ld_models.Column

        :param bare_sts: a list of structures, each having a property
            corresponding to col_3d, whose value is a list of pose ids.
        :type bare_sts: [structure.Structure]

        :param lr_rows: a dictionary containing row information downloaded from
            the LiveDesign server
        :type lr_rows: `dict`

        :return: the pose_id: ligand mappings and the protein: pose_id mappings
        :rtype: OrderedDict({int: structure.Structure}),
            OrderedDict({structure.Structure: int})
        """
        lig_sources, prot_sources, prot_duplicates, ligand_duplicates = \
            self._makeXtalSourceDicts(lr_id, col_3d, bare_sts, lr_rows)
        ligand_dict, protein_dict = self._downloadStructures(
            lig_sources, prot_sources, prot_duplicates, ligand_duplicates)
        combined_sts = self._combineStructures(bare_sts, ligand_dict, col_3d)
        return combined_sts, protein_dict

    def _combineStructures(self, bare_sts, ligand_dict, col_3d):
        """
        Match the bare 2D ligand structures to the 3D ligand structures, and
        copy over the structure properties.

        :param bare_sts: the original structures, each representing a row of a
            live report
        :type bare_sts: [structure.Structure]

        :param ligand_dict: A nested ordered dictionary of the ligand
            structures, keyed by corporate id and pose id.
        :type ligand_dict: {str: OrderedDict({int: structure.Structure})}

        :param col_3d: a livedesign models Column object representing a 3D
            data column.
        :type col_3d: ld_models.Column

        :return: An ordered dictionary of the ligand structures keyed by pose id
        :rtype: OrderedDict({int: structure.Structure})
        """
        # FIXME: This should be separated out into separate classes. Also, this
        # should no longer be done since export_3d.py in LD should be modified
        # to return the 3D data with the properties instead of matching the
        # 2D structures with the 3D to copy the structure properties.
        if self._version < login.LD_VERSION_REAL_VIRTUAL:
            return self._combineStructuresUsingEntityID(bare_sts, ligand_dict)
        return self._combineStructuresUsingPoseID(bare_sts, ligand_dict,
                                                  col_3d)

    def _combineStructuresUsingEntityID(self, bare_sts, ligand_dict):
        """
        Combine all of the downloaded structures with their original structures
        using the Entity ID and assemble them into an ordered dictionary keyed
        by pose id. All properties are copied from their original structures.
        The corporate id is added to the title of each structure, which is then
        flattened out of the resulting dictionary.

        :param bare_sts: the original structures, each representing a row of a
            live report
        :type bare_sts: [structure.Structure]

        :param ligand_dict: A nested ordered dictionary of the ligand
            structures, keyed by corporate id and pose id.
        :type ligand_dict: {str: OrderedDict({int: structure.Structure})}

        :return: An ordered dictionary of the ligand structures keyed by pose id
        :rtype: OrderedDict({int: structure.Structure})
        """
        pose_sts = OrderedDict()
        if not ligand_dict:
            return pose_sts

        for st in bare_sts:
            # For LD versions >= 8.1, we must determine the entity / corporate
            # ID specifically due to the loss of order of the structures.
            corporate_id = self._ld_id_dict.get(st)
            # Use `get` so that a row without downloaded ligands is skipped
            # without inserting an empty entry into `ligand_dict`.
            pose_lig_odict = ligand_dict.get(corporate_id)
            if not pose_lig_odict:
                continue
            for pose_id, lig_st in pose_lig_odict.items():
                combined_lig_st = self._combineStructure(st, st_3d=lig_st)
                combined_lig_st.title = f'{corporate_id} (pose {pose_id})'
                pose_sts[pose_id] = combined_lig_st
        return pose_sts

    def _combineStructuresUsingPoseID(self, bare_sts, ligand_dict, col_3d):
        """
        Combine all of the downloaded structures with their original structures
        using the Pose IDs and assemble them into an ordered dictionary keyed
        by pose id. All properties are copied from their original structures.
        The corporate id is added to the title of each structure, which is then
        flattened out of the resulting dictionary.

        Rows that hold no pose ids for `col_3d` contribute nothing to the
        result.

        Note: This is a temporary fix for 17-4. There has been a change for LD
        8.2+ where the real compounds are separated from virtual compounds, so
        the entity ID returned by LDClient methods will be different according
        to which type of compound (real or virtual) the method is acting on.

        :param bare_sts: the original structures, each representing a row of a
            live report
        :type bare_sts: [structure.Structure]

        :param ligand_dict: A nested ordered dictionary of the ligand
            structures, keyed by corporate id and pose id.
        :type ligand_dict: {str: OrderedDict({int: structure.Structure})}

        :param col_3d: a livedesign models Column object representing a 3D
            data column.
        :type col_3d: ld_models.Column

        :return: An ordered dictionary of the ligand structures keyed by pose id
        :rtype: OrderedDict({int: structure.Structure})

        :raise KeyError: if a row lists a pose id for which no ligand was
            downloaded, or a row with poses has no `SD_ID` property
        """
        # FIXME: This is a temporary fix for 17-4 so that it will work with the
        # current production version of LD 8.2, which has reinvented the
        # definition of the entity ID so it is no longer identical across the
        # 2D and 3D data to do the matching. Thus, the pose ID is used instead,
        # which from my understanding should be identical in 8.2. See
        # PANEL-11225 for more info.
        pose_sts = OrderedDict()
        if not ligand_dict:
            return pose_sts

        # Arrange the ligands by their pose IDs
        ligand_by_pose_ids = {}
        for pose_lig_odict in ligand_dict.values():
            ligand_by_pose_ids.update(pose_lig_odict)

        # Each 2D structure holds the LR column data as a property with the
        # column name being the key.
        col_3d_string_property = structure.PropertyName(
            type=structure.PROP_STRING, family='sd',
            username=col_3d.name).dataName()
        col_3d_int_property = structure.PropertyName(
            type=structure.PROP_INTEGER, family='sd',
            username=col_3d.name).dataName()

        for st in bare_sts:
            # Get the pose IDs related to this 2D structure / row in LR, and
            # use it to match the 3D ligand structures
            prop_value = st.property.get(col_3d_string_property)
            if prop_value is None:
                prop_value = st.property.get(col_3d_int_property)
            if prop_value is None:
                # The row has no value for this column. Without this guard
                # `str(None)` would be parsed as the pose id 'None' and
                # `int()` would raise `ValueError`.
                continue

            string_ids = str(prop_value)
            pose_ids = [int(p_id) for p_id in string_ids.split('\n') if p_id]
            if not pose_ids:
                continue

            corporate_id = st.property[SD_ID]
            for pose_id in pose_ids:
                lig_st = ligand_by_pose_ids[pose_id]
                combined_lig_st = self._combineStructure(st, st_3d=lig_st)
                combined_lig_st.title = f'{corporate_id} (pose {pose_id})'
                pose_sts[pose_id] = combined_lig_st
        return pose_sts

    def _makeLiveDesignIDsDict(self, lr_id):
        """
        Generate a dictionary mapping each compound's display id in the LR to
        its entity id for easier access.

        :param lr_id: the live report id
        :type lr_id: str

        :return: dictionary mapping display ids of compounds to their entity id
        :rtype: `dict(str, str)`
        """
        # This returns all the metadata about the LiveReport in JSON format
        lr_results_metadata = self.ld_client.live_report_results_metadata(lr_id)
        row_info_list = lr_results_metadata[ROW_INFO_KEY]
        ld_id_dict = {
            row_info_dict[DISPLAY_ID_KEY].strip():
            row_info_dict[ENTITY_ID_KEY].strip()
            for row_info_dict in row_info_list
        }
        return ld_id_dict

    def _makeXtalSourceDicts(self, lr_id, col_3d, bare_sts, lr_rows):
        """
        Generate the source dictionaries used as an argument to
        ld_client.export_xtals. Creates a list of dictionaries for both the
        ligands and proteins, along with the duplicate pose ids for the
        proteins.

        :param lr_id: The live report id
        :type lr_id: str

        :param col_3d: The live report column
        :type col_3d: ld_models.Column

        :param bare_sts: The structures to make dictionaries for
        :type bare_sts: [structure.Structure]

        :param lr_rows: a dictionary containing row information downloaded from
            the LiveDesign server
        :type lr_rows: `dict`

        :return: the ligand sources, protein sources, and the duplicate protein
            pose ids, and the duplicate ligand corporate and pose ids.
        :rtype: [{}], [{}], {int: [int]}, {str: {int: [(str, int)]}
        """
        # FIXME: Once all LD servers are upgraded to 8.1, we can remove this
        # check.
        is_legacy_server = self._version < login.LD_VERSION_MULTIPLE_IDS
        if is_legacy_server:
            protein_duplicates = defaultdict(list)
            ligand_duplicates = defaultdict(list)
            params = self._getXtalSourceParamsDepracated(
                col_3d, bare_sts, lr_rows)
        else:
            params, protein_duplicates, ligand_duplicates = \
                self._getXtalSourceParams(col_3d, lr_id)

        lig_sources = []
        prot_sources = []
        for pose_id, corporate_id in params:
            source_ligand = {
                MODEL_NAME: col_3d.name,
                FILE_NAME: None,
                STRUCTURE_TYPE: LIGAND,
                CORPORATE_ID: corporate_id,
                POSE_ID: pose_id
            }
            lig_sources.append(source_ligand)

            # If LD server is >= 8.1, we know which proteins are duplicates and
            # thus not setup dicts for them to download multiple times. The
            # keys of `protein_duplicates` are exactly the unique pose ids, and
            # a membership test does not insert into the defaultdict.
            if is_legacy_server or pose_id in protein_duplicates:
                source_target = source_ligand.copy()
                source_target[STRUCTURE_TYPE] = TARGET
                prot_sources.append(source_target)
        return lig_sources, prot_sources, protein_duplicates, ligand_duplicates

    def _getXtalSourceParams(self, col_3d, lr_id):
        """
        Retrieve the pose ids and corporate ids of all structures in the LR and
        also compile the duplicate protein pose ids.

        :param col_3d: The live report column
        :type col_3d: ld_models.Column

        :param lr_id: The live report id
        :type lr_id: str

        :var protein_id_cache: Cache of protein structure attachment ids -
            key = id : value = pose_id.
        :vartype protein_id_cache: dict{str, int}

        :var ligand_id_cache: Cache of ligand structure attachment ids -
            key = id : value = (corporate_id, pose_id).
        :vartype ligand_id_cache: dict{str, (str, int)}

        :var protein_duplicates: Maps unique pose ids to duplicate pose ids -
            key = pose_id : value = duplicate pose_ids
        :vartype protein_duplicates: defaultdict(int, list[int])

        :var ligand_duplicates: Maps unique corporate_ids to unique pose_ids to
            list of tuples of duplicate corporate and pose ids.
        :vartype ligand_duplicates: dict(str, dict(int, list[(str, int)]))

        :return: the pose ids and corporate ids of all the structures in the LR
            rows, and a map of unique protein pose ids to the duplicates, a map
            of the duplicate ligand corporate and pose ids.
        :rtype: [(int, str)], {int: [int]}, {str: {int: [(str, int)]}
        """
        # Keeps track of pose_ids and corporate_ids for each row in LR
        params = []

        # FIXME: This is done so to find duplicates using the corporate id, and
        # pose ids. This will be refactored in PANEL-11245.
        protein_id_cache = {}
        ligand_id_cache = {}
        protein_duplicates = defaultdict(list)
        ligand_duplicates = defaultdict(OrderedDict)

        pose_dicts = self.ld_client.pose_search(lr_id)
        for pose in pose_dicts:
            if pose['column_id'] != col_3d.id:
                continue
            corporate_id = pose['ligand']['entity_id']
            pose_id = int(pose['id'])
            params.append((pose_id, corporate_id))

            # Add Protein id to cache to eliminate duplicate proteins.
            # For the case when the protein doesn't exist, the pose_id will be
            # added to the 'None' key.
            protein = pose['protein']
            protein_id = None
            if protein:
                protein_id = self._getStructureAttachmentID(
                    protein[STRUCTURE_ATTACHMENTS])
            if protein_id in protein_id_cache:
                original_pose_id = protein_id_cache[protein_id]
                protein_duplicates[original_pose_id].append(pose_id)
            else:
                protein_id_cache[protein_id] = pose_id
                protein_duplicates[pose_id] = []

            # Poses without ligand attachments take no part in the ligand
            # duplicate bookkeeping.
            ligand = pose['ligand']
            if not (ligand and ligand[STRUCTURE_ATTACHMENTS]):
                continue
            ligand_id = self._getStructureAttachmentID(
                ligand[STRUCTURE_ATTACHMENTS])
            if ligand_id in ligand_id_cache:
                org_corporate_id, original_pose_id = ligand_id_cache[ligand_id]
                ligand_duplicates[org_corporate_id][original_pose_id].append(
                    (corporate_id, pose_id))
            else:
                ligand_id_cache[ligand_id] = (corporate_id, pose_id)
                ligand_duplicates[corporate_id][pose_id] = []
        return params, protein_duplicates, ligand_duplicates

    def _getStructureAttachmentID(self, structure_attachments):
        """
        Given a list of structure attachment metadata, get the appropriate
        'file_upload' attachment ID.

        Each protein or ligand pose dictionary object returned by LDClient
        holds multiple structure attachment IDs, where only the
        'structure_transformation' field holding the type 'file_upload' is the
        relevant ID required.

        :param structure_attachments: metadata dictionaries returned by
            LDClient
        :type structure_attachments: List of Dict

        :return: ID of the first 'file_upload' attachment, or `None` if there
            is no such attachment
        :rtype: str or None
        """
        for st_attach in structure_attachments:
            if st_attach[STRUCTURE_TRANSFORMATION] == FILE_UPLOAD:
                return st_attach['id']
        return None

    def _getXtalSourceParamsDepracated(self, col_3d, bare_sts, lr_rows):
        """
        Generate the list of pose ids and corporate ids of the given 2D
        structures and the LR rows.

        Warning: this method will be removed once all LD servers migrate to 8.1
        or above.

        :param col_3d: The live report column
        :type col_3d: ld_models.Column

        :param bare_sts: The structures to make dictionaries for
        :type bare_sts: [structure.Structure]

        :param lr_rows: a dictionary containing row information downloaded from
            the LiveDesign server
        :type lr_rows: `dict`

        :return: the pose ids and corporate ids of all the structures in the LR
            rows
        :rtype: [(str, str)]
        """
        params = []
        for st in bare_sts:
            values = lr_rows[st.title]['cells'][col_3d.id]['values']
            corporate_id = st.property[SD_ID]
            params.extend(
                (value_dict['value'], corporate_id) for value_dict in values)
        return params

    def _downloadStructures(self, lig_sources, prot_sources, prot_duplicates,
                            ligand_duplicates):
        """
        Download all of the xtal data from livedesign defined in the protein
        and ligand sources lists.

        The protein sources and ligand sources are treated differently: one
        export task is launched per protein source, whereas all ligand sources
        are exported with a single task.

        :param lig_sources: A list of source dictionaries. Each dictionary
            represents a pose.
        :type lig_sources: [{}]

        :param prot_sources: A list of source dictionaries. Each dictionary
            represents a protein.
        :type prot_sources: [{}]

        :param prot_duplicates: a map of unique pose IDs to a list of pose IDs
            that share the same structure
        :type prot_duplicates: {int: [int]}

        :param ligand_duplicates: a map of unique corporate IDs to a map of
            unique pose IDs to a list of duplicate corporate and pose IDs,
            where each of these duplicate ids share the same structure.
        :type ligand_duplicates: {str: {int: [(str, int)]}

        :return: a nested dictionary of ligand structures keyed by corporate id
            and pose id, and a mapping from protein structures to pose ids.
        :rtype: {str: OrderedDict({int: structure.Structure})},
            OrderedDict({structure.Structure, [int]})
        """
        # task order chosen b/c protein will take the longest by far
        # launch protein export task
        # break up proteins to download one by one
        prot_task_ids = []
        for prot_source in prot_sources:
            # FIXME: Pre-8.1: There currently isn't a way to figure out whether
            # a particular pose contains a protein or not, so we have to catch
            # the HTTP exception in case the pose doesn't hold a protein. This
            # check can be removed once LD servers are updated.
            try:
                prot_task_id = self.ld_client.export_xtals([prot_source],
                                                           POSE_EXT)
            except requests.HTTPError:
                # No protein was found for this pose
                prot_task_id = None
            prot_task_ids.append((prot_task_id, prot_source[POSE_ID]))

        ligand_dict = self._downloadLigandStructures(lig_sources,
                                                     ligand_duplicates)
        protein_dict = self._getDownloadedProteins(prot_task_ids)
        self._appendDuplicateProteins(protein_dict, prot_duplicates)
        return ligand_dict, protein_dict

    def _downloadLigandStructures(self, lig_sources, ligand_duplicates):
        """
        Download the ligand structures from LiveDesign using the source dicts.
        Any duplicate structures stripped out by LD are added back in using the
        ligand_duplicates.

        :param lig_sources: A list of source dictionaries. Each dictionary
            represents a pose.
        :type lig_sources: [{}]

        :param ligand_duplicates: a map of unique corporate IDs to a map of
            unique pose IDs to a list of duplicate corporate and pose IDs,
            where each of these duplicate ids share the same structure.
        :type ligand_duplicates: {str: {int: [(str, int)]}

        :return: a nested dictionary of ligand structures keyed by corporate id
            and pose id
        :rtype: {str: OrderedDict({int: structure.Structure})}
        """
        if not lig_sources:
            return defaultdict(OrderedDict)

        # launch ligand export task
        lig_task_id = self.ld_client.export_xtals(lig_sources, POSE_EXT)

        # get ligand task results
        lig_res_url = self.ld_client.wait_and_get_result_url(lig_task_id)
        lig_xtal_res = self.ld_client.export_xtals_result(lig_res_url)
        ligand_sts = self._readXtalResult(lig_xtal_res)

        # If I understand ldclient correctly this should always be True
        # It is possible the number of returned structures is not equal to the
        # number of input source dicts as LD strips out the duplicate
        # structures for versions 8.1+.
        msg = ('The number of ligand structures is greater than the number of'
               ' sources.')
        assert ligand_sts and len(ligand_sts) <= len(lig_sources), msg

        ligand_dict = self._orderDownloadedLigands(lig_sources, ligand_sts,
                                                   ligand_duplicates)
        return ligand_dict

    def _orderDownloadedLigands(self, lig_sources, ligand_sts, duplicates):
        """
        Starting from 8.1 the results from LDClient.export_xtals_result() no
        longer guarantees the order of the structures returned will match the
        input source dicts. Thus, for LD servers 8.1 and above, the title of
        structures are parsed to retrieve the pose and corporate ids.

        :param lig_sources: A list of source dictionaries. Each dictionary
            represents a pose.
        :type lig_sources: [{}]

        :param ligand_sts: a list of the ligand structures
        :type ligand_sts: [structure.Structure]

        :param duplicates: a map of unique ligand corporate ids to a map of
            pose ids to duplicate ids.
        :type duplicates: {str: {int: [(str, int)]}

        :return: a nested dictionary of ligand structures keyed by corporate id
            and pose id
        :rtype: {str: OrderedDict({int: structure.Structure})}
        """
        ligand_dict = defaultdict(OrderedDict)
        if self._version >= login.LD_VERSION_MULTIPLE_IDS:
            for lig_st in ligand_sts:
                # For some reason the structure's LD properties are set within
                # the title as: <col title>.<corporate_id>.pose_<pose_id>.ETC;
                # because the column <col title> itself may contain periods, we
                # must split this string apart and then remove however many
                # "extra" elements there might be for the column title
                st_properties = lig_st.title.split('.')
                non_title_props = st_properties[-NUM_TITLE_PROPERTIES + 1:]
                corporate_id = non_title_props[0]
                pose_id = int(non_title_props[1].replace('pose_', ''))
                ligand_dict[corporate_id][pose_id] = lig_st

                # Add in the duplicate entries here so we don't have to
                # traverse the list twice.
                self._appendDuplicateLigands(corporate_id, pose_id,
                                             ligand_dict, duplicates)
        else:
            for source, lig_st in zip(lig_sources, ligand_sts):
                corporate_id, pose_id = source[CORPORATE_ID], source[POSE_ID]
                ligand_dict[corporate_id][pose_id] = lig_st
        return ligand_dict

    def _appendDuplicateLigands(self, org_corporate_id, org_pose_id,
                                ligand_dict, duplicates):
        """
        The duplicate ligand corporate and pose ids are added back into the
        dictionary of ligand structures keyed by corporate id and pose id to
        ensure the dictionary data returned is identical for all versions of LD.
        This measure will be unnecessary once all servers are upgraded to 8.1.

        A ligand without an entry in `duplicates` is treated as having no
        duplicates.

        :param org_corporate_id: the unique corporate id for which we will find
            the duplicates for.
        :type org_corporate_id: str

        :param org_pose_id: the unique pose id for which we will find the
            duplicates for.
        :type org_pose_id: int

        :param ligand_dict: a nested dictionary of ligand structures keyed by
            corporate id and pose id; modified in place
        :type ligand_dict: {str: OrderedDict({int: structure.Structure})}

        :param duplicates: a map of unique ligand corporate ids to a map of
            pose ids to duplicate ids.
        :type duplicates: {str: {int: [(str, int)]}
        """
        if self._version < login.LD_VERSION_MULTIPLE_IDS:
            return

        # Get the ligand to be duplicated
        lig_st = ligand_dict[org_corporate_id][org_pose_id]

        # `get` neither raises for an unrecorded ligand nor inserts a new
        # entry when `duplicates` is a defaultdict.
        dup_entries = duplicates.get(org_corporate_id, {}).get(org_pose_id, ())

        # Add the duplicate entries into the ligand dictionary along with the
        # st
        for dup_corporate_id, dup_pose_id in dup_entries:
            ligand_dict[dup_corporate_id][dup_pose_id] = copy.deepcopy(lig_st)

    def _appendDuplicateProteins(self, protein_dict, duplicates):
        """
        The duplicate protein pose ids are added back in to ensure the data
        returned is identical for all versions of LD. This measure will be
        unnecessary once all servers are upgraded to 8.1.

        `protein_dict` is modified in place; nothing is returned.

        :param protein_dict: a mapping from protein structures to pose ids
        :type protein_dict: OrderedDict({structure.Structure, [int]})

        :param duplicates: a map of unique pose ids to the duplicates
        :type duplicates: {int: [int]}
        """
        if self._version < login.LD_VERSION_MULTIPLE_IDS:
            return

        for pose_ids in protein_dict.values():
            # Iterate over a snapshot: the list itself is extended below.
            for pose_id in list(pose_ids):
                if pose_id in duplicates:
                    pose_ids.extend(duplicates[pose_id])

    def _getDownloadedProteins(self, task_ids):
        """
        Wait until the protein download tasks in `task_ids` have finished and
        return the results as mappings from protein structures to pose ids.

        Pose ids whose task is missing, returned nothing, or returned a
        structure with at most one atom are collected under the `None` key.

        :param task_ids: a list of (task id, pose id) pairs associated with
            each protein export
        :type task_ids: [(int, int)]

        :return: a mapping from proteins to the lists of pose ids containing
            those proteins.
        :rtype: OrderedDict({structure.Structure, [int]})
        """
        # mapping from unique proteins to lists of pose ids
        protein_dict = OrderedDict()
        protein_dict[None] = []

        # cache for finding unique proteins based on string equivalence,
        # returns id for protein_dict
        protein_cache = {}

        for prot_task_id, pose_id in task_ids:
            if prot_task_id:
                prot_res_url = self.ld_client.wait_and_get_result_url(
                    prot_task_id)
                prot_xtal_res = self.ld_client.export_xtals_result(prot_res_url)
            else:
                # For LD versions < 8.1 it is possible the task_id is None
                # since a protein for this pose_id might not exist. In this
                # case, the structure is automatically set to None as well.
                prot_xtal_res = None

            # sometimes xtals_result returns empty string
            if prot_xtal_res:
                if prot_xtal_res in protein_cache:
                    protein_st = protein_cache[prot_xtal_res]
                    protein_dict[protein_st].append(pose_id)
                    continue

                # Guard against a result from which no structure can be read
                read_sts = self._readXtalResult(prot_xtal_res)
                if read_sts and read_sts[0].atom_total > 1:
                    protein_st = read_sts[0]
                    # each time we find a new protein:
                    # store protein with current id
                    protein_dict[protein_st] = [pose_id]
                    # make res string point to prot id in cache
                    protein_cache[prot_xtal_res] = protein_st
                    continue

            # this task_id returns no valid protein
            protein_dict[None].append(pose_id)
        return protein_dict

    def _readXtalResult(self, xtal_res):
        """
        Read the result string in mol2 format from export_xtals_result and
        return a list of the structures contained within.

        :param xtal_res: A string in mol2 format
        :type xtal_res: `str` or `bytes`

        :return: a list of the structures from xtal_res; empty if `xtal_res`
            is empty
        :rtype: [structure.Structure]
        """
        # Convert input to str if it is provided as bytes. Decoding must be
        # conditional: `str(value, encoding=...)` raises `TypeError` when
        # `value` already is a `str`.
        if isinstance(xtal_res, (bytes, bytearray)):
            xtal_res = xtal_res.decode('utf-8')
        if not xtal_res:
            return []

        suffix = '.' + POSE_EXT
        with fileutils.tempfilename('ld_xtals', suffix) as xtal_filename:
            # Write with the encoding the text was decoded with, rather than
            # the platform default.
            with open(xtal_filename, 'w', encoding='utf-8') as xtal_file:
                xtal_file.write(xtal_res)
            with structure.StructureReader(xtal_filename) as reader:
                xtal_sts = list(reader)
        return xtal_sts

    def _combineTabularData(self, tabular_data_sts):
        """
        Create an ordered dict from structures copied from "tabular_data_sts".

        :param tabular_data_sts: The tabular data imported from LiveDesign
        :type tabular_data_sts: [structure.Structure]

        :return: Copies of the original structures, keyed by dummy indices to
            conform to the same data structures as _import3DCol
        :rtype: OrderedDict({int: structure.Structure})
        """
        return OrderedDict(
            (idx, st.copy()) for idx, st in enumerate(tabular_data_sts))

    def _combineStructure(self, st, st_3d):
        """
        Combine the structures' properties by copying properties from one
        structure to another.

        :param st: the structure whose properties are to be copied.
        :type st: `structure.Structure`

        :param st_3d: structure with 3d data - the desired properties from
            `st` are copied here
        :type st_3d: `structure.Structure`

        :return: `st_3d`, after the properties of `st` have been added to it
        :rtype: `structure.Structure`
        """
        for prop_name in list(st.property):
            st_3d.property[prop_name] = st.property[prop_name]
        return st_3d
def cache_entity_IDs(sts, ld_id_dict):
    """
    Record the entity ID (AKA the corporate ID) of every imported structure as
    a property on that structure.

    Structures that are missing from `ld_id_dict` have `None` passed as their
    ID.

    :param sts: structures to annotate
    :type sts: list(structure.Structure)

    :param ld_id_dict: maps a LiveDesign structure to its entity ID
    :type ld_id_dict: Dict[Structure, str]
    """
    for imported_st in sts:
        entity_id = ld_id_dict.get(imported_st)
        ld_utils.safely_set_property(
            imported_st,
            constants.PROPNAME_IMPORT_ENTITY_ID,
            entity_id,
        )
def get_st_entity_id_map(ld_client, sts, lr_id):
    """
    Build a lookup from each compound structure to its entity ID.

    For LD versions < 8.1 the primary LD ID stored on the structure is used as
    the entity ID.

    For LD versions >= 8.1 every structure holds multiple LD IDs, only one of
    which is the entity ID, so the LiveReport metadata is consulted to
    translate the structure's display ID into the right one. Structures
    without a display ID, or whose display ID is not in the metadata, map to
    `None`.

    :param ld_client: LiveDesign client
    :type ld_client: LDClient

    :param sts: structures to build the map for
    :type sts: list(structure.Structure)

    :param lr_id: the live report id
    :type lr_id: str

    :return: dictionary mapping each structure to its entity ID
    :rtype: `dict(structure.Structure, str)`
    """
    if login.get_LD_version(ld_client) < login.LD_VERSION_MULTIPLE_IDS:
        return {st: st.property[SD_ID] for st in sts}

    # The metadata lists, per LiveReport row, the display and entity IDs
    metadata = ld_client.live_report_results_metadata(lr_id)
    entity_id_by_display_id = {}
    for row_info in metadata[ROW_INFO_KEY]:
        display_id = row_info[DISPLAY_ID_KEY].strip()
        entity_id_by_display_id[display_id] = row_info[ENTITY_ID_KEY].strip()

    # `get` is used for the property because some structures don't have
    # corporate IDs
    entity_id_by_st = {}
    for st in sts:
        display_id = st.property.get(SD_ID)
        entity_id_by_st[st] = entity_id_by_display_id.get(display_id)
    return entity_id_by_st