Source code for schrodinger.application.matsci.unique_species
"""
Module to calculate unique species from structure object
Copyright Schrodinger, LLC. All rights reserved.
"""
from collections import defaultdict
from schrodinger.application.matsci import msutils
from schrodinger.application.matsci import rdpattern
from schrodinger.structutils import analyze
# Structure-level property key under which each extracted molecule structure
# records the number of the molecule it came from in the parent structure, so
# results can be mapped back to the original structure.
MOLECULE_NUM = 'i_matsci_molecule_number'
class UniqueMolecules:
    """
    Class to calculate unique molecules in a structure.

    Molecules are screened in three passes: by atom count, then by molecular
    formula, and finally by molecular SMARTS. The first two passes only exist
    to reduce the number of molecules handed to the SMARTS comparison.
    """

    def __init__(self, struct):
        """
        Constructs a new instance of UniqueMolecules

        :param struct: The structure
        :type struct: `structure.Structure`
        """
        # Work on a copy so the caller's structure is never touched
        self.struct = struct.copy()
        self.unique_mol_nums = self.getUniqueMols()

    def _getAllMolSts(self):
        """
        Extracts all the molecules from the structure and sets the
        MOLECULE_NUM property on each one to map it back to the original
        structure

        :returns: All extracted molecules from structure
        :rtype: list(`structure.Structure`)
        """
        mol_sts = []
        for mol in self.struct.molecule:
            mol_st = mol.extractStructure()
            # Remember which molecule of self.struct this structure came from
            mol_st.property[MOLECULE_NUM] = mol.number
            mol_sts.append(mol_st)
        return mol_sts

    def _splitUniqueValue(self, prop_values):
        """
        Flattens all the items in the values of the input dict into two lists.
        Each value of the input dictionary should be a list. Items from lists
        of length 1 are returned in the first list, items from all other lists
        are returned in the second list.

        :param dict prop_values: values are lists of items

        :rtype: (list, list)
        :returns: The first list contains all the items that were in 1-item
            lists. The second list contains all the items that were in the
            remaining lists.
        """
        unique_vals, non_unique_vals = [], []
        for values in prop_values.values():
            if len(values) == 1:
                unique_vals.append(values[0])
            else:
                non_unique_vals.extend(values)
        return unique_vals, non_unique_vals

    def splitUniqueMolsUsingNumAtoms(self, mol_sts):
        """
        Splits unique mols and non-unique molecules using number of atoms.
        Unique molecules have a unique number of atoms in the system.

        :param list mol_sts: A list of extracted molecule structures

        :returns: The first element is the list of molecules with a unique
            number of atoms and the second element is the list of molecules
            that have a non-unique number of atoms
        :rtype: tuple(list, list)
        """
        num_atoms_mol = defaultdict(list)
        for mol in mol_sts:
            num_atoms_mol[mol.atom_total].append(mol)
        return self._splitUniqueValue(num_atoms_mol)

    def splitUniqueMolsUsingFormula(self, mol_sts):
        """
        Splits unique mols and non-unique molecules using molecular formula.
        Unique molecules have a unique molecular formula in the system.

        NOTE(review): getUniqueMols calls this method but it was absent from
        the class, so construction raised AttributeError. The grouping key
        used here (analyze.generate_molecular_formula) is inferred from the
        module's imports and the sibling atom-count method - confirm against
        the upstream implementation.

        :param list mol_sts: A list of extracted molecule structures

        :returns: The first element is the list of molecules with a unique
            molecular formula and the second element is the list of molecules
            that have a non-unique molecular formula
        :rtype: tuple(list, list)
        """
        formula_mol = defaultdict(list)
        for mol in mol_sts:
            formula = analyze.generate_molecular_formula(mol)
            formula_mol[formula].append(mol)
        return self._splitUniqueValue(formula_mol)

    def getUniqueMolsFromSmarts(self, mol_sts):
        """
        Get representative molecule for each unique molecular SMARTS.

        :param list mol_sts: A list of extracted molecule structures

        :returns: Representative structure for each molecule with unique
            molecular SMARTS. The representative is the first molecule in
            mol_sts that produced that SMARTS.
        :rtype: list(`structure.Structure`)
        """
        unique_mols = []
        seen_smarts = set()
        for mol in mol_sts:
            smarts = rdpattern.to_smarts(mol)
            if smarts not in seen_smarts:
                seen_smarts.add(smarts)
                unique_mols.append(mol)
        return unique_mols

    def getUniqueMols(self):
        """
        Get the molecule number of unique representative molecules in the
        structure

        :rtype: list
        :return: list of molecule numbers that are unique
        """
        all_mol_structs = self._getAllMolSts()

        # Separate the molecules based on number of atoms
        num_unique_mols, non_unique_mols = self.splitUniqueMolsUsingNumAtoms(
            all_mol_structs)

        # Separate the non-unique molecules based on the chemical formula
        formula_unique_mols, non_unique_mols = self.splitUniqueMolsUsingFormula(
            non_unique_mols)

        # Find the representative molecules for each SMARTS among the
        # non-unique molecules. This is the robust method but takes significant
        # amount of time. The above two methods try to reduce the load on this
        # function
        smart_unique_mols = self.getUniqueMolsFromSmarts(non_unique_mols)

        unique_mols = smart_unique_mols + formula_unique_mols + num_unique_mols
        return [mol.property[MOLECULE_NUM] for mol in unique_mols]

    def getUniqueStruct(self):
        """
        Gets the structure comprising only unique representative molecules.

        :returns: The structure with only unique molecules
        :rtype: `structure.Structure`
        """
        unique_aids = msutils.flatten([
            self.struct.molecule[mol_num].getAtomIndices()
            for mol_num in self.unique_mol_nums
        ])
        # NOTE(review): the second positional argument is presumably the
        # copy_props flag of Structure.extract - confirm
        return self.struct.extract(unique_aids, True)