Source code for syndirella.route.Library

#!venv/bin/env python3
"""
syndirella.route.Library.py

This module contains the Library class, which holds the analogue library for one reaction step. It builds the analogue
library for each reactant of a Reaction object and stores it as a compressed pickle (.pkl.gz) file.
"""

import glob
import logging
import os
import pickle
from typing import (List, Dict, Tuple)

import pandas as pd
from pandas import DataFrame
from rdkit import DataStructs
from rdkit.Chem.FilterCatalog import *

import syndirella.fairy as fairy
from syndirella.Postera import Postera
from syndirella.error import SMARTSError, NoReactants, APIQueryError
from .Reaction import Reaction



[docs]
class Library:
    """
    This class holds the analogue library for one reaction step. It builds the analogue library for each reactant of
    the Reaction object and stores each one as a compressed pickle (.pkl.gz) file.
    """

    def __init__(self,
                 reaction: Reaction,
                 output_dir: str,
                 id: str,
                 num_steps: int,
                 current_step: int,
                 filter: bool,
                 route_uuid: str,
                 atom_diff_min: int,
                 atom_diff_max: int):

[docs]
        self.reaction: Reaction = reaction


[docs]
        self.id: str = id


[docs]
        self.output_dir: str = os.path.join(output_dir, self.id)


[docs]
        self.extra_dir_path: str = os.path.join(self.output_dir, "extra")


[docs]
        self.num_steps: int = num_steps


[docs]
        self.current_step: int = current_step


[docs]
        self.analogues_dataframes: Dict[str: pd.DataFrame] = {}


[docs]
        self.filter: bool = filter


[docs]
        self.route_uuid: str = route_uuid


[docs]
        self.atom_diff_min: int = atom_diff_min


[docs]
        self.atom_diff_max: int = atom_diff_max



[docs]
        self.logger = logging.getLogger(f"{__name__}")


[docs]
        self.r1 = None


[docs]
        self.r2 = None



[docs]
    def create_library(self):
        """
        This function is used to create the analogue library from the Reaction object.
        """
        for key, value in self.reaction.matched_smarts_to_reactant.items():  # can work for 1 and 2 reactants
            reactant = value[0]
            reactant_smarts = key
            analogue_prefix = value[2]
            df, analogue_columns = self.process_reactant(reactant, reactant_smarts, analogue_prefix)
            df: pd.DataFrame
            analogue_columns: Tuple[str, str]
            self.analogues_dataframes[analogue_prefix] = (df, analogue_columns)



[docs]
    def process_reactant(self, reactant: Chem.Mol, reactant_smarts: str, analogue_prefix: str) -> tuple[
        DataFrame, tuple[str, str]]:
        """
        Collect, annotate and save the analogues of a single reactant.

        Analogues are read from an existing .pkl.gz when ``check_analogues_pkl_exists`` finds one; otherwise a
        Postera superstructure search is run. The analogues are then passed to ``process_analogues`` and the
        resulting dataframe is written with ``save_library``.

        Returns the processed dataframe and the pair of SMARTS column names produced by ``process_analogues``.
        Raises APIQueryError when the database search returns None.
        """
        cached_path, is_internal_step, is_previous_product = self.check_analogues_pkl_exists(analogue_prefix,
                                                                                              reactant)
        if cached_path is None:
            # Nothing on disk for this reactant: query the database instead.
            search_client = Postera()
            hits = search_client.perform_database_search(reactant=reactant,
                                                         reaction_name=self.reaction.reaction_name,
                                                         search_type="superstructure",
                                                         vendors=['enamine_bb', 'mcule',
                                                                  'mcule_ultimate',
                                                                  'enamine_real',
                                                                  'enamine_made'])
            if hits is None:  # the API query failed
                self.logger.critical(
                    f"API superstructure query failed for reactant {analogue_prefix} in {self.reaction.reaction_name}.")
                raise APIQueryError(
                    message=f"API superstructure query failed for step {self.current_step} reactant {analogue_prefix} in {self.reaction.reaction_name}.",
                    inchi=self.id,
                    route_uuid=self.route_uuid)
        else:
            hits = self.load_library(cached_path,
                                     analogue_prefix,
                                     is_internal_step)
        library_df, smarts_columns = self.process_analogues(hits,
                                                            reactant_smarts,
                                                            analogue_prefix,
                                                            is_previous_product)
        self.save_library(library_df, analogue_prefix)
        return library_df, smarts_columns



[docs]
    def process_analogues(self,
                          analogues: List[str],
                          reactant_smarts: str,
                          analogue_prefix: str,
                          previous_product: bool) -> Tuple[pd.DataFrame, Tuple[str, str]]:
        """
        Put the analogues into a dataframe and flag which reactant SMARTS patterns each one contains.

        Steps:
          1. Parse the SMILES, then strip chirality and duplicates (``fairy.remove_chirality`` /
             ``fairy.remove_repeat_mols``).
          2. If ``self.filter`` is set, drop molecules that hit the PAINS_A catalog.
          3. Record whether (and how often) each analogue matches the original reactant SMARTS, and whether it
             matches the SMARTS of the other reactant (always False for single-reactant reactions).

        :param analogues: SMILES strings of the candidate analogues.
        :param reactant_smarts: SMARTS pattern of the reactant these analogues belong to.
        :param analogue_prefix: Prefix used to name the dataframe columns for this reactant.
        :param previous_product: Written unchanged to the ``<prefix>_is_previous_product`` column of every row.
        :return: ``(dataframe, (this_reactant_column, other_reactant_column))``. For single-reactant reactions the
                 other-reactant prefix is None, so that column is named ``None_<reaction_name>``.
        """
        analogues_mols: List[Chem.Mol] = [Chem.MolFromSmiles(analogue) for analogue in analogues]
        analogues_mols = fairy.remove_chirality(analogues_mols)
        analogues_mols = fairy.remove_repeat_mols(analogues_mols)
        self.print_diff(analogues, analogues_mols, analogue_prefix)
        if self.filter:
            # Filter the molecules that actually end up in the dataframe. Previously the filter ran on the raw
            # SMILES list and its result was thrown away, so PAINS_A hits stayed in the library while every row
            # was still labelled is_PAINS_A=False.
            unfiltered_mols = analogues_mols
            analogues_mols = self.filter_on_substructure_filters(unfiltered_mols)
            self.print_diff(unfiltered_mols, analogues_mols, analogue_prefix)
        reactant_smarts_mol: Chem.Mol = Chem.MolFromSmarts(reactant_smarts)
        contains_smarts_pattern, num_matches = self.check_analogue_contains_smarts_pattern(analogues_mols,
                                                                                           reactant_smarts_mol)
        if len(self.reaction.matched_smarts_to_reactant) == 1:
            # Single-reactant reaction: there is no other reactant to clash with.
            contains_other_reactant_smarts_pattern = [False for _ in analogues_mols]
            other_reactant_prefix = None
        else:
            contains_other_reactant_smarts_pattern, other_reactant_prefix = (
                self.check_analogue_contains_other_reactant_smarts_pattern(analogues_mols, reactant_smarts))
        # Re-derive the SMILES from the processed molecules so they line up row-for-row with the mol column.
        analogue_smiles = [Chem.MolToSmiles(analogue) for analogue in analogues_mols]
        reactant_column = f"{analogue_prefix}_{self.reaction.reaction_name}"
        other_reactant_column = f"{other_reactant_prefix}_{self.reaction.reaction_name}"
        analogues_df = (
            pd.DataFrame({f"{analogue_prefix}_smiles": analogue_smiles,
                          f"{analogue_prefix}_mol": analogues_mols,
                          reactant_column: contains_smarts_pattern,
                          f"{analogue_prefix}_{self.reaction.reaction_name}_num_matches": num_matches,
                          other_reactant_column: contains_other_reactant_smarts_pattern,
                          f"{analogue_prefix}_is_previous_product": previous_product}))
        if self.filter:
            # Every remaining row passed the PAINS_A filter above.
            analogues_df['is_PAINS_A'] = False
        return analogues_df, (reactant_column, other_reactant_column)



[docs]
    def check_analogue_contains_smarts_pattern(self, analogues_mols: List[Chem.Mol], reactant_smarts_mol: Chem.Mol):
        """
        This function is used to check if the analogues contains the original reactant. Will return
        dictionary of boolean values and the number of matches.
        """
        self.logger.info('Checking if analogues contain SMARTS pattern of original reactant...')
        matching = [bool(analogue.GetSubstructMatches(reactant_smarts_mol)) for analogue in analogues_mols]
        num = [len(analogue.GetSubstructMatches(reactant_smarts_mol)) for analogue in analogues_mols]
        if len(matching) != len(analogues_mols):
            self.logger.error("Problem with finding matches.")
            raise SMARTSError(message="Problem with finding SMARTS matches to analogues.",
                              mol=self.reaction.scaffold,
                              route_uuid=self.route_uuid)
        if len(num) != len(analogues_mols):
            self.logger.error("Problem with finding number of matches.")
            raise SMARTSError(message="Problem with finding number of matches.",
                              mol=self.reaction.scaffold,
                              route_uuid=self.route_uuid)
        return matching, num



[docs]
    def filter_analogues(self, analogues: List[str], analogue_prefix: str) -> List[str]:
        """
        Return the SMILES of the analogues that pass the substructure filters.

        The SMILES are parsed, run through ``filter_on_substructure_filters``, the number removed is logged via
        ``print_diff``, and the surviving molecules are converted back to SMILES.
        """
        candidate_mols: List[Chem.Mol] = list(map(Chem.MolFromSmiles, analogues))
        kept_mols: List[Chem.Mol] = self.filter_on_substructure_filters(candidate_mols)
        self.print_diff(candidate_mols, kept_mols, analogue_prefix)
        return [Chem.MolToSmiles(kept) for kept in kept_mols]



[docs]
    def print_diff(self, mols: List[Chem.Mol], valid_mols: List[Chem.Mol], analogue_prefix: str):
        """
        This function is used to print the difference between the original number of analogues and the number of
        valid analogues.
        """
        if len(valid_mols) > len(mols):
            self.logger.warning(
                "Problem with finding valid molecules. There are more than were in the original list of "
                "molecules.")
        num_filtered = len(mols) - len(valid_mols)
        percent_diff = round((num_filtered / len(mols)) * 100, 2)
        self.logger.info(
            f'Removed {num_filtered} invalid or repeated molecules ({percent_diff}%) of {analogue_prefix} analogues.')



[docs]
    def filter_on_substructure_filters(self, mols: List[Chem.Mol]) -> List[Chem.Mol]:
        """
        Return the molecules that do not match any entry of the PAINS_A filter catalog.

        :param mols: Molecules to screen.
        :return: The molecules without a PAINS_A match, in their original order.
        """
        self.logger.info('Filtering analogues on PAINS_A filters...')
        params = FilterCatalogParams()
        params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS_A)
        catalog = FilterCatalog(params)
        return [mol for mol in mols if not catalog.HasMatch(mol)]



[docs]
    def check_analogue_contains_other_reactant_smarts_pattern(self, analogues_mols: List[Chem.Mol],
                                                              reactant_smarts: str) -> Tuple[List[bool], str]:
        """
        Check which analogues also contain the SMARTS pattern of the other reactant.

        :param analogues_mols: Analogue molecules to test.
        :param reactant_smarts: SMARTS of the reactant the analogues belong to; the first different key of
                                ``self.reaction.matched_smarts_to_reactant`` is taken as the other reactant.
        :return: ``(flags, other_reactant_prefix)`` where ``flags`` is aligned with ``analogues_mols``.
        :raises SMARTSError: If the reaction has no SMARTS pattern other than ``reactant_smarts``.
        """
        self.logger.info('Checking if analogues contain SMARTS pattern of other reactant...')
        matched = self.reaction.matched_smarts_to_reactant
        # next(..., None) lets the guard below fire; indexing [0] on an empty list raised IndexError first.
        other_reactant_smarts_pattern = next((smarts for smarts in matched if smarts != reactant_smarts), None)
        if other_reactant_smarts_pattern is None:
            self.logger.error(f"Other reactant SMARTS pattern not found for {self.reaction.reaction_name}.")
            raise SMARTSError(message=f"Other reactant SMARTS pattern not found for {self.reaction.reaction_name}.",
                              mol=self.reaction.scaffold,
                              route_uuid=self.route_uuid)
        other_reactant_prefix = matched[other_reactant_smarts_pattern][2]
        other_reactant_smarts_mol: Chem.Mol = Chem.MolFromSmarts(other_reactant_smarts_pattern)
        contains_other = [bool(analogue.GetSubstructMatches(other_reactant_smarts_mol))
                          for analogue in analogues_mols]
        return contains_other, other_reactant_prefix



[docs]
    def check_analogue_contains_all_smarts_patterns(self, analogues_mols: List[Chem.Mol]) -> List[
                                                                                                 bool] | NotImplementedError:
        """
        This function is used to check if the analogues contains all the SMARTS patterns of the other reactants.
        """
        return NotImplementedError()



[docs]
    def check_analogues_pkl_exists(self, analogue_prefix: str, reactant: Chem.Mol) -> str and bool:
        """
        This function is used to check if the analogue library has already been created and saved as a .pkl file.
        """
        internal_step = False
        if self.current_step != 1 and (self.current_step == self.num_steps or self.current_step < self.num_steps):
            self.logger.info(
                'Since this is an internal or final step looking for the products .pkl from previous step...')
            pkl_path: str = self._find_products_pkl_name(reactant)
            if pkl_path is not None:
                internal_step = True
                previous_product = True
                return pkl_path, internal_step, previous_product  # returns path to products .pkl.gz since is reactant
        if self.current_step != 1 and self.current_step < self.num_steps:
            self.logger.info('Looking for analogue library .pkl.gz if already created...')
            pkl_path: str = self._find_analogues_pkl_name(analogue_prefix)
            if pkl_path is not None:
                internal_step = True
                previous_product = False
                return pkl_path, internal_step, previous_product  # returns path to analogue .pkl.gz and is internal step
        if self.current_step == 1:
            self.logger.info('Looking for analogue library .pkl.gz if already created...')
            pkl_path: str = self._find_analogues_pkl_name(analogue_prefix)
            if pkl_path is not None:
                internal_step = False
                previous_product = False
                return pkl_path, internal_step, previous_product  # returns path to analogue .pkl.gz and is first step
        previous_product = False
        os.makedirs(self.extra_dir_path, exist_ok=True)
        return None, internal_step, previous_product



[docs]
    def _find_analogues_pkl_name(self, analogue_prefix: str) -> str:
        """
        Checks if the analogue library was already created and saved as a .pkl.gz file. Returns the path to the
        .pkl.gz file.
        """
        pkl: List[str] = glob.glob(f"{self.extra_dir_path}/"
                                   f"{self.id}_{self.route_uuid}_{self.reaction.reaction_name}_{analogue_prefix}_"
                                   f"{self.current_step}of{self.num_steps}.pkl.gz")
        if len(pkl) == 1:
            self.logger.info(f"Found {pkl[0]} as the analogue library .pkl from previous step.")
            return pkl[0]



[docs]
    def _find_products_pkl_name(self, reactant: Chem.Mol) -> str:
        """
        Find the previous step's products .pkl.gz that contains this reactant.

        Every ``<id>_<route_uuid>_*products_<current_step - 1>of<num_steps>.pkl.gz`` file in
        ``self.extra_dir_path`` is read, and the first 100 entries of its ``smiles`` column are compared with the
        reactant by fingerprint similarity. The first file with a product of similarity 1 is returned.

        :param reactant: Reactant molecule expected to be a product of the previous step.
        :return: Path of the matching products file.
        :raises NoReactants: If no products file contains a perfect match.
                 NOTE(review): the caller (``check_analogues_pkl_exists``) tests the result against None, which
                 this method never returns -- confirm which behaviour is intended.
        """
        product_pkls: List[str] = glob.glob(f"{self.extra_dir_path}/"
                                            f"{self.id}_{self.route_uuid}_*products_{self.current_step - 1}of"
                                            f"{self.num_steps}.pkl.gz")
        reactant_fingerprint = None  # computed on first use, then reused for every comparison
        for path in product_pkls:
            self.logger.info(f"Found {path} as the potential products .pkl from previous step.")
            # NOTE: unpickling executes arbitrary code; only files written by this pipeline should live here.
            df = pd.read_pickle(path)
            # Only the first 100 products are compared.
            for product in df["smiles"][:100]:
                product_mol = Chem.MolFromSmiles(product)
                if product_mol is None:
                    # Unparsable SMILES cannot match the reactant; skip it rather than fingerprinting None.
                    continue
                if reactant_fingerprint is None:
                    reactant_fingerprint = fairy.get_morgan_fingerprint(reactant)
                similarity_score = DataStructs.FingerprintSimilarity(
                    reactant_fingerprint,
                    fairy.get_morgan_fingerprint(product_mol))
                # A similarity of exactly 1 is treated as the reactant being this product.
                if similarity_score == 1:
                    self.logger.info(f"Found {path} as the products .pkl from previous step.")
                    return path
        self.logger.info("Could not find any products .pkl from previous step.")
        raise NoReactants(message="Could not find any products .pkl from previous step.",
                          route_uuid=self.route_uuid,
                          inchi=self.id)



[docs]
    def save_library(self, df: pd.DataFrame, analogue_prefix: str):
        """
        This function is used to save the analogue library as a .pkl file in self.extra_dir_path
        """
        os.makedirs(f"{self.extra_dir_path}", exist_ok=True)
        pkl_name = f"{self.id}_{self.route_uuid}_{self.reaction.reaction_name}_{analogue_prefix}_{self.current_step}of{self.num_steps}.pkl.gz"
        self.logger.info(f"Saving {analogue_prefix} analogue library to {self.extra_dir_path}/{pkl_name} \n")
        df.to_pickle(f"{self.extra_dir_path}/{pkl_name}")



[docs]
    def load_library(self,
                     reactant_analogues_path: str,
                     analogue_prefix: str,
                     internal_step: bool) -> List[str]:
        """
        This function is used to load the analogue library from a .pkl file.
        """
        try:
            # find column with analogue smiles
            df = pd.read_pickle(reactant_analogues_path)
            if len(df) == 0:
                self.logger.critical(f"Empty analogue library at {reactant_analogues_path}. Stopping...")
                raise NoReactants(message=f"Empty analogue library at {reactant_analogues_path}",
                                  route_uuid=self.route_uuid,
                                  inchi=self.id,
                                  mol=self.reaction.scaffold)
            if not internal_step:
                analogues = df[f"{analogue_prefix}_smiles"].tolist()
                return analogues
            if internal_step:
                analogues = df["smiles"].tolist()
                return analogues
        except KeyError:
            self.logger.critical(
                f"Could not find analogue column in already existing scaffold.pkl at {reactant_analogues_path}. "
                f"Stopping...")



[docs]
    def save(self):
        """
        This function saves the library object in the self.extra_output_dir_path.
        """
        with open(f"{self.extra_dir_path}/{self.id}_library.pkl", "wb") as f:
            pickle.dump(self, f)


    @staticmethod
    def load(output_dir: str, product: str) -> 'Library | None':
        """
        Load the final-step Library object of a product from ``output_dir``.

        Every ``*library*.pkl`` file in ``output_dir`` is unpickled in sorted order; the first object whose ``id``
        equals the ID generated from ``product`` and whose ``current_step`` equals ``num_steps`` is returned.

        :param output_dir: Directory containing the pickled Library objects.
        :param product: Product (scaffold) SMILES used to generate the ID to look for.
        :return: The matching final Library, or None when no file matches (including when there are no files).
        """
        logger = logging.getLogger(f"{__name__}.{Library.__name__}")
        # ID derived from the scaffold SMILES
        library_id = fairy.generate_inchi_ID(product, isomeric=False)
        # Inspect every candidate instead of only the first glob hit, which also avoids an IndexError when
        # the directory contains no library pickle at all.
        for pkl_path in sorted(glob.glob(f"{output_dir}/*library*.pkl")):
            # NOTE: unpickling executes arbitrary code; only load files written by this pipeline.
            with open(pkl_path, "rb") as f:
                library = pickle.load(f)
            if library.id == library_id and library.num_steps == library.current_step:  # the final library
                logger.info(f"Loaded {library.id} final library.")
                return library
        # if not found, return None
        logger.warning(f"Could not load {library_id} final library.")
        return None