Source code for syndirella.slipper.Slipper

#!venv/bin/env python3
"""
syndirella.slipper.Slipper.py

This module contains the Slipper class. A slipper in this metaphor is the set of molecules that is the
scaffold of a reaction.
"""
from typing import List, Dict, Optional
import pandas as pd

from syndirella.error import NoScaffold, NoToHippo
from syndirella.slipper.slipper_synthesizer.SlipperSynthesizer import SlipperSynthesizer
from syndirella.route.Library import Library
from syndirella.slipper.SlipperFitter import SlipperFitter
from syndirella.slipper._placement_data import get_placement_data
import os, shutil
import glob
from rdkit import Chem
from rdkit.Chem import inchi
import logging



[docs]
class Slipper:
    """
    This class is instantiated to represent all products for a step in a route.
    """

    def __init__(self,
                 *,
                 library: Library,
                 template: str | None = None,
                 hits_path: str | None = None,
                 hits_names: List[str] | None = None,
                 batch_num: int | None = None,
                 atoms_ids_expansion: dict | None = None,
                 additional_info: dict | None = None,
                 scaffold_placements: Dict[Chem.Mol, str | None] | None = None):
        """
        Store the library and the placement settings; no work is done here.

        All arguments are keyword-only.

        Args:
            library: the library whose products this object represents. Its
                ``route_uuid`` and ``output_dir`` are copied onto the instance.
            template: path to the pdb file used as placement template.
            hits_path: path to the .sdf or .mol file holding the hits.
            hits_names: names of the fragments.
            batch_num: forwarded to the fitter and used to slice the placements
                that are written to the HIPPO output.
            atoms_ids_expansion: forwarded to the synthesizer; when not None the
                products are additionally labelled in ``get_products``.
            additional_info: forwarded to the synthesizer.
            scaffold_placements: forwarded to the fitter.
        """
        # Filled in by get_products().
        self.products: pd.DataFrame | None = None
        self.library: Library = library
        self.route_uuid: str = library.route_uuid
        self.output_dir: str = library.output_dir
        self.final_products_pkl_path: str | None = None
        self.final_products_csv_path: str | None = None
        self.scaffold_placements: Dict[Chem.Mol, str | None] | None = scaffold_placements

        # need Fragmenstein information
        self.template: str | None = template  # path to pdb file
        self.hits_path: str | None = hits_path  # path to .sdf or .mol file
        self.hits_names: List[str] | None = hits_names  # name of fragments
        self.batch_num: int | None = batch_num
        self.atoms_ids_expansion: dict | None = atoms_ids_expansion
        # Filled in by place_products() / get_placements_df().
        self.placements: pd.DataFrame | None = None
        # Filled in by place_products() from the fitter's output path.
        self.output_path: str | None = None
        self.additional_info: dict | None = additional_info
        self.logger = logging.getLogger(__name__)

        # stats for output
        self.num_placed: int | None = None
        self.num_successful: int | None = None
        self.to_hippo_path: str | None = None
        # number of unique products not including stereoisomers found at the end of the route.
        self.num_unique_products: int | None = None
        # number of products after stereoisomer enumeration found at the end of the route.
        self.num_products_enumstereo: int | None = None



[docs]
    def get_products(self) -> pd.DataFrame:
        """
        Main entry to the Slipper class: build the products of ``self.library``.

        A SlipperSynthesizer is created from the library, output directory,
        atom-id expansion and additional info. Its products are stored on
        ``self.products``, labelled when ``self.atoms_ids_expansion`` is set,
        and saved. The paths of the saved files and the product-count
        statistics are copied from the synthesizer onto this instance.

        Returns:
            The products dataframe (also available as ``self.products``).
        """
        slipper_synth = SlipperSynthesizer(self.library,
                                           self.output_dir,
                                           self.atoms_ids_expansion,
                                           self.additional_info)
        self.products = slipper_synth.get_products()
        if self.atoms_ids_expansion is not None:
            slipper_synth.label_products()
        slipper_synth.save_products()
        self.final_products_pkl_path = slipper_synth.final_products_pkl_path
        self.final_products_csv_path = slipper_synth.final_products_csv_path

        # stats for output
        self.num_unique_products = slipper_synth.num_unique_products
        self.num_products_enumstereo = slipper_synth.num_products_enumstereo

        return self.products



[docs]
    def place_products(self):
        """
        Place the products with Fragmenstein through a SlipperFitter.

        The fitter is built from this object's template, hits, output directory
        and scaffold placements, then given the library's atom-difference
        limits, the final products, the batch size and the saved product paths.
        The fitted placements are stored on ``self.placements`` and saved by
        the fitter; its output path and placement statistics are copied back.

        Returns:
            The placements dataframe produced by the fitter.
        """
        fitter_kwargs = dict(template_path=self.template,
                             hits_path=self.hits_path,
                             hits_names=self.hits_names,
                             output_dir=self.output_dir,
                             route_uuid=self.route_uuid,
                             id=self.library.id,
                             scaffold_placements=self.scaffold_placements)
        fitter = SlipperFitter(**fitter_kwargs)

        # Settings the fitter expects as attributes, applied in this order.
        run_settings = (
            ('atom_diff_min', self.library.atom_diff_min),
            ('atom_diff_max', self.library.atom_diff_max),
            # products with enumerated stereoisomers from final library
            ('final_products', self.products),
            ('batch_num', self.batch_num),
            ('final_products_pkl_path', self.final_products_pkl_path),
            ('final_products_csv_path', self.final_products_csv_path),
        )
        for attribute, value in run_settings:
            setattr(fitter, attribute, value)

        self.placements = fitter.fit_products()
        fitter.save_placements(id=self.library.id, route_uuid=self.route_uuid)
        self.output_path = fitter.output_path

        # stats for output
        self.num_placed, self.num_successful = fitter.num_placed, fitter.num_successful

        return self.placements



[docs]
    def write_products_to_hippo(self) -> str:
        """
        Writes a dataframe that contains the values needed for HIPPO db input.

        Returns:
            path: str : the path to the saved dataframe
        """
        if self.placements is None:
            self.logger.critical("Placements need to be run first before writing HIPPO output.")
            return None
        # cut placements to those that were placed by batch_num
        placements: pd.DataFrame = self.placements.iloc[:self.batch_num]

        hippo_path: str = os.path.join(self.output_dir, f'{self.library.id}_{self.route_uuid}_to_hippo.pkl.gz')
        # get all products dfs in /extra
        products_files: List[str] = glob.glob(f"{self.output_dir}/extra/*{self.route_uuid}*products*.pkl*")
        product_dfs: Dict[int, pd.DataFrame] = self._load_products_dfs(products_files)
        # make HIPPO output dataframe of these specific products
        hippo_df = self._structure_products_for_hippo(placements_df=placements,
                                                      product_dfs=product_dfs)
        # load file if it already exists
        if os.path.isfile(hippo_path):
            previous_hippo = pd.read_pickle(hippo_path)
            hippo_df = pd.concat([previous_hippo, hippo_df], axis=1, ignore_index=True)
        # save the dataframe
        hippo_df.to_pickle(hippo_path)
        self.logger.info(f"Saved HIPPO output to {hippo_path}")
        self.to_hippo_path = hippo_path
        self.check_scaffold_in_hippo(hippo_df, hippo_path)
        return hippo_path



[docs]
    def check_scaffold_in_hippo(self, hippo_df: pd.DataFrame, hippo_path: str):
        """
        Checks if there is a scaffold in the scaffold names of the HIPPO output.
        """
        if not any('scaffold' in name for name in hippo_df[f'{self.library.num_steps}_product_name']):
            self.logger.warning("Scaffold was not found in the scaffold names of the HIPPO output.")
            raise NoScaffold(message=f"Scaffold was not found in the scaffold names of the HIPPO output at {hippo_path}."
                                     f"Most likely due to an incorrectly written SMIRKS.",
                             route_uuid=self.route_uuid,
                             inchi=self.library.id)



[docs]
    def _load_products_dfs(self, products_files: List[str]) -> Dict[int, pd.DataFrame]:
        """
        Load the products dataframes from the files in the /extra directory and putting into dict where key is step.
        """
        product_dfs: Dict[int, pd.DataFrame] = {}
        for file in products_files:
            df = pd.read_pickle(file)
            if len(df) == 0:
                self.logger.info(f"Empty dataframe found in {file}. Continuing with next file to structure hippo outputs")
                continue
            step = df['step'].iloc[0]
            product_dfs[step] = df
        if len(product_dfs) != self.library.num_steps - 1:
            found_steps = list(product_dfs.keys())
            missing_steps = [step for step in range(1, self.library.num_steps) if step not in found_steps]
            self.logger.critical("Not all steps have findable products dataframes or non empty dataframes. "
                                 f"Missing steps: {missing_steps}")
            raise NoToHippo(message=f"Not all steps have findable products dataframes or non empty dataframes. "
                                    f"Missing steps: {missing_steps}",
                            route_uuid=self.route_uuid,
                            inchi=self.library.id)
        return product_dfs



[docs]
    def _structure_products_for_hippo(self,
                                      placements_df: pd.DataFrame,
                                      product_dfs: Dict[int, pd.DataFrame]) -> pd.DataFrame:
        """
        Build the combined HIPPO dataframe for the whole route.

        Each earlier step's products dataframe and the final step's placements
        dataframe are structured per step, then joined into one dataframe.
        """
        per_step: Dict[int, pd.DataFrame] = {
            step: self._structure_step_for_hippo(step, step_products)
            for step, step_products in product_dfs.items()
        }
        # The placements stand in for the products of the final step.
        final_step = self.library.num_steps
        per_step[final_step] = self._structure_step_for_hippo(final_step, placements_df)
        return self._put_hippo_dfs_together(per_step)



[docs]
    def calculate_inchi_similarity(self,
                                   smiles1: str,
                                   smiles2: str) -> int:
        """
        Return 1 when both SMILES yield the same InChI string, otherwise 0.

        0 is also returned when either SMILES cannot be parsed into a molecule.
        """
        parsed = [Chem.MolFromSmiles(smiles) for smiles in (smiles1, smiles2)]
        if any(mol is None for mol in parsed):
            # At least one SMILES could not be parsed: treat the pair as not comparable.
            return 0
        first_inchi, second_inchi = [inchi.MolToInchi(mol) for mol in parsed]
        # Exact string equality of the two InChIs decides the result.
        return int(first_inchi == second_inchi)


    # Function to find matches

[docs]
    def find_matches(self,
                     row,  # row of current step
                     step,
                     df_previous_step: pd.DataFrame) -> List[str]:
        product_matches = None  # Store matching scaffold names
        for _, product_row in df_previous_step.iterrows():
            similarity = self.calculate_inchi_similarity(
                row[f'{step + 1}_r{row[f"{step + 1}_r_previous_product"]}_smiles'],
                product_row[f'{step}_product_smiles'])
            if similarity == 1:
                product_matches = product_row[f'{step}_product_name']
                break
        return product_matches



[docs]
    def _put_hippo_dfs_together(self,
                                hippo_dfs: Dict[int, pd.DataFrame]) -> pd.DataFrame:
        """
        Join the per-step HIPPO dataframes into one, starting from the last step.

        Walking backwards through the steps, each row of the running result is
        given the name of the earlier step's product that matches its
        previous-product reactant, and the earlier step's dataframe is
        right-merged on that name. Rows without a match are kept, with a null
        product name. The scaffold SMILES of the library's reaction is finally
        inserted as the first column.
        """
        final_step = self.library.num_steps
        combined = hippo_dfs[final_step]
        for step in reversed(range(1, final_step)):
            earlier_step_df = hippo_dfs[step]
            name_column = f'{step}_product_name'
            # Tag every row with the earlier product it was built from (may be null).
            combined[name_column] = combined.apply(self.find_matches,
                                                   step=step,
                                                   df_previous_step=earlier_step_df,
                                                   axis=1)
            # Right merge: only the rows of the later step decide what is kept.
            combined = pd.merge(earlier_step_df,
                                combined,
                                left_on=name_column,
                                right_on=name_column,
                                how='right')
        # The scaffold SMILES is the same for every row and goes in the first column.
        scaffold_smiles: str = Chem.MolToSmiles(self.library.reaction.scaffold)
        combined.insert(0, 'scaffold_smiles', scaffold_smiles)
        return combined



[docs]
    def _structure_step_for_hippo(self,
                                  step: int,
                                  products_df: pd.DataFrame) -> pd.DataFrame:
        """
        Structures the products df for HIPPO output.
        """
        # cut down the dataframe to those with num_atom_diff <= 15
        reaction: str = products_df['reaction'].iloc[0]
        r1_smiles: List[str] = products_df['r1_smiles'].tolist()
        r1_is_previous_product: bool = products_df['r1_is_previous_product'].iloc[0]
        try:
            r2_smiles: List[str] = products_df['r2_smiles'].tolist()
            r2_is_previous_product: bool = products_df['r2_is_previous_product'].iloc[0]
        except KeyError:
            r2_smiles: List[str] = [''] * len(products_df)
            r2_is_previous_product: bool = None
        # find which reactants were products of the previous step
        r_previous_product: int | None = self.which_reactant_was_previous_product(r1_is_previous_product,
                                                                           r2_is_previous_product)
        product_smiles: List[str] = products_df['smiles'].tolist()
        product_names: List[str] = products_df['name'].tolist()
        try:
            flags: List[str] = products_df['flag'].tolist()
            # replace nan with None
            flags = [None if pd.isna(flag) else flag for flag in flags]
        except KeyError:
            flags: List[str | None] = [None] * len(products_df)
        # variables for the last step
        if step == self.library.num_steps:
            num_atom_diff: List[int] = products_df['num_atom_diff'].tolist()
            stereoisomer: List[str] = products_df['stereoisomer'].tolist()
            error: List[str] = products_df['error'].tolist()
            delta_delta_G: List[float] = products_df['∆∆G'].tolist()
            delta_G_bound: List[float] = products_df['∆G_bound'].tolist()
            delta_G_unbound: List[float] = products_df['∆G_unbound'].tolist()
            comRMSD: List[float] = products_df['comRMSD'].tolist()
            regarded: List[bool] = products_df['regarded'].tolist()
            intra_geometry_pass: List[bool] = products_df['intra_geometry_pass'].tolist()
            path_to_mol: List[Optional[str]] = products_df['path_to_mol'].tolist()
        # make HIPPO output dataframe
        hippo_df_step = pd.DataFrame({f'{step}_reaction': reaction,
                                      f'{step}_r1_smiles': r1_smiles,
                                      f'{step}_r2_smiles': r2_smiles,
                                      f'{step}_r_previous_product': r_previous_product,
                                      f'{step}_product_smiles': product_smiles,
                                      f'{step}_product_name': product_names,
                                      f'{step}_flag': flags})
        if step == self.library.num_steps:
            hippo_df_step[f'{step}_num_atom_diff'] = num_atom_diff
            hippo_df_step[f'{step}_stereoisomer'] = stereoisomer
            hippo_df_step[f'error'] = error
            hippo_df_step[f'∆∆G'] = delta_delta_G
            hippo_df_step[f'∆G_bound'] = delta_G_bound
            hippo_df_step[f'∆G_unbound'] = delta_G_unbound
            hippo_df_step[f'comRMSD'] = comRMSD
            hippo_df_step[f'regarded'] = regarded
            hippo_df_step[f'path_to_mol'] = path_to_mol
            hippo_df_step['template'] = self.template
            hippo_df_step[f'intra_geometry_pass'] = intra_geometry_pass
        return hippo_df_step



[docs]
    def which_reactant_was_previous_product(self,
                                            r1_is_previous_product: bool,
                                            r2_is_previous_product: bool) -> int | None:
        """
        Determine which reactant was the scaffold of the previous step.
        """
        if r1_is_previous_product:
            if r2_is_previous_product:
                self.logger.critical("Both reactants cannot be products of the previous step.")
                return None
            return 1
        elif r2_is_previous_product:
            return 2
        # what if both ?? don't throw error still work
        else:
            return None



[docs]
    def _delete_file_or_directory(self, path):
        """
        Delete a file or directory at the given path.
        """
        try:
            if os.path.isfile(path) or os.path.islink(path):
                os.unlink(path)
                #print(f"Deleted file: {path}")
            elif os.path.isdir(path):
                shutil.rmtree(path)
                #print(f"Deleted directory: {path}")
        except Exception as e:
            self.logger.warning('Failed to delete %s. Reason: %s' % (path, e))



[docs]
    def _should_delete_file(self, file, suffixes_to_keep):
        """
        Determine if a file should be deleted based on its suffix.
        """
        return not any(file.endswith(suffix) for suffix in suffixes_to_keep)



[docs]
    def clean_up_placements(self):
        """
        This function is used to remove extra files that are generated by Fragmenstein.
        """
        self.logger.info("Cleaning up placement directory...")
        suffixes_to_keep = ['.minimised.json', '.minimised.mol', '.csv', '.pkl', '.pkl.gz']
        for root, dirs, files in os.walk(self.output_path):
            for file in files:
                if self._should_delete_file(file, suffixes_to_keep):
                    file_path = os.path.join(root, file)
                    self._delete_file_or_directory(file_path)



[docs]
    def get_placements_df(self) -> pd.DataFrame | None:
        """
        This function is used to get the placements dataframe which is the merged df with the products.
        """
        if self.placements is not None:
            return self.placements
        # otherwise you're gonna have to build it from scratch by going through each file in the output dir
        # get products df
        if self.products is None:
            self.logger.critical("You need to set the products df first.")
            return None
        self.placements = get_placement_data(self.products, self.output_path, self.output_dir)
        return self.placements