Source code for syndirella.pipeline

#!/usr/bin/env python3
"""
syndirella.run_pipeline.py

This script contains the main pipeline for syndirella.
"""
import datetime
import logging
import time
import traceback
from typing import List, Tuple, Dict

import pandas as pd
from rdkit.Chem import inchi
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions

import syndirella.check_inputs as check_inputs
import syndirella.fairy as fairy
import syndirella.structure_outputs as structure_outputs
from syndirella.Cobbler import Cobbler
from syndirella.error import *
from syndirella.route.CobblersWorkshop import CobblersWorkshop
from syndirella.slipper.Slipper import Slipper
from syndirella.slipper.SlipperFitter import SlipperFitter

[docs] logger = logging.getLogger(__name__)
[docs] def assert_scaffold_placement(scaffold: str, template_path: str, hits_path: str, hits_names: List[str], output_dir: str, scaffold_place_num: int ) -> Dict[Chem.Mol, str]: """ Assert that the scaffold can be placed for any stereoisomers. If not, raise an error. """ scaffold_mol = Chem.MolFromSmiles(scaffold) if not scaffold_mol: logger.critical(f"Could not create a molecule from the smiles {scaffold}.") raise MolError(smiles=scaffold) # enumerate stereoisomers opts = StereoEnumerationOptions(unique=True) isomers = list(EnumerateStereoisomers(scaffold_mol, options=opts)) slipper_fitter = SlipperFitter(template_path, hits_path, hits_names, output_dir) placements: Dict[Chem.Mol, str | None] = {} for i, isomer in enumerate(isomers): scaffold_name: str = f'scaffold-{chr(65 + i)}' can_be_placed: str | None = slipper_fitter.check_scaffold(scaffold=isomer, scaffold_name=scaffold_name, scaffold_place_num=scaffold_place_num) # path to scaffold if successful placements[isomer] = can_be_placed # absolute path to minimised.mol scaffold, checked to exist if not any(placements.values()): logger.critical(f"Scaffold {scaffold} could not be placed successfully.") raise ScaffoldPlacementError(smiles=scaffold) return placements
[docs] def elaborate_from_cobbler_workshops(cobbler_workshops: List[CobblersWorkshop], template_path: str, hits_path: str, hits: List[str], batch_num: int, csv_path: str, output_dir: str, scaffold_placements: Dict[Chem.Mol, str], additional_info=None): """ Does elaboration once the cobbler workshops are created. """ if additional_info is None: additional_info = [] for workshop in cobbler_workshops: try: slipper = None final_library = workshop.get_final_library() if final_library is None: logger.warning(f"Could not get the final library for compound {workshop.product}. Skipping...") continue slipper = Slipper(library=final_library, template=template_path, hits_path=hits_path, hits_names=hits, batch_num=batch_num, atoms_ids_expansion=None, additional_info=additional_info, scaffold_placements=scaffold_placements) slipper.get_products() slipper.place_products() slipper.write_products_to_hippo() # only write at the end after placement, to get correct route_uuid slipper.clean_up_placements() except Exception as e: tb = traceback.format_exc() logger.critical(f"Error elaborating compound {workshop.product}. {tb}") structure_outputs.structure_pipeline_outputs(error=e, csv_path=csv_path, output_dir=output_dir, workshop=workshop if workshop is not None else None, slipper=slipper if slipper is not None else None) continue structure_outputs.structure_pipeline_outputs(csv_path=csv_path, output_dir=output_dir, error=None, workshop=workshop, slipper=slipper)
[docs] def start_elaboration(product: str, template_path: str, hits_path: str, hits: List[str], output_dir: str, scaffold_place_num: int, no_scaffold_place: bool) -> Tuple[float, Dict[Chem.Mol, str | None] | Dict]: """ Starts the elaboration of a single compound. """ logger.info(f'Starting: {product} | {inchi.MolToInchiKey(Chem.MolFromSmiles(product))}') start_time = time.time() if no_scaffold_place: logger.info(f"Skipping initial scaffold placement...") scaffold_placements: Dict[Chem.Mol, str | None] = {} else: logger.info(f"Placing scaffold...") scaffold_placements: Dict[Chem.Mol, str | None] = assert_scaffold_placement(scaffold=product, template_path=template_path, hits_path=hits_path, hits_names=hits, output_dir=output_dir, scaffold_place_num=scaffold_place_num) return start_time, scaffold_placements
[docs] def elaborate_compound_with_manual_routes(product: str, reactants: List[Tuple[str, str]], reaction_names: List[str], num_steps: int, hits: List[str], template_path: str, hits_path: str, batch_num: int, output_dir: str, csv_path: str, atom_diff_min: int, atom_diff_max: int, scaffold_place_num: int, scaffold_place: bool, no_scaffold_place: bool, additional_info=None): """ This function is used to elaborate a single compound using a manually defined route. """ start_time, scaffold_placements = start_elaboration(product=product, template_path=template_path, hits_path=hits_path, hits=hits, output_dir=output_dir, scaffold_place_num=scaffold_place_num, no_scaffold_place=no_scaffold_place) if not scaffold_place: # continue elaboration workshop = CobblersWorkshop(product=product, reactants=reactants, reaction_names=reaction_names, num_steps=num_steps, output_dir=output_dir, filter=False, id=fairy.generate_inchi_ID(product, isomeric=False), atom_diff_min=atom_diff_min, atom_diff_max=atom_diff_max) cobblers_workshops = [workshop] alternative_routes: List[CobblersWorkshop] | None = workshop.get_additional_routes(edit_route=True) if alternative_routes is not None: cobblers_workshops = [workshop] + alternative_routes elaborate_from_cobbler_workshops(cobbler_workshops=cobblers_workshops, template_path=template_path, hits_path=hits_path, hits=hits, batch_num=batch_num, additional_info=additional_info, csv_path=csv_path, output_dir=output_dir, scaffold_placements=scaffold_placements) end_time = time.time() elapsed_time = end_time - start_time logger.info( f"Finished Syndirella 👑 pipeline for compound {product} | {inchi.MolToInchiKey(Chem.MolFromSmiles(product))} " f"after {datetime.timedelta(seconds=elapsed_time)}") logger.info("")
[docs] def elaborate_compound_full_auto(product: str, hits: List[str], template_path: str, hits_path: str, batch_num: int, output_dir: str, csv_path: str, atom_diff_min: int, atom_diff_max: int, scaffold_place_num: int, scaffold_place: bool, no_scaffold_place: bool, additional_info=None): """ This function is used to elaborate a single compound. """ start_time, scaffold_placements = start_elaboration(product=product, template_path=template_path, hits_path=hits_path, hits=hits, output_dir=output_dir, scaffold_place_num=scaffold_place_num, no_scaffold_place=no_scaffold_place) if not scaffold_place: # continue elaboration cobbler = Cobbler(scaffold_compound=product, output_dir=output_dir, atom_diff_min=atom_diff_min, atom_diff_max=atom_diff_max) # check that output_dirs can be made for different routes cobbler_workshops: List[CobblersWorkshop] = cobbler.get_routes() elaborate_from_cobbler_workshops(cobbler_workshops=cobbler_workshops, template_path=template_path, hits_path=hits_path, hits=hits, batch_num=batch_num, additional_info=additional_info, csv_path=csv_path, output_dir=output_dir, scaffold_placements=scaffold_placements) end_time = time.time() elapsed_time = end_time - start_time logger.info( f"Finished Syndirella 👑 pipeline for compound {product} | {inchi.MolToInchiKey(Chem.MolFromSmiles(product))} " f"after {datetime.timedelta(seconds=elapsed_time)}") logger.info("")
#######################################
[docs] def run_pipeline(settings: Dict): """ Run the whole syndirella pipeline! 👑 OUTDATED DOCSTRING: Settings dict should have these keys: csv_path=settings['input'], output_dir=settings['output'], template_dir=settings['templates'], hits_path=settings['hits_path'], metadata_path=settings['metadata'], batch_num=settings['batch_num'], additional_columns=['compound_set'], # Will always be compound_set manual_routes=settings['manual'], atom_diff_min=settings['atom_diff_min'], atom_diff_max=settings['atom_diff_max'] """ def process_row(row: pd.Series, manual_routes: bool, scaffold_place: bool, no_scaffold_place: bool): additional_info: dict = check_inputs.format_additional_info(row, additional_columns) template_path: str = check_inputs.get_template_path( template_dir=template_dir, template=row['template'], metadata_path=metadata_path ) hits: List[str] = check_inputs.get_exact_hit_names(row=row, metadata_path=metadata_path, hits_path=hits_path) try: if manual_routes: reactants, reaction_names, num_steps = check_inputs.format_manual_route(row) elaborate_compound_with_manual_routes( product=row['smiles'], reactants=reactants, reaction_names=reaction_names, num_steps=num_steps, hits=hits, template_path=template_path, hits_path=hits_path, batch_num=batch_num, output_dir=output_dir, additional_info=additional_info, csv_path=csv_path, atom_diff_min=atom_diff_min, atom_diff_max=atom_diff_max, scaffold_place_num=scaffold_place_num, scaffold_place=scaffold_place, no_scaffold_place=no_scaffold_place ) else: elaborate_compound_full_auto( product=row['smiles'], hits=hits, template_path=template_path, hits_path=hits_path, batch_num=batch_num, output_dir=output_dir, additional_info=additional_info, csv_path=csv_path, atom_diff_min=atom_diff_min, atom_diff_max=atom_diff_max, scaffold_place_num=scaffold_place_num, scaffold_place=scaffold_place, no_scaffold_place=no_scaffold_place ) except Exception as e: tb = traceback.format_exc() logger.critical(f"Error elaborating compound {row['smiles']}. {tb}") structure_outputs.structure_pipeline_outputs( error=e, csv_path=csv_path, output_dir=output_dir, smiles=row['smiles'], template_path=template_path, hits=hits, additional_info=additional_info ) # set required variables try: additional_columns: List[str] = ['compound_set'] metadata_path: str = settings['metadata'] template_dir: str = settings['templates'] hits_path: str = settings['hits_path'] output_dir: str = settings['output'] batch_num: int = settings['batch_num'] csv_path: str = settings['input'] atom_diff_min: int = settings['atom_diff_min'] atom_diff_max: int = settings['atom_diff_max'] scaffold_place_num: int = settings['scaffold_place_num'] long_code_column: str = settings['long_code_column'] except KeyError as e: logger.critical(f"Missing critical argument to run pipeline: {e}") # set optional variables try: manual_routes: bool = settings['manual'] except KeyError: manual_routes = False try: scaffold_place: bool = settings['scaffold_place'] # If scaffold_place is True, only place scaffolds and do not continue to elaborate if scaffold_place: logger.info(f"Only placing scaffolds!") except KeyError: scaffold_place = False # Log pipeline type try: no_scaffold_place: bool = settings['no_scaffold_place'] if no_scaffold_place: logger.warning(f"Skipping initial scaffold placement! Immediately starting elaboration process.") except KeyError: no_scaffold_place = False if not scaffold_place: logger.info(f"Running the pipeline with {"manual" if manual_routes else "full auto"} routes.") # Validate inputs check_inputs.check_pipeline_inputs( csv_path=csv_path, template_dir=template_dir, hits_path=hits_path, metadata_path=metadata_path, additional_columns=additional_columns, manual_routes=manual_routes, long_code_column=long_code_column ) # Load data df = pd.read_csv(csv_path) # Process each row in the DataFrame for index, row in df.iterrows(): process_row(row=row, manual_routes=manual_routes, scaffold_place=scaffold_place, no_scaffold_place=no_scaffold_place) logger.info("Pipeline complete.")