#!/usr/bin/env python3
"""
syndirella.structure_outputs.py
This module contains the functions used to structure the pipeline outputs.
"""
import logging
import os
import traceback
from argparse import ArgumentError
from typing import *
import glob2
import pandas as pd
from rdkit import Chem
import syndirella.fairy as fairy
from syndirella.route.CobblersWorkshop import CobblersWorkshop
from syndirella.slipper.Slipper import Slipper
[docs]
# Module-level logger named after this module's import path (``__name__``).
# No handlers or levels are configured in this section of the file.
logger = logging.getLogger(__name__)
[docs]
def add_route_info(reaction_names: List[str],
                   reactants: List[Tuple[str]],
                   route_uuid: str) -> Dict:
    """
    Flatten a route into one dictionary with a separate column per step and reactant.

    The result always holds ``route_uuid``. For step number ``k`` (counted from 1) it holds
    ``{k}_reaction`` and ``{k}_r1_smiles``; ``{k}_r2_smiles`` is only written when the
    step has a second reactant.

    :param reaction_names: reaction name of every step, in route order.
    :param reactants: reactants of every step, indexed in parallel with ``reaction_names``.
    :param route_uuid: identifier stored under the ``route_uuid`` key.
    :returns: the flattened route dictionary.
    """
    columns: Dict[str, str] = {'route_uuid': route_uuid}
    for number, reaction in enumerate(reaction_names, start=1):
        columns[f'{number}_reaction'] = reaction
        step_reactants = reactants[number - 1]
        columns[f'{number}_r1_smiles'] = step_reactants[0]
        # Single-reactant steps simply get no r2 column.
        if len(step_reactants) > 1:
            columns[f'{number}_r2_smiles'] = step_reactants[1]
    return columns
[docs]
def add_outcome_info(slipper: Slipper | None = None,
                     template_path: str | None = None,
                     hits_names: List[str] | None = None) -> Dict[str, Any]:
    """
    Build the placement/outcome columns of an output row (None where not attempted).

    Without a slipper, every count is ``None``, ``template`` is ``template_path`` and the
    hit columns come from ``hits_names``. With a slipper, each value is read from the
    slipper attribute of the matching name; an attribute the slipper does not have falls
    back to the value that would have been used without a slipper.

    :param slipper: object holding placement results, or None if placement was not attempted.
    :param template_path: template to report when the slipper does not provide one.
    :param hits_names: hit names to report when the slipper does not provide them.
    :returns: dict with the keys ``total_num_unique_products``,
        ``total_num_products_enumstereo``, ``num_placed``, ``num_successful``,
        ``to_hippo``, ``template`` and one ``hit{i}`` key per hit (numbered from 1).
    """
    hits = hits_names
    outcome: Dict[str, Any] = {
        'total_num_unique_products': None,
        'total_num_products_enumstereo': None,
        'num_placed': None,
        'num_successful': None,
        'to_hippo': None,
        'template': template_path,
    }
    # Explicit None check: whether a slipper was supplied must not depend on its truthiness.
    if slipper is not None:
        outcome['total_num_unique_products'] = getattr(slipper, 'num_unique_products', None)
        outcome['total_num_products_enumstereo'] = getattr(slipper, 'num_products_enumstereo', None)
        outcome['num_placed'] = getattr(slipper, 'num_placed', None)
        outcome['num_successful'] = getattr(slipper, 'num_successful', None)
        outcome['to_hippo'] = getattr(slipper, 'to_hippo_path', None)
        outcome['template'] = getattr(slipper, 'template', template_path)
        hits = getattr(slipper, 'hits_names', hits_names)
    # The slipper attribute (or the argument) may be None; treat that as "no hits"
    # instead of letting enumerate(None) raise TypeError and lose the whole row.
    if hits is None:
        hits = []
    for number, hit in enumerate(hits, start=1):
        outcome[f"hit{number}"] = hit
    return outcome
[docs]
def get_output_df(csv_path: str,
output_dir: str) -> Tuple[pd.DataFrame, str | None]:
"""
Given the csv_path and output_dir, checks and reads in the most recent previous output df.
Format of output csv name: [name_of_input_csv]_output_YYYYMMDD_HHMM.csv
"""
past_csv_path: str | None = None
if not os.path.exists(csv_path):
raise FileNotFoundError(f"Could not find csv file at {csv_path}")
csv_name: str = os.path.basename(csv_path).split('.')[0]
# look for a csv that contains csv name in output dir
output_csvs: List[str] = glob2.glob(os.path.join(output_dir, f'*{csv_name}_output*'))
if len(output_csvs) == 0:
# does not exist yet, make new blank df
df = pd.DataFrame(columns=['smiles', 'inchi_key', 'route_uuid', 'error_type', 'error_message',
'num_placed', 'num_successful', '1_reaction', '1_r1_smiles', 'hit1', 'template',
'to_hippo'])
return df, past_csv_path
else:
# Sort files by their date and time in the filename
output_csvs.sort(
key=lambda x: os.path.basename(x).split('_')[-2] + os.path.basename(x).split('_')[-1].split('.')[0],
reverse=True)
# make sure pandas can read in the csv
try:
df = pd.read_csv(output_csvs[0])
past_csv_path: str = output_csvs[0]
except pd.errors.EmptyDataError:
df = pd.DataFrame(columns=['smiles', 'inchi_key', 'route_uuid', 'error_type', 'error_message',
'num_placed', 'num_successful', '1_reaction', '1_r1_smiles', 'hit1', 'template',
'to_hippo'])
return df, past_csv_path
[docs]
def get_scaffold_smiles(error: Exception | None, smiles: str | None, workshop: CobblersWorkshop | None) -> str:
    """
    Get the scaffold smiles from the most detailed source available.

    Sources are tried in this order, and the first one that yields a value wins:

    1. ``error.mol`` converted with ``Chem.MolToSmiles``,
    2. ``error.smiles``,
    3. ``workshop.product``,
    4. the ``smiles`` argument.

    A source that is missing (no such attribute), None, or (for the mol) not convertible
    is skipped rather than aborting the lookup.

    :param error: exception that may carry ``mol`` and/or ``smiles`` attributes, or None.
    :param smiles: fallback smiles string, or None.
    :param workshop: object that may carry a ``product`` attribute, or None.
    :returns: the scaffold smiles.
    :raises ValueError: if none of the sources yields a value.
    """
    if error is not None:  # highest level of detail
        # Not every error type defines mol/smiles, so read them defensively.
        mol = getattr(error, 'mol', None)
        if mol is not None:
            try:
                return Chem.MolToSmiles(mol)
            except Exception:
                # Best effort: a mol that cannot be converted falls through to the next
                # source. NOTE(review): RDKit signals bad arguments with its own
                # Boost.Python exception, not argparse.ArgumentError -- confirm.
                pass
        error_smiles = getattr(error, 'smiles', None)
        if error_smiles is not None:
            return error_smiles
    if workshop is not None:  # next level of detail
        product = getattr(workshop, 'product', None)
        if product is not None:
            return product
    if smiles is not None:  # lowest level of detail
        return smiles
    raise ValueError("No scaffold found.")
[docs]
def get_error_info(error: Exception | None) -> Tuple[str | None, str | None, bool]:
"""
This function gets the error type and message from the error.
"""
if error is not None:
error_type: str | None = type(error).__name__
try:
error_message: str | None = error.message # custom error
custom_error: bool = True
except AttributeError:
error_message = error.args[0] # any other error
custom_error = False
else:
error_type = ''
error_message = ''
custom_error = False
return error_type, error_message, custom_error
[docs]
def get_inchi(scaffold: str, workshop: CobblersWorkshop | None) -> str:
    """
    Get the inchi identifier from the workshop, or generate it from the scaffold.

    :param scaffold: scaffold smiles used to generate the identifier when the workshop
        cannot provide one.
    :param workshop: object whose ``id`` attribute is returned when present, or None.
    :returns: ``workshop.id`` if the workshop has that attribute, otherwise
        ``fairy.generate_inchi_ID(scaffold, isomeric=False)``.
    """
    if workshop is not None:
        try:
            return workshop.id
        except AttributeError:
            # Workshop without an id: fall through and generate one from the scaffold
            # instead of implicitly returning None.
            pass
    return fairy.generate_inchi_ID(scaffold, isomeric=False)
[docs]
def check_route_to_add(workshop: CobblersWorkshop | None) -> bool:
    """
    Tell whether the workshop object defines a route.

    Returns True only if a workshop is given and it has all of the attributes
    ``route_uuid``, ``reaction_names`` and ``reactants``.
    Returns False otherwise.
    """
    if workshop is None:
        return False
    route_attributes = ('route_uuid', 'reaction_names', 'reactants')
    return all(hasattr(workshop, attribute) for attribute in route_attributes)
[docs]
def check_placement_to_add(slipper: Slipper | None) -> bool:
    """
    Tell whether the slipper object holds placement information.

    ``hits_names`` is used as the marker for the least amount of placement info.
    Returns True if a slipper is given and it has a ``hits_names`` attribute.
    Returns False otherwise.
    """
    if slipper is None:
        return False
    return hasattr(slipper, 'hits_names')
[docs]
def check_additional_info_to_add(slipper: Slipper | None) -> bool:
    """
    Tell whether the slipper object holds additional information.

    Returns True if a slipper is given and it has an ``additional_info`` attribute.
    Returns False otherwise.
    """
    if slipper is None:
        return False
    return hasattr(slipper, 'additional_info')
[docs]
def add_new_route_to_output_df(output_df: pd.DataFrame, row: Dict) -> pd.DataFrame:
    """
    Append ``row`` to the output dataframe, replacing an existing entry for the same route.

    The columns of the dataframe and of the row are merged; values for columns that were
    not present before are left missing. An existing entry whose ``inchi_key`` and
    ``route_uuid`` both equal those of ``row`` is removed before the row is appended.
    In the result, ``smiles``, ``inchi_key``, ``error_type`` and ``error_message`` are
    moved to the front when all four columns exist.

    :param output_df: dataframe of previously recorded rows.
    :param row: the new row as a mapping of column name to value.
    :returns: a new dataframe containing the kept rows plus the new row.
    """
    new_row_df = pd.DataFrame([row])

    # Bring both frames onto the same set of columns.
    all_columns = output_df.columns.union(new_row_df.columns)
    output_df = output_df.reindex(columns=all_columns)
    new_row_df = new_row_df.reindex(columns=all_columns)

    # Find entries that describe the same route as the incoming row.
    stale_labels = []
    for label, existing in output_df.iterrows():
        try:
            same_route = existing['inchi_key'] == row['inchi_key'] and existing['route_uuid'] == row['route_uuid']
        except KeyError:
            continue
        if same_route:
            logger.info(f"Route already exists in output csv, replacing.")
            stale_labels.append(label)
    if stale_labels:
        output_df = output_df.drop(index=stale_labels)

    output_df = pd.concat([output_df, new_row_df], ignore_index=True)

    # smiles, inchi_key, error_type, error_message always come first
    leading = ['smiles', 'inchi_key', 'error_type', 'error_message']
    trailing = [column for column in output_df.columns if column not in leading]
    try:
        output_df = output_df[leading + trailing]
    except KeyError:
        pass
    return output_df
[docs]
def save_output_df(output_df: pd.DataFrame, output_dir: str, csv_path: str):
    """
    Save the output dataframe to a csv file in the output directory.

    Format of output csv name: [name_of_input_csv]_output_YYYYMMDD_HHMM.csv, where the
    timestamp is the current local time and [name_of_input_csv] is the basename of
    ``csv_path`` up to its first dot. The output directory is created if it does not
    exist yet. The dataframe index is not written.

    :param output_df: dataframe to write.
    :param output_dir: directory the csv is written into.
    :param csv_path: path of the input csv the output belongs to (only its name is used).
    """
    csv_name: str = os.path.basename(csv_path).split('.')[0]
    timestamp: str = pd.Timestamp.now().strftime("%Y%m%d_%H%M")
    output_name: str = f'{csv_name}_output_{timestamp}.csv'
    # to_csv cannot write into a missing directory; create it so results computed so far
    # are not lost at the very last step. An empty output_dir means the current directory.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_csv_path: str = os.path.join(output_dir, output_name)
    output_df.to_csv(output_csv_path, index=False)
    logger.info(f"Output csv saved to {output_csv_path}")
[docs]
def structure_route_outputs(error_message: str | None, error_type: str | None, output_df: pd.DataFrame,
                            workshop: CobblersWorkshop | None, slipper: Slipper | None, scaffold: str,
                            template_path: str | None, hits: List[str] | None,
                            additional_info: Dict | None) -> pd.DataFrame:
    """
    Assemble the outputs of one route into a single row and add it to the output dataframe.

    The row starts with ``smiles``, ``inchi_key``, ``error_type`` and ``error_message``.
    Route columns are added when the workshop defines a route. Outcome columns are always
    added: from the slipper when it holds placement information, otherwise from
    ``template_path`` and ``hits``. Finally the additional info is merged in; the
    slipper's ``additional_info`` is used whenever the slipper has that attribute, and
    the ``additional_info`` argument only otherwise.

    :returns: the output dataframe with the new row added.
    """
    row: Dict = {
        'smiles': scaffold,
        'inchi_key': get_inchi(scaffold=scaffold, workshop=workshop),
        'error_type': error_type,
        'error_message': error_message,
    }

    if check_route_to_add(workshop):  # route exists
        row.update(add_route_info(reaction_names=workshop.reaction_names,
                                  reactants=workshop.reactants,
                                  route_uuid=workshop.route_uuid))

    if check_placement_to_add(slipper):  # slipper exists and at least contains hits_names
        outcome = add_outcome_info(slipper=slipper)
    else:
        outcome = add_outcome_info(template_path=template_path, hits_names=hits)
    row.update(outcome)

    # The slipper's additional info takes the place of the one passed in.
    extra = slipper.additional_info if check_additional_info_to_add(slipper) else additional_info
    if extra is not None:
        row.update(extra)

    return add_new_route_to_output_df(output_df=output_df, row=row)
##########################################################
[docs]
def structure_pipeline_outputs(error: Exception | None,
                               csv_path: str,
                               output_dir: str,
                               workshop: CobblersWorkshop | None = None,
                               slipper: Slipper | None = None,
                               smiles: str | None = None,
                               template_path: str | None = None,
                               hits: List[str] | None = None,
                               additional_info: Dict | None = None):
    """
    Structure outputs of pipeline.

    Reads the most recent previous output csv (if any), adds one row describing this
    scaffold/route and its error or outcome, and writes the result as a new output csv
    that replaces the previous one.

    The previous csv is only discarded once the new one has been written: it is moved
    aside to ``<previous>.bak`` first, restored if saving fails, and deleted on success.

    TypeError, ValueError and FileNotFoundError raised while structuring are logged and
    not re-raised; other exceptions propagate to the caller.

    :param error: the exception the pipeline ended with, or None on success.
    :param csv_path: path of the input csv.
    :param output_dir: directory holding the output csvs.
    :param workshop: route information, if a route was built.
    :param slipper: placement information, if placement was attempted.
    :param smiles: scaffold smiles used when no more detailed source is available.
    :param template_path: template used when the slipper provides none.
    :param hits: hit names used when the slipper provides none.
    :param additional_info: extra columns used when the slipper provides none.
    """
    try:
        output_df, past_csv_path = get_output_df(csv_path=csv_path, output_dir=output_dir)
        error_type, error_message, custom_error = get_error_info(error=error)
        # Only custom errors are consulted for the scaffold.
        scaffold: str = get_scaffold_smiles(error=error if custom_error else None,
                                            smiles=smiles,
                                            workshop=workshop)
        output_df = structure_route_outputs(error_message=error_message,
                                            error_type=error_type,
                                            output_df=output_df,
                                            workshop=workshop,
                                            slipper=slipper,
                                            scaffold=scaffold,
                                            template_path=template_path,
                                            hits=hits,
                                            additional_info=additional_info)
        # Move the previous csv aside instead of deleting it up front. The new csv can
        # get the very same name (the timestamp has minute resolution), so the old file
        # cannot simply be removed after saving either.
        backup_path: str | None = None
        if past_csv_path is not None:
            backup_path = f'{past_csv_path}.bak'
            os.replace(past_csv_path, backup_path)
        try:
            save_output_df(output_df=output_df, output_dir=output_dir, csv_path=csv_path)
        except BaseException:
            # Saving failed: put the previous results back before reporting the error.
            if backup_path is not None:
                os.replace(backup_path, past_csv_path)
            raise
        if backup_path is not None:
            os.remove(backup_path)  # delete previous output csv
            logger.info(f"Deleted previous output csv at {past_csv_path}")
    except (TypeError, ValueError, FileNotFoundError):
        logger.error(f"Could not structure pipeline outputs.")
        logger.error(traceback.format_exc())
        # don't raise error, just log it