Source code for syndirella.slipper.intra_geometry

#!venv/bin/env python3
"""
slipper/intra_geometry.py

This module was directly sourced from the PoseBusters repository, which is open source code under the BSD-3-Clause license.
The original code is found at: https://github.com/maabuu/posebusters/blob/3c467ab82bdbae5b71f80a286bb49ecc011be529/posebusters/modules/distance_geometry.py

It is used to check bond lengths, bond angles, and the internal clash of ligand conformations.
"""
from __future__ import annotations

from copy import deepcopy
from logging import getLogger
from typing import Any, Iterable

import numpy as np
import pandas as pd
from rdkit.Chem.rdchem import Mol
from rdkit.Chem.rdDistGeom import GetMoleculeBoundsMatrix
from rdkit.Chem.rdmolops import SanitizeMol

from syndirella.slipper.stats import bape, bpe, pe


[docs]
logger = getLogger(__name__)



[docs]
col_lb = "lower_bound"


[docs]
col_ub = "upper_bound"


[docs]
col_pe = "percent_error"


[docs]
col_bpe = "bound_percent_error"


[docs]
col_bape = "bound_absolute_percent_error"



[docs]
bound_matrix_params = {
    "set15bounds": True,
    "scaleVDW": True,
    "doTriangleSmoothing": True,
    "useMacrocycle14config": False,
}



[docs]
col_n_bonds = "number_bonds"


[docs]
col_shortest_bond = "shortest_bond_relative_length"


[docs]
col_longest_bond = "longest_bond_relative_length"


[docs]
col_n_short_bonds = "number_short_outlier_bonds"


[docs]
col_n_long_bonds = "number_long_outlier_bonds"


[docs]
col_n_good_bonds = "number_valid_bonds"


[docs]
col_bonds_result = "bond_lengths_within_bounds"


[docs]
col_n_angles = "number_angles"


[docs]
col_extremest_angle = "most_extreme_relative_angle"


[docs]
col_n_bad_angles = "number_outlier_angles"


[docs]
col_n_good_angles = "number_valid_angles"


[docs]
col_angles_result = "bond_angles_within_bounds"


[docs]
col_n_noncov = "number_noncov_pairs"


[docs]
col_closest_noncov = "shortest_noncovalent_relative_distance"


[docs]
col_n_clashes = "number_clashes"


[docs]
col_n_good_noncov = "number_valid_noncov_pairs"


[docs]
col_clash_result = "no_internal_clash"



[docs]
_empty_results = {
    col_n_bonds: np.nan,
    col_shortest_bond: np.nan,
    col_longest_bond: np.nan,
    col_n_short_bonds: np.nan,
    col_n_long_bonds: np.nan,
    col_bonds_result: np.nan,
    col_n_angles: np.nan,
    col_extremest_angle: np.nan,
    col_n_bad_angles: np.nan,
    col_angles_result: np.nan,
    col_n_noncov: np.nan,
    col_closest_noncov: np.nan,
    col_n_clashes: np.nan,
    col_clash_result: np.nan,
}




[docs]
def check_geometry(  # noqa: PLR0913, PLR0915
    mol_pred: Mol,
    threshold_bad_bond_length: float = 0.2,
    threshold_clash: float = 0.2,
    threshold_bad_angle: float = 0.2,
    bound_matrix_params: dict[str, Any] = bound_matrix_params,
    ignore_hydrogens: bool = True,
    sanitize: bool = True,
) -> dict[str, Any]:
    """Use RDKit distance geometry bounds to check the geometry of a molecule.

    Args:
        mol_pred: Predicted molecule (docked ligand). Only the first conformer will be checked.
        threshold_bad_bond_length: Bond length threshold in relative percentage. 0.2 means that bonds may be up to 20%
            longer than DG bounds. Defaults to 0.2.
        threshold_clash: Threshold for how much overlap constitutes a clash. 0.2 means that the two atoms may be up to
            80% of the lower bound apart. Defaults to 0.2.
        threshold_bad_angle: Bond angle threshold in relative percentage. 0.2 means that bonds may be up to 20%
            longer than DG bounds. Defaults to 0.2.
        bound_matrix_params: Parameters passe to RDKit's GetMoleculeBoundsMatrix function.
        ignore_hydrogens: Whether to ignore hydrogens. Defaults to True.
        sanitize: Sanitize molecule before running DG module (recommended). Defaults to True.

    Returns:
        PoseBusters results dictionary.
    """
    mol = deepcopy(mol_pred)
    results = _empty_results.copy()

    if mol.GetNumConformers() == 0:
        logger.warning("Molecule does not have a conformer.")
        return {"results": results}

    if mol.GetNumAtoms() == 1:
        logger.warning(f"Molecule has only {mol.GetNumAtoms()} atoms.")
        results[col_angles_result] = True
        results[col_bonds_result] = True
        results[col_clash_result] = True
        return {"results": results}

    # sanitize to ensure DG works or manually process molecule
    try:
        if sanitize:
            flags = SanitizeMol(mol)
            assert flags == 0, f"Sanitization failed with flags {flags}"
    except Exception:
        return {"results": results}

    # get bonds and angles
    bond_set = sorted(_get_bond_atom_indices(mol))  # tuples
    angles = sorted(_get_angle_atom_indices(bond_set))  # triples
    angle_set = {(a[0], a[2]): a for a in angles}  # {tuples : triples}

    if len(bond_set) == 0:
        logger.warning("Molecule has no bonds.")

    # distance geometry bounds, lower triangle min distances, upper triangle max distances
    bounds = GetMoleculeBoundsMatrix(mol, **bound_matrix_params)

    # indices
    lower_triangle_idcs = np.tril_indices(mol.GetNumAtoms(), k=-1)
    upper_triangle_idcs = (lower_triangle_idcs[1], lower_triangle_idcs[0])

    # 1,2- distances
    df_12 = pd.DataFrame()
    df_12["atom_pair"] = list(zip(*upper_triangle_idcs))  # indices have i1 < i2
    df_12["atom_types"] = [
        "--".join(tuple(mol.GetAtomWithIdx(int(j)).GetSymbol() for j in i)) for i in df_12["atom_pair"]
    ]
    df_12["angle"] = df_12["atom_pair"].apply(lambda x: angle_set.get(x, None))
    df_12["has_hydrogen"] = [_has_hydrogen(mol, i) for i in df_12["atom_pair"]]
    df_12["is_bond"] = [i in bond_set for i in df_12["atom_pair"]]
    df_12["is_angle"] = df_12["angle"].apply(lambda x: x is not None)
    df_12[col_lb] = bounds[lower_triangle_idcs]
    df_12[col_ub] = bounds[upper_triangle_idcs]

    # add observed dimensions
    conformer = mol.GetConformer()
    conf_distances = _pairwise_distance(conformer.GetPositions())
    df_12["conf_id"] = conformer.GetId()
    df_12["distance"] = conf_distances[lower_triangle_idcs]

    if ignore_hydrogens:
        df_12 = df_12.loc[~df_12["has_hydrogen"], :]

    # calculate violations
    df_bonds = _bond_check(df_12)
    df_clash = _clash_check(df_12)
    df_angles = _angle_check(df_12)

    # bond statistics
    results[col_n_bonds] = len(df_bonds)
    results[col_n_short_bonds] = sum(df_bonds[col_pe] < -threshold_bad_bond_length)
    results[col_n_long_bonds] = sum(df_bonds[col_pe] > threshold_bad_bond_length)
    results[col_n_good_bonds] = results[col_n_bonds] - results[col_n_short_bonds] - results[col_n_long_bonds]
    results[col_bonds_result] = results[col_n_good_bonds] == results[col_n_bonds]
    results[col_shortest_bond] = (df_bonds["distance"] / df_bonds["lower_bound"]).min()
    results[col_longest_bond] = (df_bonds["distance"] / df_bonds["upper_bound"]).max()

    # angle statistics
    results[col_n_angles] = len(df_angles)
    results[col_n_bad_angles] = sum(df_angles[col_bape] > threshold_bad_angle)
    results[col_n_good_angles] = results[col_n_angles] - results[col_n_bad_angles]
    results[col_angles_result] = results[col_n_good_angles] == results[col_n_angles]
    lb_extreme_angle = 2 - (df_angles["distance"] / df_angles["lower_bound"]).min()
    ub_extreme_angle = (df_angles["distance"] / df_angles["upper_bound"]).max()
    results[col_extremest_angle] = max(lb_extreme_angle, ub_extreme_angle)

    # steric clash statistics
    results[col_n_noncov] = len(df_clash)
    results[col_n_clashes] = sum(df_clash[col_bpe] < -threshold_clash)
    results[col_n_good_noncov] = results[col_n_noncov] - results[col_n_clashes]
    results[col_clash_result] = results[col_n_good_noncov] == results[col_n_noncov]
    results[col_closest_noncov] = (df_clash["distance"] / df_clash["lower_bound"]).min()

    return {"results": results, "details": {"bonds": df_bonds, "clash": df_clash, "angles": df_angles}}




[docs]
def _bond_check(df: pd.DataFrame) -> pd.DataFrame:
    # bonds can be too short or too long
    df = df[df["is_bond"]].copy()
    df[col_pe] = df.apply(lambda x: bpe(*x[["distance", col_lb, col_ub]]), axis=1)
    return df




[docs]
def _angle_check(df: pd.DataFrame) -> pd.DataFrame:
    # angles have no direction (we do not know if larger or bigger beyond bounds)
    df = df[(~df["is_bond"]) & (df["is_angle"])].copy()
    df[col_bape] = df.apply(lambda x: bape(*x[["distance", col_lb, col_ub]]), axis=1)
    return df




[docs]
def _clash_check(df: pd.DataFrame) -> pd.DataFrame:
    # clash is only when lower bound is violated
    df = df[(~df["is_bond"]) & (~df["is_angle"])].copy()

    def _lb_pe(value, lower_bound):
        if value >= lower_bound:
            return 0.0
        return pe(value, lower_bound)

    df[col_bpe] = df.apply(lambda x: _lb_pe(*x[["distance", col_lb]]), axis=1)
    return df




[docs]
def _get_bond_atom_indices(mol: Mol) -> list[tuple[int, int]]:
    bonds = []
    for bond in mol.GetBonds():
        bond_tuple = (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        bond_tuple = _sort_bond(bond_tuple)
        bonds.append(bond_tuple)
    return bonds




[docs]
def _get_angle_atom_indices(bonds: list[tuple[int, int]]) -> list[tuple[int, int, int]]:
    """Check all combinations of bonds to generate list of molecule angles."""
    angles = []
    bonds = list(bonds)
    for i in range(len(bonds)):
        for j in range(i + 1, len(bonds)):
            angle = _two_bonds_to_angle(bonds[i], bonds[j])
            if angle is not None:
                angles.append(angle)
    return angles




[docs]
def _two_bonds_to_angle(bond1: tuple[int, int], bond2: tuple[int, int]) -> None | tuple[int, int, int]:
    set1 = set(bond1)
    set2 = set(bond2)
    all_atoms = set1 | set2
    # angle requires two bonds to share exactly one atom, that is we must have 3 atoms
    if len(all_atoms) != 3:  # noqa: PLR2004
        return None
    # find shared atom
    shared_atom = set1 & set2
    other_atoms = all_atoms - shared_atom
    return (min(other_atoms), shared_atom.pop(), max(other_atoms))




[docs]
def _sort_bond(bond: tuple[int, int]) -> tuple[int, int]:
    return (min(bond), max(bond))




[docs]
def _pairwise_distance(x):
    return np.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)




[docs]
def _has_hydrogen(mol: Mol, idcs: Iterable[int]) -> bool:
    return any(_is_hydrogen(mol, idx) for idx in idcs)




[docs]
def _is_hydrogen(mol: Mol, idx: int) -> bool:
    return mol.GetAtomWithIdx(int(idx)).GetAtomicNum() == 1