#!venv/bin/env python3
"""
slipper/intra_geometry.py
This module was directly sourced from the PoseBusters repository, which is open source code under the BSD-3-Clause license.
The original code is found at: https://github.com/maabuu/posebusters/blob/3c467ab82bdbae5b71f80a286bb49ecc011be529/posebusters/modules/distance_geometry.py
It is used to check bond lengths, bond angles, and the internal clash of ligand conformations.
"""
from __future__ import annotations
from copy import deepcopy
from logging import getLogger
from typing import Any, Iterable
import numpy as np
import pandas as pd
from rdkit.Chem.rdchem import Mol
from rdkit.Chem.rdDistGeom import GetMoleculeBoundsMatrix
from rdkit.Chem.rdmolops import SanitizeMol
from syndirella.slipper.stats import bape, bpe, pe
[docs]
logger = getLogger(__name__)
[docs]
col_pe = "percent_error"
[docs]
col_bpe = "bound_percent_error"
[docs]
col_bape = "bound_absolute_percent_error"
[docs]
bound_matrix_params = {
"set15bounds": True,
"scaleVDW": True,
"doTriangleSmoothing": True,
"useMacrocycle14config": False,
}
[docs]
col_n_bonds = "number_bonds"
[docs]
col_shortest_bond = "shortest_bond_relative_length"
[docs]
col_longest_bond = "longest_bond_relative_length"
[docs]
col_n_short_bonds = "number_short_outlier_bonds"
[docs]
col_n_long_bonds = "number_long_outlier_bonds"
[docs]
col_n_good_bonds = "number_valid_bonds"
[docs]
col_bonds_result = "bond_lengths_within_bounds"
[docs]
col_n_angles = "number_angles"
[docs]
col_extremest_angle = "most_extreme_relative_angle"
[docs]
col_n_bad_angles = "number_outlier_angles"
[docs]
col_n_good_angles = "number_valid_angles"
[docs]
col_angles_result = "bond_angles_within_bounds"
[docs]
col_n_noncov = "number_noncov_pairs"
[docs]
col_closest_noncov = "shortest_noncovalent_relative_distance"
[docs]
col_n_clashes = "number_clashes"
[docs]
col_n_good_noncov = "number_valid_noncov_pairs"
[docs]
col_clash_result = "no_internal_clash"
[docs]
_empty_results = {
col_n_bonds: np.nan,
col_shortest_bond: np.nan,
col_longest_bond: np.nan,
col_n_short_bonds: np.nan,
col_n_long_bonds: np.nan,
col_bonds_result: np.nan,
col_n_angles: np.nan,
col_extremest_angle: np.nan,
col_n_bad_angles: np.nan,
col_angles_result: np.nan,
col_n_noncov: np.nan,
col_closest_noncov: np.nan,
col_n_clashes: np.nan,
col_clash_result: np.nan,
}
[docs]
def check_geometry( # noqa: PLR0913, PLR0915
mol_pred: Mol,
threshold_bad_bond_length: float = 0.2,
threshold_clash: float = 0.2,
threshold_bad_angle: float = 0.2,
bound_matrix_params: dict[str, Any] = bound_matrix_params,
ignore_hydrogens: bool = True,
sanitize: bool = True,
) -> dict[str, Any]:
"""Use RDKit distance geometry bounds to check the geometry of a molecule.
Args:
mol_pred: Predicted molecule (docked ligand). Only the first conformer will be checked.
threshold_bad_bond_length: Bond length threshold in relative percentage. 0.2 means that bonds may be up to 20%
longer than DG bounds. Defaults to 0.2.
threshold_clash: Threshold for how much overlap constitutes a clash. 0.2 means that the two atoms may be up to
80% of the lower bound apart. Defaults to 0.2.
threshold_bad_angle: Bond angle threshold in relative percentage. 0.2 means that bonds may be up to 20%
longer than DG bounds. Defaults to 0.2.
bound_matrix_params: Parameters passe to RDKit's GetMoleculeBoundsMatrix function.
ignore_hydrogens: Whether to ignore hydrogens. Defaults to True.
sanitize: Sanitize molecule before running DG module (recommended). Defaults to True.
Returns:
PoseBusters results dictionary.
"""
mol = deepcopy(mol_pred)
results = _empty_results.copy()
if mol.GetNumConformers() == 0:
logger.warning("Molecule does not have a conformer.")
return {"results": results}
if mol.GetNumAtoms() == 1:
logger.warning(f"Molecule has only {mol.GetNumAtoms()} atoms.")
results[col_angles_result] = True
results[col_bonds_result] = True
results[col_clash_result] = True
return {"results": results}
# sanitize to ensure DG works or manually process molecule
try:
if sanitize:
flags = SanitizeMol(mol)
assert flags == 0, f"Sanitization failed with flags {flags}"
except Exception:
return {"results": results}
# get bonds and angles
bond_set = sorted(_get_bond_atom_indices(mol)) # tuples
angles = sorted(_get_angle_atom_indices(bond_set)) # triples
angle_set = {(a[0], a[2]): a for a in angles} # {tuples : triples}
if len(bond_set) == 0:
logger.warning("Molecule has no bonds.")
# distance geometry bounds, lower triangle min distances, upper triangle max distances
bounds = GetMoleculeBoundsMatrix(mol, **bound_matrix_params)
# indices
lower_triangle_idcs = np.tril_indices(mol.GetNumAtoms(), k=-1)
upper_triangle_idcs = (lower_triangle_idcs[1], lower_triangle_idcs[0])
# 1,2- distances
df_12 = pd.DataFrame()
df_12["atom_pair"] = list(zip(*upper_triangle_idcs)) # indices have i1 < i2
df_12["atom_types"] = [
"--".join(tuple(mol.GetAtomWithIdx(int(j)).GetSymbol() for j in i)) for i in df_12["atom_pair"]
]
df_12["angle"] = df_12["atom_pair"].apply(lambda x: angle_set.get(x, None))
df_12["has_hydrogen"] = [_has_hydrogen(mol, i) for i in df_12["atom_pair"]]
df_12["is_bond"] = [i in bond_set for i in df_12["atom_pair"]]
df_12["is_angle"] = df_12["angle"].apply(lambda x: x is not None)
df_12[col_lb] = bounds[lower_triangle_idcs]
df_12[col_ub] = bounds[upper_triangle_idcs]
# add observed dimensions
conformer = mol.GetConformer()
conf_distances = _pairwise_distance(conformer.GetPositions())
df_12["conf_id"] = conformer.GetId()
df_12["distance"] = conf_distances[lower_triangle_idcs]
if ignore_hydrogens:
df_12 = df_12.loc[~df_12["has_hydrogen"], :]
# calculate violations
df_bonds = _bond_check(df_12)
df_clash = _clash_check(df_12)
df_angles = _angle_check(df_12)
# bond statistics
results[col_n_bonds] = len(df_bonds)
results[col_n_short_bonds] = sum(df_bonds[col_pe] < -threshold_bad_bond_length)
results[col_n_long_bonds] = sum(df_bonds[col_pe] > threshold_bad_bond_length)
results[col_n_good_bonds] = results[col_n_bonds] - results[col_n_short_bonds] - results[col_n_long_bonds]
results[col_bonds_result] = results[col_n_good_bonds] == results[col_n_bonds]
results[col_shortest_bond] = (df_bonds["distance"] / df_bonds["lower_bound"]).min()
results[col_longest_bond] = (df_bonds["distance"] / df_bonds["upper_bound"]).max()
# angle statistics
results[col_n_angles] = len(df_angles)
results[col_n_bad_angles] = sum(df_angles[col_bape] > threshold_bad_angle)
results[col_n_good_angles] = results[col_n_angles] - results[col_n_bad_angles]
results[col_angles_result] = results[col_n_good_angles] == results[col_n_angles]
lb_extreme_angle = 2 - (df_angles["distance"] / df_angles["lower_bound"]).min()
ub_extreme_angle = (df_angles["distance"] / df_angles["upper_bound"]).max()
results[col_extremest_angle] = max(lb_extreme_angle, ub_extreme_angle)
# steric clash statistics
results[col_n_noncov] = len(df_clash)
results[col_n_clashes] = sum(df_clash[col_bpe] < -threshold_clash)
results[col_n_good_noncov] = results[col_n_noncov] - results[col_n_clashes]
results[col_clash_result] = results[col_n_good_noncov] == results[col_n_noncov]
results[col_closest_noncov] = (df_clash["distance"] / df_clash["lower_bound"]).min()
return {"results": results, "details": {"bonds": df_bonds, "clash": df_clash, "angles": df_angles}}
[docs]
def _bond_check(df: pd.DataFrame) -> pd.DataFrame:
# bonds can be too short or too long
df = df[df["is_bond"]].copy()
df[col_pe] = df.apply(lambda x: bpe(*x[["distance", col_lb, col_ub]]), axis=1)
return df
[docs]
def _angle_check(df: pd.DataFrame) -> pd.DataFrame:
# angles have no direction (we do not know if larger or bigger beyond bounds)
df = df[(~df["is_bond"]) & (df["is_angle"])].copy()
df[col_bape] = df.apply(lambda x: bape(*x[["distance", col_lb, col_ub]]), axis=1)
return df
[docs]
def _clash_check(df: pd.DataFrame) -> pd.DataFrame:
# clash is only when lower bound is violated
df = df[(~df["is_bond"]) & (~df["is_angle"])].copy()
def _lb_pe(value, lower_bound):
if value >= lower_bound:
return 0.0
return pe(value, lower_bound)
df[col_bpe] = df.apply(lambda x: _lb_pe(*x[["distance", col_lb]]), axis=1)
return df
[docs]
def _get_bond_atom_indices(mol: Mol) -> list[tuple[int, int]]:
bonds = []
for bond in mol.GetBonds():
bond_tuple = (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
bond_tuple = _sort_bond(bond_tuple)
bonds.append(bond_tuple)
return bonds
[docs]
def _get_angle_atom_indices(bonds: list[tuple[int, int]]) -> list[tuple[int, int, int]]:
"""Check all combinations of bonds to generate list of molecule angles."""
angles = []
bonds = list(bonds)
for i in range(len(bonds)):
for j in range(i + 1, len(bonds)):
angle = _two_bonds_to_angle(bonds[i], bonds[j])
if angle is not None:
angles.append(angle)
return angles
[docs]
def _two_bonds_to_angle(bond1: tuple[int, int], bond2: tuple[int, int]) -> None | tuple[int, int, int]:
set1 = set(bond1)
set2 = set(bond2)
all_atoms = set1 | set2
# angle requires two bonds to share exactly one atom, that is we must have 3 atoms
if len(all_atoms) != 3: # noqa: PLR2004
return None
# find shared atom
shared_atom = set1 & set2
other_atoms = all_atoms - shared_atom
return (min(other_atoms), shared_atom.pop(), max(other_atoms))
[docs]
def _sort_bond(bond: tuple[int, int]) -> tuple[int, int]:
return (min(bond), max(bond))
[docs]
def _pairwise_distance(x):
return np.linalg.norm(x[:, None, :] - x[None, :, :], axis=-1)
[docs]
def _has_hydrogen(mol: Mol, idcs: Iterable[int]) -> bool:
return any(_is_hydrogen(mol, idx) for idx in idcs)
[docs]
def _is_hydrogen(mol: Mol, idx: int) -> bool:
return mol.GetAtomWithIdx(int(idx)).GetAtomicNum() == 1