Source code for openff.pablo.residue

"""
Classes for defining custom residues.
"""

import dataclasses
import itertools
import logging
from collections import defaultdict
from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
from contextlib import contextmanager
from copy import deepcopy
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING, DefaultDict, Literal, Self, TextIO

from openff.toolkit.topology import Molecule
from openff.toolkit.utils import RDKitToolkitWrapper
from openff.units import elements, unit

from openff.pablo._graph import Graph
from openff.pablo._utils import (
    __UNSET__,
    draw_molecule,
    flatten,
    react,
    sort_tuple,
    unwrap,
)
from openff.pablo.exceptions import PabloError, ResidueValidationError

if TYPE_CHECKING:
    import IPython.core.display

__all__ = [
    "AtomDefinition",
    "BondDefinition",
    "ResidueDefinition",
]

_residue_definition_skip_validation = False


@contextmanager
def _skip_residue_definition_validation():  # type: ignore[deadcode]
    global _residue_definition_skip_validation
    _residue_definition_skip_validation = True
    yield
    _residue_definition_skip_validation = False


[docs]@dataclass(frozen=True) class AtomDefinition: """ Description of an atom in a residue from the Chemical Component Dictionary (CCD). """ name: str """The canonical name of this atom""" synonyms: tuple[str, ...] """Other names this atom can have""" symbol: str """The elemental symbol for this atom""" leaving: bool """ Whether this atom is absent when a bond is formed between two residues. When a bond is formed between this residue and another, a "leaving fragment" must be removed to satisfy the valence model with the new bond. The leaving fragment is computed by identifying a single leaving atom bonded to each of the atoms forming the bond, as well as all leaving atoms bonded to those first leaving atoms. See also -------- openff.pablo.ResidueDefinition.linking_bond, openff.pablo.ResidueDefinition.crosslink """ charge: int """The formal charge of this atom""" aromatic: bool """Whether this atom is aromatic""" stereo: Literal["S", "R"] | None """The chirality of this atom"""
[docs] @classmethod def with_defaults( cls, name: str, symbol: str, *, synonyms: Iterable[str] = (), leaving: bool = False, charge: int = 0, aromatic: bool = False, stereo: Literal["S", "R"] | None = None, ) -> Self: """ Construct a new atom definition, optionally using some default values. """ return cls( name=name, symbol=symbol, synonyms=tuple(synonyms), leaving=leaving, charge=charge, aromatic=aromatic, stereo=stereo, )
[docs] def replace( self, *, name: str | __UNSET__ = __UNSET__(), symbol: str | __UNSET__ = __UNSET__(), synonyms: Iterable[str] | __UNSET__ = __UNSET__(), leaving: bool | __UNSET__ = __UNSET__(), charge: int | __UNSET__ = __UNSET__(), aromatic: bool | __UNSET__ = __UNSET__(), stereo: Literal["S", "R"] | None | __UNSET__ = __UNSET__(), ) -> Self: """Return a copy of ``self`` with the specified attributes replaced.""" return self.__class__( name=self.name if isinstance(name, __UNSET__) else name, symbol=self.symbol if isinstance(symbol, __UNSET__) else symbol, synonyms=( self.synonyms if isinstance(synonyms, __UNSET__) else tuple(synonyms) ), leaving=self.leaving if isinstance(leaving, __UNSET__) else leaving, charge=self.charge if isinstance(charge, __UNSET__) else charge, aromatic=self.aromatic if isinstance(aromatic, __UNSET__) else aromatic, stereo=self.stereo if isinstance(stereo, __UNSET__) else stereo, )
def __repr__(self) -> str: return ( f"AtomDefinition(name={self.name!r}" + (f", synonyms={self.synonyms!r}" if self.synonyms else "") + ", ...)" ) @property def names(self) -> set[str]: """The set of the canonical name and all synonyms""" return {self.name, *self.synonyms}
[docs]@dataclass(frozen=True) class BondDefinition: """ Description of a bond in a residue from the Chemical Component Dictionary (CCD). """ atom1: str """The canonical name of the first atom in this bond""" atom2: str """The canonical name of the second atom in this bond""" order: int """The bond order of this bond (1 for single bond, 2 for double bond, etc.)""" aromatic: bool """``True`` if this bond is aromatic, ``False`` otherwise""" stereo: Literal["E", "Z"] | None """The stereochemistry of this bond."""
[docs] def flipped(self) -> Self: """The same bond, but with the atoms in the opposite order""" return dataclasses.replace(self, atom1=self.atom2, atom2=self.atom1)
[docs] @classmethod def with_defaults( cls, atom1: str, atom2: str, *, order: int = 1, aromatic: bool = False, stereo: Literal["E", "Z"] | None = None, ) -> Self: """ Construct a new bond definition, optionally using some default values. """ return cls( atom1=atom1, atom2=atom2, order=order, aromatic=aromatic, stereo=stereo, )
[docs] def replace( self, *, atom1: str | __UNSET__ = __UNSET__(), atom2: str | __UNSET__ = __UNSET__(), order: int | __UNSET__ = __UNSET__(), aromatic: bool | __UNSET__ = __UNSET__(), stereo: Literal["E", "Z"] | None | __UNSET__ = __UNSET__(), ) -> Self: """Return a copy of ``self`` with the specified attributes replaced.""" return self.__class__( atom1=self.atom1 if isinstance(atom1, __UNSET__) else atom1, atom2=self.atom2 if isinstance(atom2, __UNSET__) else atom2, order=self.order if isinstance(order, __UNSET__) else order, aromatic=(self.aromatic if isinstance(aromatic, __UNSET__) else aromatic), stereo=self.stereo if isinstance(stereo, __UNSET__) else stereo, )
[docs]@dataclass(frozen=True) class ResidueDefinition: """ Description of a residue from the Chemical Component Dictionary (CCD). """ residue_name: str | None """ The 3-letter residue code used in PDB files, or ``None`` if only used anonymously. """ description: str """A longer description of the residue""" linking_bond: BondDefinition | None """Description of how this residue may bond to its neighbours in a polymer If the residue is only found as a monomer, ``None``. Otherwise, a ``BondDefinition``. The ``atom1`` and ``atom2`` attributes give the canonical atom names of the atoms that form the linking bond. ``atom1`` is the name of the atom in the residue preceding the bond, and ``atom2`` is in the residue after the bond. Any atoms that the bond between residues replaces should be marked as ``atom.leaving=True``; there must be either zero or one leaving fragments for each atom in a linking bond for the residue definition to validate, and there must be exactly one for a bond to form. A linking bond will be formed to join two residues if and only if all of the below are true: - The residues' atom records are sequential in the PDB file - The residues have the same chain ID - There is no TER record between the residues in the PDB file - The two residues have identical ``linking_bond`` attributes - All leaving atoms associated with ``linking_bond.atom1`` are absent in the first encountered residue, and all leaving atoms associated with ``linking_bond.atom2`` are absent in the latter residue. Leaving atoms are those that have the ``AtomDefinition.leaving`` attribute set to ``True``. A leaving atom is associated with atom `a` if it is bonded to `a`, or it is bonded to a leaving atom associated with `a`. - There is at least one leaving atom associated with each linking atom The charge of linking atoms is not modified; any change in valence is accounted for via the removal of leaving atoms. """ crosslink: BondDefinition | None """Optional description of a crosslink between this residue and another. A crosslink is a bond between this residue and another. Crosslinks differ from linking bonds primarily because they may occur between residues that are do not appear sequentially in the PDB file. Crosslinks cannot occur within a residue; use a bond or residue variant instead. If the residue does not form cross links, ``None``. Otherwise, a ``BondDefinition``. The ``atom1`` and ``atom2`` attributes give the canonical atom names of the atoms that form the crosslink. ``atom1`` is the name of the atom in this residue, and ``atom2`` is in the other residue. Any atoms that the bond between residues replaces should be marked as ``atom.leaving=True``; there must be either zero or one leaving fragments for each atom in a linking bond for the residue definition to validate, and there must be exactly one for a bond to form. A crosslink will be formed between two residues if and only if all of the below are true: - The two residues' ``crosslink`` attributes are identical except that their atom names are reversed - All leaving atoms associated with each residues' ``crosslink.atom1`` attribute are absent in that residue. Leaving atoms are those that have the ``AtomDefinition.leaving`` attribute set to ``True``. A leaving atom is associated with atom `a` if it is bonded to `a`, or it is bonded to a leaving atom associated with `a`. - There is at least one leaving atom associated with each cross-linking atom The charge of linking atoms is not modified; any change in valence is accounted for via the removal of leaving atoms. """ atoms: tuple[AtomDefinition, ...] """The atom definitions that make up this residue. Atoms may be marked as `leaving` only if they are associated with a linking or crosslinking bond; see the API documentation for those fields. All atoms must have unique canonical names.""" bonds: tuple[BondDefinition, ...] """The bond definitions that make up this residue""" virtual_sites: tuple[str, ...] """Atom names that represent virtual sites. All these names must be present for the residue definition to match.""" @property def n_expected_atoms(self) -> int: """The number of atoms that a matching residue without any linkages has. In practice, the number of atoms including leaving atoms, plus the number of virtual sites. A residue with linkages can have no more than this many atoms. """ return len(self.atoms) + len(self.virtual_sites) @property def n_core_atoms(self) -> int: """The number of non-leaving atoms in the residue. Excludes virtual sites. A matching residue can have no fewer than this many atoms. """ return sum(1 for atom in self.atoms if not atom.leaving) def __repr__(self) -> str: return f"ResidueDefinition(description={self.description!r}, ...)"
[docs] def replace( self, *, residue_name: str | __UNSET__ = __UNSET__(), description: str | __UNSET__ = __UNSET__(), linking_bond: BondDefinition | None | __UNSET__ = __UNSET__(), crosslink: BondDefinition | None | __UNSET__ = __UNSET__(), atoms: Iterable[AtomDefinition] | __UNSET__ = __UNSET__(), bonds: Iterable[BondDefinition] | __UNSET__ = __UNSET__(), virtual_sites: Iterable[str] | __UNSET__ = __UNSET__(), ) -> Self: """Return a copy of ``self`` with the specified attributes replaced.""" return self.__class__( residue_name=( self.residue_name if isinstance(residue_name, __UNSET__) else residue_name ), description=( self.description if isinstance(description, __UNSET__) else description ), linking_bond=( self.linking_bond if isinstance(linking_bond, __UNSET__) else linking_bond ), crosslink=self.crosslink if isinstance(crosslink, __UNSET__) else crosslink, atoms=self.atoms if isinstance(atoms, __UNSET__) else tuple(atoms), bonds=self.bonds if isinstance(bonds, __UNSET__) else tuple(bonds), virtual_sites=( self.virtual_sites if isinstance(virtual_sites, __UNSET__) else tuple(virtual_sites) ), )
@property def is_anonymous(self) -> bool: """ ``True`` if atom names may be used for matching, ``False`` otherwise. """ return self.residue_name is None def __post_init__(self): if _residue_definition_skip_validation: return self._validate() def _validate(self): if ( self.linking_bond is None and self.crosslink is None and True in {atom.leaving for atom in self.atoms} and not self.is_anonymous ): raise ResidueValidationError( f"{self.residue_name}: Leaving atoms were specified, but there is no linking bond or crosslink", self, ) if len({atom.name for atom in self.atoms}) != len(self.atoms): raise ResidueValidationError( f"{self.residue_name}: All atoms must have unique canonical names", ) unassigned_leaving_atoms = self._unassigned_leaving_atoms() if len(unassigned_leaving_atoms) != 0 and not self.is_anonymous: raise ResidueValidationError( f"{self.residue_name}: Leaving atoms could not be assigned to a" + f" bond: {unassigned_leaving_atoms}", ) all_canonical_names = {atom.name for atom in self.atoms} all_synonyms = set(flatten(atom.synonyms for atom in self.atoms)) all_atom_names = all_canonical_names.union(all_synonyms) if not set(self.virtual_sites).isdisjoint(all_atom_names): raise ResidueValidationError( f"{self.residue_name}: Virtual sites may not clash with any atom name" + f" or synonym: {all_atom_names.intersection(self.virtual_sites)}", ) linking_atoms: list[tuple[str, int]] = [] if self.linking_bond is not None: linking_atoms.extend( [ (self.linking_bond.atom1, self.linking_bond.order), (self.linking_bond.atom2, self.linking_bond.order), ], ) if self.crosslink is not None: linking_atoms.append((self.crosslink.atom1, self.crosslink.order)) for linking_atom, bond_order in linking_atoms: bonded_atoms = [ atom for atom in self.atoms_bonded_to(linking_atom) if self.name_to_atom[atom].leaving ] if len(bonded_atoms) > 1: raise ResidueValidationError( f"{self.residue_name} ({self.description}): Linking atom" + f"{linking_atom} must be bonded to no more than 1 leaving atom", ) elif len(bonded_atoms) == 0: continue bonded_atom = unwrap(bonded_atoms) replaced_bond_order = unwrap( bond.order for bond in self.bonds if ( sort_tuple((bond.atom1, bond.atom2)) == sort_tuple((linking_atom, bonded_atom)) ) ) if bond_order != replaced_bond_order: raise ResidueValidationError( f"{self.residue_name} ({self.description}): Bond between" + f" linking atom {linking_atom} and leaving fragment" + f" beginning at atom {bonded_atom} has order" + f" {replaced_bond_order}, but the linking bond has order" + f" {bond_order}", )
[docs] @classmethod def anon_from_molecule( cls, molecule: Molecule, description: str | None = None, virtual_sites: Collection[str] = (), ) -> Self: """ Create an anoymous ``ResidueDefinition`` from an ``openff.toolkit.Molecule``. Parameters ---------- molecule The ``Molecule`` object. Leaving atoms are identified from the atom metadata; atom's whose metadata includes a truthy value for the key ``"leaving_atom"`` are marked as leaving atoms. description An optional string describing the residue. Taken from ``molecule`` ``"description"`` property if ``None``. See :py:data:`openff.pablo.ResidueDefinition.description` virtual_sites Virtual sites expected by the residue. See :py:data:`openff.pablo.ResidueDefinition.virtual_sites` """ molecule_crosslink = molecule.properties.get("crosslink") if isinstance(molecule_crosslink, BondDefinition): crosslink = molecule_crosslink else: crosslink = None molecule_linking_bond = molecule.properties.get("crosslink") if isinstance(molecule_linking_bond, BondDefinition): linking_bond = molecule_linking_bond else: linking_bond = None if description is None: molecule_description = molecule.properties.get("description") if isinstance(molecule_description, str): description = molecule_description else: description = "" atoms: list[AtomDefinition] = [] for atom in molecule.atoms: if ( # nofmt linking_bond is not None and atom.name in {linking_bond.atom1, linking_bond.atom2} ): name = atom.name else: name = str(atom.molecule_atom_index) atoms.append( AtomDefinition( name=name, synonyms=(), symbol=atom.symbol, leaving=bool(atom.metadata.get("leaving_atom")), charge=atom.formal_charge.m_as(unit.elementary_charge), # type: ignore stereo=atom.stereochemistry, aromatic=atom.is_aromatic, ), ) bonds: list[BondDefinition] = [] for bond in molecule.bonds: bonds.append( BondDefinition( atom1=str(bond.atom1.molecule_atom_index), atom2=str(bond.atom2.molecule_atom_index), order=bond.bond_order, aromatic=bond.is_aromatic, stereo=bond.stereochemistry, ), ) return cls( residue_name=None, description=description, linking_bond=linking_bond, crosslink=crosslink, atoms=tuple(atoms), bonds=tuple(bonds), virtual_sites=tuple(virtual_sites), )
[docs] @classmethod def from_molecule( cls, molecule: Molecule, residue_name: str | None = None, description: str | None = None, linking_bond: BondDefinition | None = None, crosslink: BondDefinition | None = None, virtual_sites: Collection[str] = (), ) -> Self: """ Create a ``ResidueDefinition`` from an ``openff.toolkit.Molecule``. Parameters ---------- molecule The ``Molecule`` object. Canonical names are taken from the atom names in this object. Leaving atoms are identified from the atom metadata; atom's whose metadata includes a truthy value for the key ``"leaving_atom"`` are marked as leaving atoms. Synonyms are never set. residue_name The 3-letter code used to identify the residue in a PDB file. If ``None``, takes name from atom's ``"residue_name"`` metadata entry, or raises ``ValueError`` if they do not all agree. See also :py:data:`openff.pablo.ResidueDefinition.residue_name` linking_bond Residue linking bond. May be taken from ``molecule`` ``"linking_bond"`` property if ``None``. See :py:data:`openff.pablo.ResidueDefinition.linking_bond` crosslink Residue crosslink. May be taken from ``molecule`` ``"crosslink"`` property if ``None``. See :py:data:`openff.pablo.ResidueDefinition.crosslink` virtual_sites Virtual sites expected by the residue. See :py:data:`openff.pablo.ResidueDefinition.virtual_sites` description An optional string describing the residue. Taken from ``molecule`` ``"description"`` property if ``None``. See :py:data:`openff.pablo.ResidueDefinition.description` """ if residue_name is None: atom_residue_names = { atom.metadata.get("residue_name", None) for atom in molecule.atoms } if len(atom_residue_names) == 1: atom_residue_name = unwrap(atom_residue_names) if isinstance(atom_residue_name, str): residue_name = atom_residue_name else: raise PabloError("could not infer residue name from atom metadata") else: raise PabloError("could not infer residue name from atom metadata") if crosslink is None: molecule_crosslink = molecule.properties.get("crosslink") if isinstance(molecule_crosslink, BondDefinition): crosslink = molecule_crosslink if linking_bond is None: molecule_linking_bond = molecule.properties.get("crosslink") if isinstance(molecule_linking_bond, BondDefinition): linking_bond = molecule_linking_bond if description is None: molecule_description = molecule.properties.get("description") if isinstance(molecule_description, str): description = molecule_description else: description = "" atoms: list[AtomDefinition] = [] for atom in molecule.atoms: synonyms_str = str(atom.metadata.get("synonyms", "")) atoms.append( AtomDefinition( name=atom.name, synonyms=tuple(synonyms_str.split()), symbol=atom.symbol, leaving=bool(atom.metadata.get("leaving_atom")), charge=atom.formal_charge.m_as(unit.elementary_charge), # type: ignore stereo=atom.stereochemistry, aromatic=atom.is_aromatic, ), ) bonds: list[BondDefinition] = [] for bond in molecule.bonds: bonds.append( BondDefinition( atom1=bond.atom1.name, atom2=bond.atom2.name, order=bond.bond_order, aromatic=bond.is_aromatic, stereo=bond.stereochemistry, ), ) return cls( residue_name=residue_name, description=description, linking_bond=linking_bond, crosslink=crosslink, atoms=tuple(atoms), bonds=tuple(bonds), virtual_sites=tuple(virtual_sites), )
[docs] @classmethod def from_capped_molecule( cls, molecule: Molecule, residue_name: str, leaving_atom_indices: Collection[int], linking_bond: BondDefinition, crosslink: BondDefinition | None = None, virtual_sites: Collection[str] = (), description: str | None = None, ) -> Self: """ Create a linking ``ResidueDefinition`` from an ``openff.toolkit.Molecule`` Parameters ---------- residue_name The 3-letter code used to identify the residue in a PDB file. See :py:data:`openff.pablo.ResidueDefinition.residue_name` molecule The ``Molecule`` object. Canonical names are taken from the atom names in this object. Synonyms are never set. leaving_atom_indices Indices of atoms within the ``molecule`` argument that should be marked as leaving atoms. See :py:data:`openff.pablo.residue.AtomDefinition.leaving` linking_bond The bond linking this residue to its neighbours in a polymer. See :py:data:`openff.pablo.ResidueDefinition.linking_bond` crosslink See :py:data:`openff.pablo.ResidueDefinition.crosslink` description An optional string describing the residue. See :py:data:`openff.pablo.ResidueDefinition.description` virtual_sites Virtual sites expected by the residue. See :py:data:`openff.pablo.ResidueDefinition.virtual_sites` """ molecule = deepcopy(molecule) for i in leaving_atom_indices: molecule.atom(i).metadata["leaving_atom"] = True return cls.from_molecule( residue_name=residue_name, molecule=molecule, linking_bond=linking_bond, crosslink=crosslink, description=description, virtual_sites=virtual_sites, )
[docs] @classmethod def anon_from_sdf( cls, file: str | Path | TextIO, description: str | None = None, ) -> Self: """ Create an anonymous residue definition from an SDF file of a single molecule """ mol = Molecule.from_file( file, file_format="SDF", toolkit_registry=RDKitToolkitWrapper(), ) if not isinstance(mol, Molecule): mol = unwrap(mol) if description is None: description = str(file) if len(description) > 30: description = description[:3] + "..." + description[-23:] return cls.anon_from_molecule(mol, description=description)
[docs] @classmethod def anon_from_smiles( cls, smiles: str, leaving_atoms: Collection[int] = (), virtual_sites: Collection[str] = (), description: str | None = None, ) -> Self: """ Create an anonymous ``ResidueDefinition`` from a SMILES string. Parameters ---------- smiles The SMILES string. leaving_atoms SMILES string mapping numbers for atoms that should be marked as leaving atoms. description An optional string describing the residue. See :py:data:`openff.pablo.ResidueDefinition.description` virtual_sites Virtual sites expected by the residue. See :py:data:`openff.pablo.ResidueDefinition.virtual_sites` """ molecule = Molecule.from_smiles( smiles, allow_undefined_stereo=True, ) leaving_atom_indices = {i - 1 for i in leaving_atoms} for i, atom in enumerate(molecule.atoms): if i in leaving_atom_indices: atom.metadata["leaving_atom"] = True return cls.anon_from_molecule( molecule=molecule, description=smiles if description is None else description, virtual_sites=virtual_sites, )
[docs] @classmethod def anon_from_smiles_marked_leaving( cls, smiles: str, *, virtual_sites: Collection[str] = (), description: str | None = None, ) -> Self: """ Create an anonymous ``ResidueDefinition`` from a partially mapped SMILES. The mapped atoms are treated as leaving atoms. Parameters ---------- smiles The SMILES string. Leaving atoms should be mapped. All other atoms must not be. virtual_sites Virtual sites expected by the residue. See :py:data:`openff.pablo.ResidueDefinition.virtual_sites` description An optional string describing the residue. See :py:data:`openff.pablo.ResidueDefinition.description` """ molecule = Molecule.from_smiles( smiles, allow_undefined_stereo=True, ) neighbours: DefaultDict[int, set[int]] = defaultdict(set) for atom1, atom2 in molecule.nth_degree_neighbors(1): neighbours[atom1.molecule_atom_index].add(atom2.molecule_atom_index) neighbours[atom2.molecule_atom_index].add(atom1.molecule_atom_index) for i in molecule.properties["atom_map"]: atom = molecule.atom(i) atom.metadata["leaving_atom"] = True return cls.anon_from_molecule( molecule=molecule, description=smiles if description is None else description, virtual_sites=virtual_sites, )
[docs] @classmethod def from_smiles( cls, mapped_smiles: str, atom_names: Mapping[int, str], residue_name: str, leaving_atoms: Collection[int] = (), linking_bond: BondDefinition | None = None, crosslink: BondDefinition | None = None, virtual_sites: Collection[str] = (), description: str | None = None, ) -> Self: """ Create a ``ResidueDefinition`` from a mapped SMILES string. Parameters ---------- residue_name The 3-letter code used to identify the residue in a PDB file. See :py:data:`openff.pablo.ResidueDefinition.residue_name` mapped_smiles The SMILES string. All atoms must be explicitly included with contiguous mapping numbers starting at 1. atom_names Mapping from SMILES string mapping numbers to the canonical atom name. Note that this refers to numbers in the actual SMILES string, and so keys should be contiguous integers starting at 1. Atom names must be unique. leaving_atoms SMILES string mapping numbers for atoms that should be marked as leaving atoms. linking_bond The bond linking this residue to its neighbours in a polymer. See :py:data:`openff.pablo.ResidueDefinition.linking_bond` crosslink See :py:data:`openff.pablo.ResidueDefinition.crosslink` description An optional string describing the residue. See :py:data:`openff.pablo.ResidueDefinition.description` virtual_sites Virtual sites expected by the residue. See :py:data:`openff.pablo.ResidueDefinition.virtual_sites` """ molecule = Molecule.from_mapped_smiles( mapped_smiles, allow_undefined_stereo=True, ) leaving_atom_indices = {i - 1 for i in leaving_atoms} index_to_atom_name = {i - 1: name for i, name in atom_names.items()} if len(index_to_atom_name) != molecule.n_atoms: raise PabloError("Should be an atom name for each atom in SMILES string") if set(index_to_atom_name.keys()) != set(range(molecule.n_atoms)): raise PabloError( "Keys of atom_names should be contiguous integers starting at 1", ) for i, atom in enumerate(molecule.atoms): if i in leaving_atom_indices: atom.metadata["leaving_atom"] = True atom.name = index_to_atom_name[i] return cls.from_molecule( residue_name=residue_name, molecule=molecule, linking_bond=linking_bond, description=mapped_smiles if description is None else description, crosslink=crosslink, virtual_sites=virtual_sites, )
[docs] def to_openff_molecule(self) -> Molecule: """Create a ``Molecule`` representing the complete, unlinked residue.""" molecule = Molecule() atoms: dict[str, int] = {} for atom in self.atoms: atoms[atom.name] = molecule.add_atom( atomic_number=elements.NUMBERS[atom.symbol], formal_charge=atom.charge, is_aromatic=atom.aromatic, stereochemistry=atom.stereo, name=atom.name, metadata={ "residue_name": ( "" if self.residue_name is None else self.residue_name ), "leaving_atom": atom.leaving, "substructure_atom": not atom.leaving, "synonyms": " ".join(atom.synonyms), }, ) for bond in self.bonds: molecule.add_bond( atom1=atoms[bond.atom1], atom2=atoms[bond.atom2], bond_order=bond.order, is_aromatic=bond.aromatic, stereochemistry=bond.stereo, ) molecule.properties.update( { "linking_bond": self.linking_bond, "crosslink": self.crosslink, "description": self.description, }, ) return molecule
@cached_property def canonical_name_to_atom(self) -> dict[str, AtomDefinition]: """Map from each atoms' name and synonyms to the atom definition.""" return {atom.name: atom for atom in self.atoms} @cached_property def name_to_atom(self) -> dict[str, AtomDefinition]: """Map from each atoms' name and synonyms to the atom definition.""" mapping = self.canonical_name_to_atom canonical_names = set(mapping) for atom in self.atoms: for synonym in atom.synonyms: if synonym in mapping and mapping[synonym] != atom: raise PabloError( f"synonym {synonym} degenerately defined for canonical" + f" names {mapping[synonym].name} and {atom.name} in" + f" residue {self.residue_name}", ) if synonym in canonical_names: raise PabloError( f"synonym {synonym} of atom {atom.name} clashes with" + f" another canonical name in residue {self.residue_name}", ) mapping[synonym] = atom return mapping
[docs] def atoms_bonded_to(self, atom_name: str) -> Iterator[str]: """ Iterator over atoms bonded to the one with the given canonical name. """ yield from (bond.atom2 for bond in self.bonds_to(atom_name))
[docs] def bonds_to(self, atom_name: str) -> Iterator[BondDefinition]: """ All bonds to the given atom, ordered with ``atom1`` as the given atom. """ for bond in self.bonds: if bond.atom1 == atom_name: yield bond if bond.atom2 == atom_name: yield bond.flipped()
def _leaving_fragment_of(self, linking_atom: str) -> Iterator[str]: atoms_to_check = list(self.atoms_bonded_to(linking_atom)) checked_atoms: set[str] = set() while atoms_to_check: atom_name = atoms_to_check.pop() if self.name_to_atom[atom_name].leaving: yield atom_name atoms_to_check.extend( filter( lambda x: x not in checked_atoms, self.atoms_bonded_to(atom_name), ), ) checked_atoms.add(atom_name) if ( linking_atom in self.canonical_name_to_atom and self.canonical_name_to_atom[linking_atom].leaving and linking_atom not in checked_atoms ): yield linking_atom @cached_property def _posterior_bond_leaving_atoms(self) -> set[str]: return ( set() if self.linking_bond is None else set(self._leaving_fragment_of(self._posterior_bond_linking_atom)) ) @cached_property def _prior_bond_leaving_atoms(self) -> set[str]: return ( set() if self.linking_bond is None else set(self._leaving_fragment_of(self._prior_bond_linking_atom)) ) @cached_property def _crosslink_leaving_atoms(self) -> set[str]: return ( set() if self.crosslink is None else set(self._leaving_fragment_of(self.crosslink.atom1)) ) @property def _prior_bond_linking_atom(self) -> str: if self.linking_bond is None: raise PabloError("not a linking residue") return self.linking_bond.atom2 @property def _posterior_bond_linking_atom(self) -> str: if self.linking_bond is None: raise PabloError("not a linking residue") return self.linking_bond.atom1 @property def _can_form_prior_bond(self) -> bool: return ( self.linking_bond is not None and self._prior_bond_linking_atom in self.canonical_name_to_atom and len(self._prior_bond_leaving_atoms) > 0 ) @property def _can_form_posterior_bond(self) -> bool: return ( self.linking_bond is not None and self._posterior_bond_linking_atom in self.canonical_name_to_atom and len(self._posterior_bond_leaving_atoms) > 0 ) @property def _can_form_crosslink(self) -> bool: return ( self.crosslink is not None and self.crosslink.atom1 in self.canonical_name_to_atom and len(self._crosslink_leaving_atoms) > 0 ) def _is_isomorphic_to(self, other: Self) -> bool: """Atom and bond ordering insensitive equality test""" return all( [ other.linking_bond == self.linking_bond, other.crosslink == self.crosslink, set(other.atoms) == set(self.atoms), set(other.bonds) == set(self.bonds), ], )
[docs] def deprotonated_at( self, name: str, ) -> Self: """Return a copy of ``self`` with proton ``name`` abstracted. The hydrogen with canonical name ``name`` is removed, and the formal charge of the atom bonded to it is decremented. If the named atom is not hydrogen, is missing, or is bonded to a number of other atoms other than 1, ``PabloError`` is raised. """ try: atom = self.canonical_name_to_atom[name] except KeyError: raise PabloError(f"Cannot deprotonate missing atom {name}") if atom.symbol != "H": raise PabloError( f"Cannot deprotonate non-hydrogen atom {name}: {atom.symbol=}", ) neighbours = list(self.atoms_bonded_to(name)) if len(neighbours) != 1: raise PabloError( f"Cannot deprotonate atom {name} bonded to {len(neighbours)} other atoms", ) neighbour = neighbours[0] return self.replace( bonds=[bond for bond in self.bonds if name not in [bond.atom1, bond.atom2]], atoms=[ ( atom.replace(charge=atom.charge - 1) if atom.name == neighbour else atom ) for atom in self.atoms if atom.name != name ], description=self.description + f" -{name}", )
[docs] def protonated_at( self, heavy_atom_name: str, proton_name: str, ignore_synonym_clashes: bool = False, ) -> Self: """Return a copy of ``self`` with ``heavy_atom_name`` protonated by ``proton_name``. The formal charge of the atom with canonical name ``heavy_atom_name`` is incremented, a new hydrogen atom named ``proton_name`` is added, and a bond is created between the two named atoms. If the heavy atom is missing or the new proton name clashes with an existing name or synonym, ``PabloError`` is raised. The new atom has the same value for the ``leaving`` attribute as the heavy atom. The synonym clash check may be suppressed with the ``ignore_synonym_clashes`` argument, but the canonical names are always checked. """ try: heavy_atom = self.canonical_name_to_atom[heavy_atom_name] except KeyError: raise PabloError(f"Cannot protonate missing atom {heavy_atom_name}") if any( [ (ignore_synonym_clashes and proton_name in self.canonical_name_to_atom), (not ignore_synonym_clashes and proton_name in self.name_to_atom), ], ): raise PabloError( f"name {proton_name} clashes with existing name in residue", ) return self.replace( bonds=[ *self.bonds, BondDefinition.with_defaults(heavy_atom_name, proton_name), ], atoms=[ *( ( atom.replace(charge=atom.charge + 1) if atom.name == heavy_atom_name else atom ) for atom in self.atoms ), AtomDefinition.with_defaults( proton_name, "H", leaving=heavy_atom.leaving, ), ], description=self.description + f" +{proton_name}", )
[docs] def vary_protonation( self, *, acidic: Iterable[str] = (), # Each element specifies an atom name to remove, decrementing the formal # charge on the neighbouring heavy atom. Multiply bonded or non-hydrogen # atoms raise an error. basic: Iterable[tuple[str, str]] = (), # Each tuple specifies an atom name to protonate (increment the formal # charge and form a bond) and the name of the added proton ignore_synonym_clashes: bool = False, skip_errors: bool = False, ) -> list[Self]: """ Compute all combinations of the specified protonation variants The first element of the returned list is guaranteed to be the unmodified starting residue definition; to get a list of just the new variants, use ``resdef.vary_protonation(...)[1:]``. Note that all combinations of protonations and deprotonations are generated; this means that if ``acidic`` has length ``n`` and ``basic`` has length ``m``, ``2**(n+m)`` variants will be generated. Parameters ========== acidic Each element specifies an atom name to remove, decrementing the formal charge on the neighbouring heavy atom. Multiply bonded, unbonded, missing or non-hydrogen atoms raise ``PabloError`` unless ``skip_errors`` is ``True``. basic Each tuple specifies an atom name to protonate (increment the formal charge and form a bond) and the name of the added proton. Missing heavy atoms or new atom names that clash with existing names raise ``PabloError`` unless ``skip_errors`` is ``True``. ignore_synonym_clashes If set to ``True``, protons added by the ``basic`` argument may have names that clash with the synonyms of other atoms. This can be useful in the early stages of a multi-step residue definition patching process. Added protons may never have names that clash with the canonical names of other atoms. skip_errors If set to ``True``, missing, clashing, or improperly bonded atoms are skipped rather than raising ``PabloError``. """ variants = [self] for proton_to_remove in acidic: for variant in list(variants): try: variants.append( variant.deprotonated_at( proton_to_remove, ), ) except PabloError as err: if skip_errors: continue else: raise err from None for atom_to_protonate, name_of_proton in basic: for variant in list(variants): try: variants.append( variant.protonated_at( atom_to_protonate, name_of_proton, ignore_synonym_clashes=ignore_synonym_clashes, ), ) except PabloError as err: if skip_errors: continue else: raise err from None return variants
[docs] def with_synonyms(self, synonyms: Mapping[str, Sequence[str]]) -> Self: """Return a copy of self with the given synonyms added ``synonyms`` is a mapping from canonical atom names to its additional synonyms. Existing synonyms are preserved. """ return self.replace( atoms=( atom.replace( synonyms={ *atom.synonyms, *synonyms.get( atom.name, [], ), }, ) for atom in self.atoms ), )
def _missing_atoms_are_valid_leaving_atoms( self, missing_atom_names: Collection[str], ) -> bool: """ ``True`` if the missing atom names form a valid set of leaving atoms, ``False`` if not. A valid set must all be marked as leaving atoms and must all be assignable to a linking or crosslinking bond. """ missing_atom_names = set(missing_atom_names) return ( ( missing_atom_names.issuperset( self._prior_bond_leaving_atoms, ) or missing_atom_names.isdisjoint( self._prior_bond_leaving_atoms, ) ) and ( missing_atom_names.issuperset( self._posterior_bond_leaving_atoms, ) or missing_atom_names.isdisjoint( self._posterior_bond_leaving_atoms, ) ) and ( missing_atom_names.issuperset( self._crosslink_leaving_atoms, ) or missing_atom_names.isdisjoint( self._crosslink_leaving_atoms, ) ) ) def _possible_prior_bond_names_gen(self) -> Iterator[tuple[str, str]]: if ( self.linking_bond is None or self._prior_bond_linking_atom not in self.canonical_name_to_atom ): return linking_atom = self.canonical_name_to_atom[self._prior_bond_linking_atom] for linking_name in (linking_atom.name, *linking_atom.synonyms): yield (self._posterior_bond_linking_atom, linking_name) @cached_property def _possible_prior_bond_names(self) -> set[tuple[str, str]]: return set(self._possible_prior_bond_names_gen()) def _possible_posterior_bond_names_gen(self) -> Iterator[tuple[str, str]]: if ( self.linking_bond is None or self._posterior_bond_linking_atom not in self.canonical_name_to_atom ): return linking_atom = self.canonical_name_to_atom[self._posterior_bond_linking_atom] for linking_name in (linking_atom.name, *linking_atom.synonyms): yield (linking_name, self._prior_bond_linking_atom) @cached_property def _possible_posterior_bond_names(self) -> set[tuple[str, str]]: return set(self._possible_posterior_bond_names_gen()) def _possible_crosslink_bond_names_gen(self) -> Iterator[tuple[str, str]]: if ( self.crosslink is None or self.crosslink.atom1 not in self.canonical_name_to_atom ): return linking_atom = self.canonical_name_to_atom[self.crosslink.atom1] for linking_name in (linking_atom.name, *linking_atom.synonyms): yield (linking_name, self.crosslink.atom2) @cached_property def _possible_crosslink_bond_names(self) -> set[tuple[str, str]]: return set(self._possible_crosslink_bond_names_gen()) def _unassigned_leaving_atoms(self) -> set[str]: all_leaving_atoms = {atom.name for atom in self.atoms if atom.leaving} assigned_leaving_atoms = ( self._prior_bond_leaving_atoms | self._posterior_bond_leaving_atoms | self._crosslink_leaving_atoms ) return all_leaving_atoms.difference(assigned_leaving_atoms) def _unassigned_leaving_atom_fragments(self) -> Iterator[tuple[str, ...]]: yield from { tuple(sorted(set(self._leaving_fragment_of(name)))) for name in self._unassigned_leaving_atoms() } def _leaving_atom_fragments(self) -> Iterator[tuple[str, ...]]: yield from { tuple(sorted(set(self._leaving_fragment_of(atom.name)))) for atom in self.atoms if atom.leaving } def _to_core_graph(self) -> Graph[AtomDefinition, BondDefinition]: graph: Graph[AtomDefinition, BondDefinition] = Graph( node_count_hint=self.n_core_atoms, edge_count_hint=len(self.bonds), ) graph.add_nodes_from(atom for atom in self.atoms if not atom.leaving) for bond in self.bonds: atom1 = self.canonical_name_to_atom[bond.atom1] atom2 = self.canonical_name_to_atom[bond.atom2] if not (atom1.leaving or atom2.leaving): graph.add_edge(atom1, atom2, bond) return graph def _to_graphs(self) -> Iterable[Graph[AtomDefinition, BondDefinition]]: """ Construct graphs of all possible arrangements of matching atoms. Yields one graph per combination of crosslink, posterior bond, and prior bond, and combination of unassigned leaving atoms. Atoms in the final graphs that are not a part of the substructure, ie. non-physical "leaving" atoms, are represented with element symbol ``""``. """ for crosslink in {self.crosslink is not None, False}: for posterior_bond in {self.linking_bond is not None, False}: for prior_bond in {self.linking_bond is not None, False}: leaving_atom_fragments = [ *self._unassigned_leaving_atom_fragments(), ] for leaving_atoms in flatten( map( lambda x: tuple(flatten(x)), itertools.combinations(leaving_atom_fragments, i), ) for i in range(len(leaving_atom_fragments) + 1) ): logging.debug( f"Generating graph for {self.description}:" + f" {crosslink=} {posterior_bond=} {prior_bond=} {leaving_atoms=}", ) yield self._to_graph( crosslinked=crosslink, posterior_bonded=posterior_bond, prior_bonded=prior_bond, left_atoms=leaving_atoms, ) def _to_graph( self, crosslinked: bool = False, prior_bonded: bool = False, posterior_bonded: bool = False, left_atoms: Sequence[str] = (), ) -> Graph[AtomDefinition, BondDefinition]: graph = self._to_core_graph() restore_leaving_atoms: set[str] = ( # nofmt self._unassigned_leaving_atoms() - set(left_atoms) ) # Add the non-matching (left) leaving atoms that bond to core atoms # along with their corresponding bonds for name in left_atoms: atom = self.name_to_atom[name] assert atom.leaving linking_bonds = [ bond for bond in self.bonds_to(name) if not self.name_to_atom[bond.atom2].leaving ] if len(linking_bonds) == 0: continue target_name = f"NON_MATCHING_ATOM_{name}" link_target = AtomDefinition.with_defaults(target_name, "", leaving=True) graph.add_node(link_target) graph.add_edges_from( ( link_target, self.name_to_atom[bond.atom2], bond.replace(atom1=target_name), ) for bond in linking_bonds ) crosslink = self.crosslink if crosslinked else None prior_bond = self.linking_bond if prior_bonded else None posterior_bond = self.linking_bond if posterior_bonded else None if crosslink is not None: crosslink_target = AtomDefinition.with_defaults( crosslink.atom2, "", leaving=True, ) graph.add_node(crosslink_target) graph.add_edge( self.canonical_name_to_atom[crosslink.atom1], crosslink_target, crosslink, ) else: restore_leaving_atoms.update(self._crosslink_leaving_atoms) if posterior_bond is not None: posterior_target = AtomDefinition.with_defaults( posterior_bond.atom2, "", leaving=True, ) graph.add_node(posterior_target) graph.add_edge( self.canonical_name_to_atom[posterior_bond.atom1], posterior_target, posterior_bond, ) else: restore_leaving_atoms.update(self._posterior_bond_leaving_atoms) if prior_bond is not None: prior_target = AtomDefinition.with_defaults( prior_bond.atom1, "", leaving=True, ) graph.add_node(prior_target) graph.add_edge( self.canonical_name_to_atom[prior_bond.atom2], prior_target, prior_bond, ) else: restore_leaving_atoms.update(self._prior_bond_leaving_atoms) added_nodes = {atom.name: atom for atom in graph.nodes} for leaving_name in restore_leaving_atoms: try: bond = unwrap( bond for bond in self.bonds if (bond.atom1 in added_nodes and bond.atom2 == leaving_name) or (bond.atom2 in added_nodes and bond.atom1 == leaving_name) ) except ValueError: continue leaving_atom = self.canonical_name_to_atom[leaving_name].replace( leaving=False, ) graph.add_node(leaving_atom) assert leaving_name not in added_nodes added_nodes[leaving_name] = leaving_atom if bond.atom1 == leaving_name: graph.add_edge( leaving_atom, added_nodes[bond.atom2], bond, ) else: graph.add_edge( added_nodes[bond.atom1], leaving_atom, bond, ) assert graph.is_connected() return graph
[docs] def visualize( self, *, width: int = -1, height: int = 300, highlight: Iterable[str] = (), ) -> "IPython.core.display.SVG": from IPython.display import SVG offmol = self.to_openff_molecule() highlight = set(highlight) highlight_atoms = [ i for i, atom in enumerate(offmol.atoms) if atom.name in highlight ] deemphasize_atoms = [i for i, atom in enumerate(self.atoms) if atom.leaving] legend_elements: list[str] = [] if self.is_anonymous: atom_notes = {} legend_elements.append("Anonymous Residue Definition") if len(deemphasize_atoms) != 0: legend_elements.append("(leaving atoms drawn in light grey)") else: atom_notes = { i: f"{'|'.join([atom.name, *atom.synonyms])}{'*' if atom.leaving else ''}" for i, atom in enumerate(self.atoms) } if self.description: legend_elements.append(f"{self.residue_name}: {self.description}") else: legend_elements.append(f"{self.residue_name}") if len(deemphasize_atoms) != 0: legend_elements.append("(*leaving atoms, drawn in light grey)") bond_notes: dict[tuple[int, int], str] = {} linkages: list[tuple[str, str, str]] = [] if self._can_form_posterior_bond: linkages.append( ( "posterior", self._posterior_bond_linking_atom, self._prior_bond_linking_atom, ), ) if self._can_form_prior_bond: linkages.append( ( "prior", self._prior_bond_linking_atom, self._posterior_bond_linking_atom, ), ) if self.crosslink is not None and self._can_form_crosslink: linkages.append(("crosslink", self.crosslink.atom1, self.crosslink.atom2)) for linkage, linking_atom, partner_atom in linkages: bond = unwrap( bond for bond in self.bonds if ( bond.atom1 == linking_atom and self.canonical_name_to_atom[bond.atom2].leaving ) or ( bond.atom2 == linking_atom and self.canonical_name_to_atom[bond.atom1].leaving ) ) atom1_idx = unwrap( i for i, atom in enumerate(offmol.atoms) if atom.name == bond.atom1 ) atom2_idx = unwrap( i for i, atom in enumerate(offmol.atoms) if atom.name == bond.atom2 ) bond_notes[atom1_idx, atom2_idx] = f"{linkage} to '{partner_atom}'" svg_content = draw_molecule( offmol, width=width, height=height, atom_notes=atom_notes, bond_notes=bond_notes, deemphasize_atoms=deemphasize_atoms or None, legend=" ".join(legend_elements), highlight_atoms=highlight_atoms or None, ) return SVG(data=svg_content)
[docs] @staticmethod def react( reactants: Sequence["ResidueDefinition"], reactant_smarts: Sequence[str], product_smarts: Sequence[str], product_residue_names: Sequence[str | None] | None = None, product_descriptions: Sequence[str | None] | None = None, product_linking_bonds: Sequence[BondDefinition | None] | None = None, product_crosslinks: Sequence[BondDefinition | None] | None = None, product_virtual_sites: Sequence[Collection[str]] | None = None, ) -> list[tuple["ResidueDefinition", ...]]: """ Perform a SMARTS reaction on some residue definitions Parameters ========== reactants The residue definitions involved in the reaction. Names and synonyms for the products are taken from these. Chemical information is also derived from these, but may be modified by the reaction. reactant_smarts Mapped SMARTS patterns identifying atoms in the reactants that will be involved in the reaction. There may be more or fewer of these than reactants to allow for complex and intramolecular reactions. product_smarts Mapped SMARTS patterns describing any altered chemistry in the products. All atom mappings in the reactant smarts must be present in a product SMARTS. Each product SMARTS corresponds to a single product ``ResidueDefinition``. product_residue_names The residue names of the products. One for each product SMARTS. May be ``None`` or omitted, in which case the product is anonymous --- note that this means atom names in the reactants are discarded. If there are fewer residue names than product SMARTS, any unnamed products are made anonymous. product_descriptions The descriptions of the products. If there are fewer descriptions than product SMARTS, the given descriptions are assigned to the first products. product_linking_bonds The linking bonds of the products. If there are fewer linking bonds than product SMARTS, the given linking bonds are assigned to the first products. Must be ``None`` or omitted if the product is anonymous. product_crosslinks The crosslinks of the products. If there are fewer crosslinks than product SMARTS, the given crosslinks are assigned to the first products. Must be ``None`` or omitted if the product is anonymous. product_virtual_sites The virtual sites of the products. If there are fewer virtual sites than product SMARTS, the given virtual sites are assigned to the first products. """ def validate_product_arg[T, U]( argname: str, argvalue: Sequence[T | U] | None, nonevalue: U, product_value_forbidden: Sequence[bool] | None = None, ) -> Sequence[T | U]: if argvalue is None: argvalue = [nonevalue] * len(product_smarts) elif len(argvalue) < len(product_smarts): argvalue = list(argvalue) + [nonevalue] * ( len(product_smarts) - len(argvalue) ) elif len(argvalue) > len(product_smarts): raise PabloError( f"too many {argname}: expected at most {len(product_smarts)}", ) if product_value_forbidden is not None: for i, (forbidden, value) in enumerate( zip(product_value_forbidden, argvalue), ): if forbidden and value is not nonevalue: raise PabloError( f"{argname}[{i}] is {value} but must be {nonevalue}" + " for anonymous product", ) return argvalue product_residue_names = validate_product_arg( "product_residue_names", product_residue_names, None, ) product_is_anon = [name is None for name in product_residue_names] product_descriptions = validate_product_arg( "product_descriptions", product_descriptions, None, ) product_linking_bonds = validate_product_arg( "product_linking_bonds", product_linking_bonds, None, product_value_forbidden=product_is_anon, ) product_crosslinks = validate_product_arg( "product_crosslinks", product_crosslinks, None, product_value_forbidden=product_is_anon, ) product_virtual_sites = validate_product_arg( "product_virtual_sites", product_virtual_sites, (), ) reactant_offmols = [resdef.to_openff_molecule() for resdef in reactants] reaction = ".".join(reactant_smarts) + ">>" + ".".join(product_smarts) all_products = list(react(reactant_offmols, reaction)) for products in all_products: if len(products) != len(product_smarts): raise PabloError( f"expected {len(product_smarts)} products, got {len(products)}:" + " each product_smarts should include only one fragment", ) return [ tuple( ResidueDefinition.from_molecule( offmol, residue_name=product_residue_names[i], description=product_descriptions[i], linking_bond=product_linking_bonds[i], crosslink=product_crosslinks[i], virtual_sites=product_virtual_sites[i], ) if product_residue_names[i] is not None else ResidueDefinition.anon_from_molecule( offmol, description=product_descriptions[i], virtual_sites=product_virtual_sites[i], ) for i, offmol in enumerate(products) ) for products in all_products ]