Source code for openff.pablo.ccd.patches

"""
Patches to add essential features to the CCD.
"""

from openff.pablo.chem import DISULFIDE_BOND
from openff.pablo.exceptions import PabloError

from .._utils import unwrap
from ..residue import (
    AtomDefinition,
    BondDefinition,
    ResidueDefinition,
)
from ._ccdcache import PEPTIDE_BOND

__all__ = [
    "ACIDIC_PROTONS",
    "BASIC_ATOMS",
    "ATOM_NAME_SYNONYMS",
    "fix_caps",
    "add_protonation_variants",
    "add_synonyms",
    "disambiguate_alt_ids",
    "add_disulfide_crosslink",
    "add_dephosphorylated_5p_terminus",
    "set_hop3_leaving",
    "patch_his_sidechain_zwitterion",
    "delete_doubly_deprotonated_arginine",
    "add_nh2_leaving_atom",
    "strip_linkless_leavers",
]


ACIDIC_PROTONS: dict[str, list[str]] = {
    "ALA": ["HXT", "H2"],
    "ARG": ["HXT", "H2", "HH12", "HH22"],
    "ASN": ["HXT", "H2"],
    "ASP": ["HXT", "H2", "HD2"],
    "CYS": ["HXT", "H2", "HG"],
    "GLN": ["HXT", "H2"],
    "GLU": ["HXT", "H2", "HE2"],
    "GLY": ["HXT", "H2"],
    # TODO: Special case HIS so that the neutral form is not a zwitterion
    "HIS": ["HXT", "H2", "HD1", "HE2"],
    "ILE": ["HXT", "H2"],
    "LEU": ["HXT", "H2"],
    "LYS": ["HXT", "H2", "HZ3"],
    "MET": ["HXT", "H2"],
    "PHE": ["HXT", "H2"],
    "PRO": ["HXT"],
    "SER": ["HXT", "H2", "HG"],
    "THR": ["HXT", "H2", "HG1"],
    "TRP": ["HXT", "H2", "HE1"],
    "TYR": ["HXT", "H2", "HH"],
    "VAL": ["HXT", "H2"],
    "DG": ["HOP2", "HOP3", "HO3'"],
    "DA": ["HOP2", "HOP3", "HO3'"],
    "DT": ["HOP2", "HOP3", "HO3'"],
    "DC": ["HOP2", "HOP3", "HO3'"],
    "G": ["HOP2", "HOP3", "HO3'"],
    "A": ["HOP2", "HOP3", "HO3'"],
    "U": ["HOP2", "HOP3", "HO3'"],
    "C": ["HOP2", "HOP3", "HO3'"],
}
"""Map from residue name to a list of atom names of abstractable hydrogens.

Each 3-tuple specifies an atom name to remove. This atom must have exactly one
bond. A variant residue definition is created with that atom and bond removed,
and the formal charge of the bonded atom reduced by one.

Note that all combinations of deprotonations are generated; this means a residue
with ``n`` abstractable hydrogens will have ``2**n`` variants."""


BASIC_ATOMS: dict[str, list[tuple[str, str]]] = {
    "ALA": [("N", "H3")],
    "ARG": [("N", "H3")],
    "ASN": [("N", "H3")],
    "ASP": [("N", "H3")],
    "CYS": [("N", "H3")],
    "GLN": [("N", "H3")],
    "GLU": [("N", "H3")],
    "GLY": [("N", "H3")],
    "HIS": [("N", "H3")],
    "ILE": [("N", "H3")],
    "LEU": [("N", "H3")],
    "LYS": [("N", "H3")],
    "MET": [("N", "H3")],
    "PHE": [("N", "H3")],
    "PRO": [],
    "SER": [("N", "H3")],
    "THR": [("N", "H3")],
    "TRP": [("N", "H3")],
    "TYR": [("N", "H3")],
    "VAL": [("N", "H3")],
}
"""Protonation variants that add an atom to the CCD.

Each 3-tuple specifies an atom name to protonate and the name of the added
proton. A variant residue definition is created with that atom and bond added,
and the formal charge of the atom increased by one."""

ATOM_NAME_SYNONYMS = {
    "NME": {
        "HN2": ["H"],
        "C": ["CH3"],
        "H1": ["1HH3"],
        "H2": ["2HH3"],
        "H3": ["3HH3"],
    },
    "ACE": {
        "H1": ["1HH3"],
        "H2": ["2HH3"],
        "H3": ["3HH3"],
    },
    "NA": {"NA": ["Na"]},
    "CL": {"CL": ["Cl"]},
    "ALA": {"H": ["H1"]},
    "ARG": {"H": ["H1"]},
    "ASN": {"H": ["H1"]},
    "ASP": {"H": ["H1"]},
    "CYS": {"H": ["H1"], "HB2": ["3HB"]},
    "GLN": {"H": ["H1"]},
    "GLU": {"H": ["H1"]},
    "GLY": {"H": ["H1"]},
    "HIS": {"H": ["H1"]},
    "ILE": {"H": ["H1"]},
    "LEU": {"H": ["H1"]},
    "LYS": {"H": ["H1"]},
    "MET": {"H": ["H1"]},
    "PHE": {"H": ["H1"]},
    "SER": {"H": ["H1"], "HB2": ["3HB"]},
    "THR": {"H": ["H1"]},
    "TRP": {"H": ["H1"]},
    "TYR": {"H": ["H1"]},
    "VAL": {"H": ["H1"]},
}
"""Map from residue name and then canonical atom name to a list of synonyms"""


[docs]def fix_caps(res: ResidueDefinition) -> list[ResidueDefinition]: """ Fix ``"NON-POLYMER"`` residues so they can be used as caps for peptides. """ return [ res.replace( linking_bond=PEPTIDE_BOND, atoms=[ atom.replace( leaving=True if atom.name == "H" else atom.leaving, ) for atom in res.atoms ], ), ]
[docs]def add_protonation_variants(res: ResidueDefinition) -> list[ResidueDefinition]: """ Add protonation variants from the ACIDIC_PROTONS and BASIC_ATOMS constants. Note that all combinations of protonations and deprotonations are generated; this means a residue with ``n`` abstractable hydrogens and ``m`` acidic atoms will have ``2**(n+m)`` variants. """ assert res.residue_name is not None return res.vary_protonation( acidic=ACIDIC_PROTONS.get(res.residue_name, []), basic=BASIC_ATOMS.get(res.residue_name, []), ignore_synonym_clashes=True, )
[docs]def add_synonyms(res: ResidueDefinition) -> list[ResidueDefinition]: """ Patch a residue definition to include synonyms from :py:data:`ATOM_NAME_SYNONYMS`. """ assert res.residue_name is not None return [res.with_synonyms(ATOM_NAME_SYNONYMS.get(res.residue_name, {}))]
[docs]def disambiguate_alt_ids(res: ResidueDefinition) -> list[ResidueDefinition]: """ CCD patch: put alt atom ids in their own residue definitions if needed This patch should be run before other patches that add synonyms, as it assumes that there is at most one synonym (from the CCD alt id flag). Some CCD residues (like GLY) have alternative atom IDs that clash with canonical IDs for a different atom. This breaks synonyms because the clashing alternate ID is never assigned; the PDB file is interpreted as having two copies of the canonical ID atom. To fix this, we just split residue definitions with this clashing problem into two definitions, one with the canonical IDs and the other with the alternates. Note that if a linking atom is part of a clash, this may interfere with the disambiguated residue definition's ability to link to canonically named residue definitions. """ clashes: list[int] = [] canonical_names = {atom.name for atom in res.atoms} for i, atom in enumerate(res.atoms): for synonym in atom.synonyms: if synonym in canonical_names: clashes.append(i) if len(clashes) != 0: old_to_new: dict[str, str] = {} for atom in res.atoms: if atom.synonyms: old_to_new[atom.name] = unwrap(atom.synonyms) else: old_to_new[atom.name] = atom.name res1 = res.replace( atoms=[ atom.replace( synonyms=[], ) for atom in res.atoms ], ) res2 = res.replace( atoms=[ atom.replace( name=old_to_new[atom.name], synonyms=[], ) for atom in res.atoms ], bonds=[ bond.replace( atom1=old_to_new[bond.atom1], atom2=old_to_new[bond.atom2], ) for bond in res.bonds ], description=res.description + " altids", crosslink=( None if res.crosslink is None else res.crosslink.replace( atom1=old_to_new[res.crosslink.atom1], atom2=old_to_new.get( res.crosslink.atom2, res.crosslink.atom2, ), ) ), linking_bond=( None if res.linking_bond is None else res.linking_bond.replace( atom1=old_to_new[res.linking_bond.atom1], atom2=old_to_new.get( res.linking_bond.atom2, res.linking_bond.atom2, ), ) ), ) return [res1, res2] else: return [res]
[docs]def add_dephosphorylated_5p_terminus(res: ResidueDefinition) -> list[ResidueDefinition]: """ Add a definition representing a dephosphorylated 5' nucleic acid terminus. Assumes the phosphate has the standard CCD nucleic acid atom names: "P", "HOP2", "HOP3", "OP1", "OP2", and "OP3". """ phosphate_names = {"P", "HOP2", "HOP3", "OP1", "OP2", "OP3"} return [ res, res.replace( atoms=( AtomDefinition( name="HO5'", synonyms=(), symbol="H", aromatic=False, charge=0, leaving=False, stereo=None, ), *(atom for atom in res.atoms if atom.name not in phosphate_names), ), bonds=( BondDefinition( atom1="O5'", atom2="HO5'", aromatic=False, order=1, stereo=None, ), *( bond for bond in res.bonds if bond.atom1 not in phosphate_names and bond.atom2 not in phosphate_names ), ), description=( res.description[:-17] + "-5'-PROTONATED" if res.description.endswith("-5'-MONOPHOSPHATE") else res.description + " 5' dephosphorylated" ), ), ]
[docs]def set_hop3_leaving(res: ResidueDefinition) -> list[ResidueDefinition]: """ Mark the HOP3 leaving atom in nucleic acid residues as such. The OP3 and HOP3 atoms in nucleic acid residues are both absent when the residue forms part of a polymer, but only OP3 is marked as leaving; HOP3 becomes a disconnected fragment. """ # TODO: Replace this with disconnected fragment detection in ResidueDefinition._leaving_fragment_of() return [ res.replace( atoms=( atom.replace(leaving=True) if atom.name == "HOP3" else atom for atom in res.atoms ), ), ]
[docs]def patch_his_sidechain_zwitterion(res: ResidueDefinition) -> list[ResidueDefinition]: """Patch the histidine side chain so that it is neutral, not zwitterionic.""" nd1_atom = res.name_to_atom.get("ND1", None) ne2_atom = res.name_to_atom.get("NE2", None) if ( nd1_atom is not None and nd1_atom.charge == 1 and ne2_atom is not None and ne2_atom.charge == -1 ): bonds: list[BondDefinition] = [] nd1_ce1_bond: BondDefinition | None = None ne2_ce1_bond: BondDefinition | None = None for bond in res.bonds: if {bond.atom1, bond.atom2} == {"ND1", "CE1"}: nd1_ce1_bond = bond elif {bond.atom1, bond.atom2} == {"NE2", "CE1"}: ne2_ce1_bond = bond else: bonds.append(bond) if ( nd1_ce1_bond is None or nd1_ce1_bond.order != 2 or ne2_ce1_bond is None or ne2_ce1_bond.order != 1 ): raise PabloError( "Zwitterionic histidine side chain detected but could not be corrected", ) else: return [ res.replace( bonds=[ nd1_ce1_bond.replace(order=1), ne2_ce1_bond.replace(order=2), *bonds, ], atoms=[ nd1_atom.replace(charge=0), ne2_atom.replace(charge=0), *( atom for atom in res.atoms if atom is not nd1_atom and atom is not ne2_atom ), ], ), ] else: return [res]
[docs]def delete_doubly_deprotonated_arginine( res: ResidueDefinition, ) -> list[ResidueDefinition]: """HH12 and HH22 are both acidic, but only one can leave at a time.""" atom_names = {atom.name for atom in res.atoms} if "HH22" not in atom_names and "HH12" not in atom_names: return [] else: return [res]
[docs]def add_nh2_leaving_atom( res: ResidueDefinition, ) -> list[ResidueDefinition]: """Add the missing leaving atom to the NH3 residue.""" return [ res.replace( atoms=[ *res.atoms, AtomDefinition.with_defaults(name="H3", symbol="H", leaving=True), ], bonds=[*res.bonds, BondDefinition.with_defaults(atom1="H3", atom2="N")], description=res.description + " +H3", ), res, ]
[docs]def strip_linkless_leavers( res: ResidueDefinition, ) -> list[ResidueDefinition]: """ Set ``leaving=False`` on all atoms if ``res`` has no linking bond or crosslink. """ if res.linking_bond is None and res.crosslink is None: return [res.replace(atoms=[atom.replace(leaving=False) for atom in res.atoms])] else: return [res]