Source code for openff.pablo.ccd._ccdcache

from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from copy import deepcopy
from io import StringIO
from pathlib import Path
from typing import Self, no_type_check
from urllib.error import URLError
from urllib.request import HTTPError, urlopen

import xdg.BaseDirectory as xdg_base_dir
from openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader

from openff.pablo._utils import flatten
from openff.pablo.exceptions import PabloError

from ..chem import PEPTIDE_BOND, PHOSPHODIESTER_BOND
from ..residue import (
    AtomDefinition,
    BondDefinition,
    ResidueDefinition,
    _skip_residue_definition_validation,
)

__all__ = [
    "CcdCache",
]


[docs]class CcdCache(Mapping[str, tuple[ResidueDefinition, ...]]):
    """
    Caches, patches, and presents the CCD as a Python ``Mapping``.

    This class is a wrapper around a ``dict`` that stores residue definitions.
    When a residue is requested via the indexing syntax (for example,
    ``my_ccd_cache["ALA"]``) or the ``in`` operator, this dictionary is checked
    first. If the residue is not present, the CCD is then checked. If the
    residue cannot be retrieved from the inner ``dict`` or the CCD, a
    ``KeyError`` is raised or ``False`` is returned as appropriate.

    Iterating over the mapping, checking its length, or otherwise treating the
    ``CcdCache`` as a mapping other than with the indexing syntax or ``in``
    operator works only on the inner ``dict``. As a result, accessing
    a residue via indexing may return a value even if these other methods
    suggest it won't.

    ``CcdCache`` can apply patches to the entries it downloads from the CCD.
    This is used to work around known errors, deficiencies and inconsistencies
    in the CCD definitions. Patches are specified as functions that take a
    single residue definition and return a list of them.

    The ``extra_definitions`` and the ``with_()`` and ``with_replaced`` methods
    allow custom definitions to be added to a ``CcdCache``. These custom
    definitions are not patched. Since they are stored alongside cached entries
    in the inner dictionary, custom definitions supercede any that have not
    already been downloaded from the CCD.

    When a CCD entry is downloaded, the corresponding CIF file is stored in the
    ``cache_path``. This means that each entry will be downloaded only once,
    even across multiple Python invocations. All entries in the cache are loaded
    (and patches applied) when a new ``CcdCache`` is created. ``CcdCache``
    assumes that the files in the cache path were downloaded from the CCD and
    may do unexpected things if they are edited by hand.

    Users may provide additional CCD entries by specifying library paths. By
    default, this is used to ship commonly used residues with Pablo. At the
    moment, patches are applied to files in library paths, but it is likely that
    in the future they won't be and residues shipped with Pablo will be shipped
    pre-patched to speed up load times. Like with the cache, files from library
    paths are loaded when a ``CcdCache`` is created.

    Accessing the CCD requires internet access. Without internet access, entries
    from the cache or library paths can still be loaded, as can any entries
    added to an instance of this class.

    Parameters
    ==========
    library_paths
        Paths to search for user-provided or packaged CCD entries. All paths are
        searched.
    cache_path
        The path to which to download CCD entries. This path is searched in
        addition to ``library_paths``.
    preload
        A list of residue names to download when initializing the class.
    patches
        Functions to call on ``ResidueDefinitions`` downloaded from the CCD
        before they are returned or added to the inner ``dict``. An iterable of
        maps from residue names each to a single callable. Each map is applied
        to residues with the given name in the order they are iterated over. Any
        patches corresponding to key ``"*"`` will be applied to all residues
        before the more specific patches in its map.
    extra_definitions
        Additional residue definitions to add to the cache. Note that patches
        are not applied to these definitions.
    """

    # TODO: remove library_paths in favour of loading pre-patched entries
    #       distributed with Pablo via extra_definitions

    # TODO: Consider storing extra definitions and patched CCD entries in
    #       separate inner dicts to make behaviour less dependent on access
    #       sequencing

    # TODO: Make cache_path optional

    # TODO: Compress the on-disk cache

    def __init__(
        self,
        library_paths: Iterable[Path | str],
        cache_path: Path | str = Path(
            xdg_base_dir.save_cache_path("openff-pablo"),
            "ccd_cache",
        ),
        preload: list[str] = [],
        patches: Iterable[
            Mapping[
                str,
                Callable[[ResidueDefinition], list[ResidueDefinition]],
            ]
        ] = {},
        extra_definitions: Mapping[str, Iterable[ResidueDefinition]] = {},
    ):
        self._cache_path = Path(cache_path).resolve()
        self._cache_path.mkdir(parents=True, exist_ok=True)
        self._library_paths = {Path(path).resolve() for path in library_paths}

        self._definitions: dict[str, list[ResidueDefinition]] = {}
        self._patches: list[
            dict[
                str,
                Callable[[ResidueDefinition], list[ResidueDefinition]],
            ]
        ] = [dict(d) for d in patches]

        for path in self._glob("*.cif"):
            try:
                self._add_definition_from_str(path.read_text(), patch=True)
            except Exception:
                # If adding a file fails, skip it - we want an error at runtime, not importtime
                pass

        for resname in set(preload) - set(self._definitions):
            try:
                self[resname]
            except Exception:
                # If a preload fails, skip it - we want an error at runtime, not importtime
                pass

        for resname, resdefs in extra_definitions.items():
            self._add_definitions(resdefs, resname)

    def __getitem__(self, key: str) -> tuple[ResidueDefinition, ...]:
        res_name = key.upper()
        if res_name in ["UNK", "UNL"] and res_name not in self._definitions:
            # These residue names are reserved for unknown ligands/peptide residues
            raise KeyError(res_name, "reserved residue name")
        # Check the loaded definitions
        try:
            return tuple(self._definitions[res_name])
        except KeyError:
            pass

        # Don't check the library; it was loaded in __init__, and
        # trying to hot-load a definition that's already in self._definitions
        # would be very confusing.

        # If it's a residue id that could be in the CCD, try to download it.
        # We should check the cache here in case this file has been downloaded
        # since this CcdCache was created - say, in a separate process, or
        # another CcdCache - but this is done by _add_definition_from_ccd()
        try:
            return tuple(self._add_definition_from_ccd(res_name))
        except HTTPError:
            raise KeyError(res_name, "unknown and absent from CCD")
        except URLError:
            raise KeyError(res_name, "unknown and CCD could not be accessed")

    def _add_definition_from_ccd(self, res_name: str) -> list[ResidueDefinition]:
        """Add a definition by retrieving it from the CCD (or the cache)"""
        cache_path = self._cache_path / f"{res_name}.cif"
        if cache_path.is_file():
            s = cache_path.read_text()
        else:
            s = self._download_cif(res_name)

        return self._add_definition_from_str(s, res_name=res_name, patch=True)

    def _apply_patches(
        self,
        residue_definition: ResidueDefinition,
    ) -> list[ResidueDefinition]:
        assert residue_definition.residue_name is not None

        with _skip_residue_definition_validation():
            definitions: list[ResidueDefinition] = [residue_definition]
            for patch_dict in self._patches:
                star_patch = patch_dict.get("*", lambda x: [x])
                res_patch = patch_dict.get(
                    residue_definition.residue_name.upper(),
                    lambda x: [x],
                )
                patched_definitions: list[ResidueDefinition] = []
                for definition in definitions:
                    for star_patched_res in star_patch(definition):
                        patched_definitions.extend(res_patch(star_patched_res))
                definitions = patched_definitions

        for definition in definitions:
            definition._validate()

        return definitions

    def _add_definition_from_str(
        self,
        s: str,
        res_name: str | None = None,
        patch: bool = False,
    ) -> list[ResidueDefinition]:
        definition = self._res_def_from_ccd_str(s)
        if res_name is None:
            if definition.residue_name is None:
                raise PabloError(
                    "Anonymous residue definitions cannot be added to a CcdCache",
                )
            res_name = definition.residue_name.upper()

        return self._add_definitions((definition,), res_name, patch=patch)

    def _add_definitions(
        self,
        definitions: Iterable[ResidueDefinition],
        res_name: str,
        patch: bool = False,
    ) -> list[ResidueDefinition]:
        if patch:
            definitions = list(flatten(map(self._apply_patches, definitions)))
        else:
            definitions = list(definitions)

        for definition in definitions:
            if definition.residue_name is None:
                raise PabloError(
                    "Anonymous residue definitions cannot be added to a CcdCache",
                )
            if res_name != definition.residue_name.upper():
                raise PabloError(
                    f"ResidueDefinition {definition.residue_name}"
                    + f" ({definition.description}) must have residue name {res_name}",
                )

        stored_definitions = self._definitions.setdefault(res_name, [])
        # Using a dict comprehension here rather than simple
        # set(definitions) - set(stored_definitions) to preserve ordering
        old_definitions = set(stored_definitions)
        stored_definitions.extend(
            {k: None for k in definitions if k not in old_definitions},
        )
        return stored_definitions

    def _download_cif(self, resname: str) -> str:
        with urlopen(
            f"https://files.rcsb.org/ligands/download/{resname.upper()}.cif",
        ) as stream:
            s: str = stream.read().decode("utf-8")
        path = self._cache_path / f"{resname.upper()}.cif"
        path.write_text(s)
        return s

    def _glob(
        self,
        pattern: str,
        *,
        cached: bool = True,
        library: bool = True,
    ) -> Iterator[Path]:
        """
        Get paths matching the given glob pattern from the cache and/or library

        Parameters
        ==========
        pattern
            The glob to search for
        cached
            Whether to look in the cache path for the glob
        library
            Whether to look in the library paths for the glob
        """
        for path in {
            *((self._cache_path,) if cached else ()),
            *(self._library_paths if library else ()),
        }:
            yield from path.glob(pattern)

    @staticmethod
    def _res_def_from_ccd_str(s: str) -> ResidueDefinition:
        @no_type_check
        def inner(s):
            # TODO: Handle residues like CL with a single atom properly (no tables)
            data = []
            with StringIO(s) as file:
                PdbxReader(file).read(data)
            block = data[0]

            parent_residue_name = (
                block.getObj("chem_comp").getValue("mon_nstd_parent_comp_id").upper()
            )
            parent_residue_name = (
                None if parent_residue_name == "?" else parent_residue_name
            )
            residueName = block.getObj("chem_comp").getValue("id").upper()
            residue_description = block.getObj("chem_comp").getValue("name")
            linking_type = block.getObj("chem_comp").getValue("type").upper()
            linking_bond = LINKING_TYPES[linking_type]

            atomData = block.getObj("chem_comp_atom")
            atomNameCol = atomData.getAttributeIndex("atom_id")
            altAtomNameCol = atomData.getAttributeIndex("alt_atom_id")
            symbolCol = atomData.getAttributeIndex("type_symbol")
            leavingCol = atomData.getAttributeIndex("pdbx_leaving_atom_flag")
            chargeCol = atomData.getAttributeIndex("charge")
            aromaticCol = atomData.getAttributeIndex("pdbx_aromatic_flag")
            stereoCol = atomData.getAttributeIndex("pdbx_stereo_config")

            atoms = [
                AtomDefinition(
                    name=row[atomNameCol],
                    synonyms=tuple(
                        [row[altAtomNameCol]]
                        if row[altAtomNameCol] != row[atomNameCol]
                        else [],
                    ),
                    symbol=row[symbolCol][0:1].upper() + row[symbolCol][1:].lower(),
                    leaving=row[leavingCol] == "Y",
                    charge=int(row[chargeCol]),
                    aromatic=row[aromaticCol] == "Y",
                    stereo=None if row[stereoCol] == "N" else row[stereoCol],
                )
                for row in atomData.getRowList()
            ]

            bondData = block.getObj("chem_comp_bond")
            if bondData is not None:
                atom1Col = bondData.getAttributeIndex("atom_id_1")
                atom2Col = bondData.getAttributeIndex("atom_id_2")
                orderCol = bondData.getAttributeIndex("value_order")
                aromaticCol = bondData.getAttributeIndex("pdbx_aromatic_flag")
                stereoCol = bondData.getAttributeIndex("pdbx_stereo_config")
                bonds = [
                    BondDefinition(
                        atom1=row[atom1Col],
                        atom2=row[atom2Col],
                        order={"SING": 1, "DOUB": 2, "TRIP": 3, "QUAD": 4}[
                            row[orderCol]
                        ],
                        aromatic=row[aromaticCol] == "Y",
                        stereo=None if row[stereoCol] == "N" else row[stereoCol],
                    )
                    for row in bondData.getRowList()
                ]
            else:
                bonds = []

            with _skip_residue_definition_validation():
                residue_definition = ResidueDefinition(
                    residue_name=residueName,
                    description=residue_description,
                    linking_bond=linking_bond,
                    crosslink=None,
                    atoms=tuple(atoms),
                    bonds=tuple(bonds),
                    virtual_sites=(),
                )

            return residue_definition

        return inner(s)

    def __contains__(self, value: object) -> bool:
        if value in self._definitions:
            return True
        if not isinstance(value, str):
            raise TypeError(
                f"CcdCache keys are residue names of type str, not {type(value)}",
            )

        try:
            self[value]
        except Exception:
            # This catches KeyError, but also failures to download the residue
            return False
        else:
            return True

    def __iter__(self) -> Iterator[str]:
        return self._definitions.__iter__()

    def __len__(self) -> int:
        return self._definitions.__len__()

[docs]    def with_(
        self,
        definitions: Mapping[str, Sequence[ResidueDefinition]]
        | Sequence[ResidueDefinition],
    ) -> Self:
        """
        Get a copy of this ``CcdCache`` with additional definitions added.

        Definitions may be supplied as a mapping from residue names to sequences
        of residue definitions, or as a sequence of residue definitions. In the
        latter case, the residue names are taken from the residue definitions
        themselves.

        Note that patches are not applied to the new definitions.

        Examples
        ========

        Add a custom definition to the ``STD_CCD_CACHE``. We use
        a 4-letter residue code as they are supported by Pablo's PDB reader and
        do not clash with the CCD's definitions.

        >>> from openff.pablo import STD_CCD_CACHE, ResidueDefinition
        >>> my_ccd_cache = STD_CCD_CACHE.with_([
        ...     ResidueDefinition.from_smiles(
        ...         "[H:1][O:2][O:3][H:4]",
        ...         {1: "H1", 2: "O1", 3: "O2", 4: "H2"},
        ...         "HOOH",
        ...     )
        ... ])

        Add protonation variants of a residue by specifying acidic and basic
        atoms.

        >>> from openff.pablo import STD_CCD_CACHE, ResidueDefinition
        >>>
        >>> # Get the GABA (γ-amino butanoic acid) residue definition from CCD
        >>> gaba_resdef = STD_CCD_CACHE["ABU"][0]
        >>>
        >>> # Generate the variants and add them to a new cache
        >>> my_ccd_cache = STD_CCD_CACHE.with_({
        ...     "ABU": gaba_resdef.vary_protonation(
        ...         acidic=["HXT"], # Atom name of abstractable proton
        ...         basic=[("N", "H3")], # Atom to protonate, name of new proton
        ...     )[1:], # Skip the first entry, which is already in the cache
        ... })
        >>> # Should have added three variants - positive, negative, zwitterion
        >>> len(my_ccd_cache["ABU"]) - len(STD_CCD_CACHE["ABU"])
        3

        See Also
        ========
        with_replaced, without

        """
        if not isinstance(definitions, Mapping):
            definitions_map: dict[str, list[ResidueDefinition]] = {}
            for resdef in definitions:
                if resdef.residue_name is None:
                    raise PabloError(
                        "Anonymous residue definitions cannot be added to a CcdCache",
                    )
                definitions_map.setdefault(resdef.residue_name, []).append(resdef)
            definitions = definitions_map

        new = deepcopy(self)
        for resname, resdefs in definitions.items():
            new._add_definitions(resdefs, resname)
        return new

[docs]    def with_replaced(
        self,
        definitions: Mapping[str, Sequence[ResidueDefinition]]
        | Sequence[ResidueDefinition],
    ) -> Self:
        """
        Get a copy of this ``CcdCache`` with some definitions replaced.

        Similar to ``with_``, but does not retain existing definitions for the
        specified residue names. All residue names that are keys of a
        ``definitions`` mapping or are residue names in a ``definitions``
        sequence are removed from the new ``CcdCache`` before adding the
        new definitions.

        Note that patches are not applied to the new definitions.

        See Also
        ========
        with_, without

        """
        if not isinstance(definitions, Mapping):
            definitions_map: dict[str, list[ResidueDefinition]] = {}
            for resdef in definitions:
                if resdef.residue_name is None:
                    raise PabloError(
                        "Anonymous residue definitions cannot be added to a CcdCache",
                    )
                definitions_map.setdefault(resdef.residue_name, []).append(resdef)
            definitions = definitions_map

        new = deepcopy(self)
        for resname, resdefs in definitions.items():
            del new._definitions[resname]
            new._add_definitions(resdefs, resname)
        return new

[docs]    def without(
        self,
        residue_names: Iterable[str],
    ) -> Self:
        """
        Get a copy of this ``CcdCache`` lacking any definitions with some names.

        All definitions for each of the given residue names will not be present
        in the new cache. Note that residues that are in the CCD will still be
        returned when they are requested, as long as they can be re-downloaded
        or found in the cache.

        See Also
        ========
        with_replaced, with_

        """
        new = deepcopy(self)
        for resname in residue_names:
            del new._definitions[resname]
        return new

[docs]    def with_patch(
        self,
        residue_name: str,
        patch: Callable[[ResidueDefinition], list[ResidueDefinition]],
    ) -> Self:
        """
        Add a patch to the residues loaded via a copy of this ``CcdCache``.

        The patch is added to a copy of the ``CcdCache``, and the copy is
        returned. The original ``CcdCache`` is left unmodified.

        The patch function is called on each residue definition stored under the
        given residue name. The returned residue definitions are concatenated
        and replace the originals. Patches can therefore add, modify, split, or
        replace residue definitions depending on whether they include the
        original definition in the output.

        The patch is applied to all definitions in the cache when this function
        is applied, as well as any definitions downloaded from the CCD in the
        future. It is not applied to definitions added by the other
        ``CcdCache.with_*()`` methods.
        """
        new = deepcopy(self)
        new._patches.append({residue_name: patch})
        if residue_name in new._definitions:
            new._definitions[residue_name] = list(
                flatten(patch(resdef) for resdef in new._definitions[residue_name]),
            )
        return new

[docs]    def with_virtual_sites(
        self,
        residue_name: str,
        virtual_sites: Iterable[str],
    ) -> Self:
        """
        Copy ``self``, adding new residue definitions requiring some virtual sites.

        The new definition is added to a copy of the ``CcdCache``, and the copy
        is returned. The original ``CcdCache`` is left unmodified.

        A new residue definition is added for each definition currently stored
        in the cache under the given name. The new definition requires that all
        the given virtual site names be present in order for it to match, and it
        discards the corresponding ATOM/HETATM records.

        This method works by adding a patch. It will affect any residue
        definition already added to the cache under the given name, or any
        definition downloaded in the future, but not any definition added in the
        future by the ``with_`` or ``with_replaced`` methods.
        """
        virtual_sites = tuple(virtual_sites)
        return self.with_patch(
            residue_name,
            lambda resdef: [resdef, resdef.replace(virtual_sites=virtual_sites)],
        )

[docs]    def with_vsite_water(self) -> Self:
        """
        Copy ``self``, adding new definitions for common multisite water models.

        The new definitions are added to a copy of the ``CcdCache``, and the
        copy is returned. The original ``CcdCache`` is left unmodified.

        The new definitions require that all the virtual site names be
        present in order for them to match, and they discard the corresponding
        ATOM/HETATM records. The name for the 4-point model virtual site is
        ``EPW``, and for the 5-point model ``EP1`` and ``EP2``.

        This method works by adding a patch. It will affect any 3-atom residue
        definitions already added to the cache under the names ``HOH``, ``WAT``,
        or ``SOL``, or any so-named definition downloaded in the future, but not
        any  definition added in the future by the ``with_`` or ``with_replaced``
        methods.
        """

        def add_vsites_to_water(resdef: ResidueDefinition) -> list[ResidueDefinition]:
            if resdef.n_expected_atoms == 3 and {
                atom.name for atom in resdef.atoms
            } == {"O", "H1", "H2"}:
                return [
                    resdef,
                    resdef.replace(virtual_sites=["EPW"]),
                    resdef.replace(virtual_sites=["EP1", "EP2"]),
                ]
            else:
                return [resdef]

        return (
            self.with_patch(
                "HOH",
                add_vsites_to_water,
            )
            .with_patch(
                "WAT",
                add_vsites_to_water,
            )
            .with_patch(
                "SOL",
                add_vsites_to_water,
            )
        )

[docs]    def with_varied_protonation(
        self,
        residue_name: str,
        *,
        acidic: Iterable[str] = (),
        basic: Iterable[tuple[str, str]] = (),
    ) -> Self:
        """
        Get a copy of ``self`` with all combinations of some protonation states.

        Note that all combinations of protonations and deprotonations are
        generated; this means that if ``acidic`` has length ``n`` and ``basic``
        has length ``m``, ``2**(n+m)`` variants will be generated for each
        existing variant.

        If no variants at all are generated, ``PabloError`` is raised. Otherwise,
        whatever variants make sense are created for each existing variant.

        This method will download the given residue name from the CCD if it is
        not already in the cache.

        Parameters
        ==========
        residue_name
            The name of the residue to generate alternate protonation states
            for.
        acidic
            Existing hydrogen atoms that can be removed to form a new
            protonation state. Each element specifies an atom name to remove,
            decrementing the formal charge on the neighbouring heavy atom.
            Multiply bonded, unbonded, missing, or non-hydrogen atoms are
            skipped unless no variants at all are generated.
        basic
            Existing non-hydrogen atoms that can be protonated to form a new
            protonation state, as well as the canonical name of the new
            hydrogen. Each tuple specifies an atom name to protonate (increment
            the formal charge and form a bond) and the name of the added proton.
            Unknown heavy atoms and new atom names that clash with existing
            names raise are skipped unless no variants at all are generated.
        """
        original_resdefs = self[residue_name]
        if len(original_resdefs) == 0:
            raise PabloError("cannot (de)protonate unknown residue {residue_name}")

        new_resdefs: list[ResidueDefinition] = []
        for resdef in original_resdefs:
            resdef_variants = resdef.vary_protonation(
                acidic=acidic,
                basic=basic,
                skip_errors=True,
            )
            # Only add the new variants so we can keep count of how many we've
            # come up with
            for i, variant in enumerate(resdef_variants[1:]):
                # Filter out duplicate variants
                if not any(
                    variant._is_isomorphic_to(other) for other in resdef_variants[:i]
                ):
                    new_resdefs.append(variant)
        if len(new_resdefs) == 0:
            raise PabloError("no new protonation variants added")

        return self.with_({residue_name: new_resdefs})


# TODO: Fill in this data
LINKING_TYPES: dict[str, BondDefinition | None] = {
    # "D-beta-peptide, C-gamma linking".upper(): [],
    # "D-gamma-peptide, C-delta linking".upper(): [],
    # "D-peptide COOH carboxy terminus".upper(): [],
    # "D-peptide NH3 amino terminus".upper(): [],
    # "D-peptide linking".upper(): [],
    # "D-saccharide".upper(): [],
    # "D-saccharide, alpha linking".upper(): [],
    # "D-saccharide, beta linking".upper(): [],
    # "DNA OH 3 prime terminus".upper(): [],
    # "DNA OH 5 prime terminus".upper(): [],
    "DNA linking".upper(): PHOSPHODIESTER_BOND,
    "L-DNA linking".upper(): PHOSPHODIESTER_BOND,
    "L-RNA linking".upper(): PHOSPHODIESTER_BOND,
    # "L-beta-peptide, C-gamma linking".upper(): [],
    # "L-gamma-peptide, C-delta linking".upper(): [],
    # "L-peptide COOH carboxy terminus".upper(): [],
    # "L-peptide NH3 amino terminus".upper(): [],
    "L-peptide linking".upper(): PEPTIDE_BOND,
    # "L-saccharide".upper(): [],
    # "L-saccharide, alpha linking".upper(): [],
    # "L-saccharide, beta linking".upper(): [],
    # "RNA OH 3 prime terminus".upper(): [],
    # "RNA OH 5 prime terminus".upper(): [],
    "RNA linking".upper(): PHOSPHODIESTER_BOND,
    "non-polymer".upper(): None,
    # "other".upper(): [],
    "peptide linking".upper(): PEPTIDE_BOND,
    "peptide-like".upper(): PEPTIDE_BOND,
    # "saccharide".upper(): [],
}
"""Map from the CCD's linking types to the bond formed between two such monomers"""