Source code for openff.pablo._pdb

import logging
import warnings
from collections.abc import Collection, Iterable, Mapping
from io import TextIOBase
from os import PathLike
from pathlib import Path
from typing import IO, Literal

import numpy as np
from openff.toolkit import Molecule, Topology
from openff.units import unit

from openff.pablo._matching import SuccessfulMatch
from openff.pablo._pdb_data import PdbData
from openff.pablo._std_ccd_cache import STD_CCD_CACHE
from openff.pablo._utils import (
    cryst_to_box_vectors,
    sort_tuple,
)
from openff.pablo.exceptions import PabloError
from openff.pablo.residue import ResidueDefinition

__all__ = [
    "topology_from_pdb",
]


[docs]def topology_from_pdb( file: PathLike[str] | str | IO[str] | TextIOBase, *, residue_library: Mapping[ str, Collection[ResidueDefinition], ] = STD_CCD_CACHE, additional_definitions: Collection[ResidueDefinition] = [], format: Literal["PDB", "CIF", None] = None, use_canonical_names: bool = False, ) -> Topology: """ Load a PDB file into an OpenFF ``Topology``. This function requires all hydrogens (and all other atoms) to be present in the PDB file, and that atom and residue names are consistent with the ``residue_library``. In return, it provides full chemical information on the entire PDB file. To load a PDB file with molecules including any residue not found in the CCD, or with residues that differ from that specified under a particular residue name, provide your own ``residue_library``. Any mapping from a residue name to a list of :py:data:`ResidueDefinition <openff.pablo.ResidueDefinition>` objects may be used, but the :py:mod:`ccd <openff.pablo.ccd>` module provides tools for augmenting the CCD. Alternatively, to load a single-residue molecule that is not present in the CCD, name that molecule ``"UNL"`` (or any name not present in the ``residue_library``), specify its CONECT records, and provide the appropriate molecule to the ``additional_definitions`` argument. Note that chemical information is derived from matching the residue definitions provided to this function to the atom names, residue names, elements, formal charges, and CONECT records in the PDB file. In partiular, the presence, absence, or electronic properties of a bond are never inferred from atomic coordinates. However, the stereochemistry of atoms and bonds are computed from their atomic coordinates, even when that information is present in a residue definition. Parameters ---------- file The path to the PDB file or the PDB file as a file-like object. residue_library The library of residues to identify the atoms in the PDB file by. By default, a patched version of the CCD. Chemistry is identified by atom and residue names. If multiple residue definitions match a particular residue, the first one encountered is applied. additional_definitions Additional residue definitions to match against all residues that found no matches in the ``residue_library``. These definitions can match whether or not the residue name matches. Unlike ``residue_library``, this requires that CONECT records be present for any bonds not covered by the library and performs a match between the chemical graphs rather than using residue and atom names to detect chemistry. To use this argument with OpenFF ``Molecule`` objects or SMILES strings, see the ``ResidueDefinition.from_*`` class methods. format The file format the file is encoded in. `"PDB"` expects a standard PDB file. `"CIF"` expects a PDBx/mmCIF file. Omitting the argument or `None` interprets files with a known filename according to their filename extension, or an unknown filename extension as PDB. use_canonical_names If ``True``, atom names in the PDB file will be replaced by the canonical name for the same atom from the residue library. Notes ----- This function uses a residue library to load a PDB file from its atom and residue names without guessing bonds. Bonds will be added by comparing atom and residue names to the residues defined in the ``residue_library`` argument, which by default uses a patched version of the RCSB Chemical Component Dictionary (CCD). This is the dictionary of residue and atom names that the RCSB PDB is referenced against. The CCD is very large and cannot be distributed with this software, so by default internet access is required to use it. The produced ``Topology`` will have its atoms in the same order as the PDB file in all cases except when the atoms in one molecule are divided by another molecule. This can happen, for example, if a PDB file with 3 chains A, B and C has a disulfide bond between A and C. In this case, chains A and C form a single molecule, but the atoms from B should be in the middle. This atom ordering cannot be represented in :py:class:`openff.toolkit.Topology <openff.toolkit.topology.Topology>` unless all 3 chains are included in a single :py:class:`openff.toolkit.Molecule <openff.toolkit.topology.Molecule>`, which would then represent two distinct chemical molecules. When this occurs, atoms from the latter chain(s) appear immediately after the first, and atoms from other molecules appear later. The following metadata are specified for all atoms produced by this function and can be accessed via ``topology.atom(i).metadata[key]``: ``"residue_name"`` The residue name ``"residue_number"`` The residue number, converted to an ``int``. If the residue number cannot be converted to an ``int``, the residue index instead. ``"res_seq"`` The residue number as the string found in the PDB file ``"residue_index"`` The residue index; the first residue has index ``0``, the second ``1``, etc., regardless of the value of the res_seq column. ``"insertion_code"`` The icode for the atom's residue. Used to align residue numbers between proteins with indels. ``"chain_id"`` The letter identifier for the atom's chain. ``"pdb_index"`` The atom's index in the PDB file. Sometimes called rank. Not to be confused with ``"atom_serial"``, which is the number given to the atom in the second column of the PDB file. Guaranteed to be unique. Care is taken to make this match the index of the atom within the topology as closely as possible, but this is not possible when virtual sites are present or when the PDB atom order cannot be represented in a ``Topology``. ``"used_synonym"`` The name of the atom that was found in the PDB file. By default, `atom.name` is set to this. ``"canonical_name"`` The canonical name of the atom in the residue library. `atom.name` can be set to this with the `use_canonical_names` argument. ``"atom_serial"`` The serial number of the atom, found in the second column of the PDB file, as a string. Not guaranteed to be unique. ``"b_factor"`` The temperature b-factor for the atom. ``"occupancy"`` The occupancy for the atom. ``"alt_loc"`` The alternate location code for the atom. ``"pdb_line_no"`` The line number in the PDB file that contained this atom record. ``"matched_residue_description"`` The residue description found in the residue library. ``"matched_stereo"`` The stereochemistry defined for this atom in the residue definition. This may differ from the stereochemistry assigned to the atom, which is computed from the atomic coordinates. One of the strings ``"R"``, ``"S"``, or ``""``. """ if hasattr(file, "readlines"): if format is None and hasattr(file, "filename"): format = Path(file.name).suffix[1:].upper() # type: ignore elif format is None or format == "": # type: ignore format = "PDB" data = PdbData.from_file_object(file, format=format) # type: ignore else: data = PdbData.from_file(file, format=format) # type: ignore matches = data.get_successful_matches( residue_library, list(additional_definitions), ) topology = _build_topology( matches=matches, data=data, use_canonical_names=use_canonical_names, ) _check_all_conects(topology, data) return topology
def _build_topology( matches: Iterable[SuccessfulMatch], data: PdbData, *, use_canonical_names: bool, ) -> Topology: rdmol = data.matches_to_rdmol(matches, use_canonical_names=use_canonical_names) # Set positions logging.debug("Setting conformer to PDB positions") rdmol_pdb_indices = [atom.properties["pdb_index"] for atom in rdmol.atoms] positions = np.stack([data.x, data.y, data.z], axis=-1) * unit.angstrom rdmol = rdmol.edit().add_conformer_and(positions[rdmol_pdb_indices]).freeze() molecules: list[Molecule] = [] for rdmol in rdmol.split_molecule_fragments(): offmol = ( rdmol.edit() .assign_stereochemistry_from_3d_and() .freeze() .to_openff_molecule() ) offmol.add_default_hierarchy_schemes() molecules.append(offmol) logging.debug("produce topology") topology = Topology.from_molecules(molecules) topology_pdb_indices = [atom.metadata["pdb_index"] for atom in topology.atoms] if topology_pdb_indices != list(range(len(topology_pdb_indices))): logging.debug( "\n".join( f"topology index {j: <7} has pdb index {i: <7}" for j, i in enumerate(topology_pdb_indices) if i != j ), ) warnings.warn( "Input PDB has an atom ordering that cannot be represented in an" + " OpenFF Topology. The atoms in this topology will not be in same" + " order as those in PDB file", ) _set_box_vectors(topology, data) return topology def _check_all_conects(topology: Topology, data: PdbData): all_bonds: set[tuple[int, int]] = { sort_tuple((bond.atom1.metadata["pdb_index"], bond.atom2.metadata["pdb_index"])) # type:ignore for bond in topology.bonds } conect_bonds: set[tuple[int, int]] = set() for i, js in enumerate(data.conects): for j in js: conect_bonds.add(sort_tuple((i, j))) if not conect_bonds.issubset(all_bonds): raise PabloError( "CONECT records without chemical information not supported", sorted( { sort_tuple((data.serial[a], data.serial[b])) for a, b in conect_bonds.difference(all_bonds) }, ), ) def _set_box_vectors(topology: Topology, data: PdbData): if ( data.cryst1_a is not None and data.cryst1_b is not None and data.cryst1_c is not None and data.cryst1_alpha is not None and data.cryst1_beta is not None and data.cryst1_gamma is not None ): topology.box_vectors = cryst_to_box_vectors( data.cryst1_a, data.cryst1_b, data.cryst1_c, data.cryst1_alpha, data.cryst1_beta, data.cryst1_gamma, )