Source code for qcelemental.models.molecule

"""
Molecule Object Model
"""

import hashlib
import json
import warnings
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union, cast

import numpy as np
from pydantic import ConstrainedFloat, ConstrainedInt, Field, constr, validator

# molparse imports separated b/c https://github.com/python/mypy/issues/7203
from ..molparse.from_arrays import from_arrays
from ..molparse.from_schema import from_schema
from ..molparse.from_string import from_string
from ..molparse.to_schema import to_schema
from ..molparse.to_string import to_string
from ..periodic_table import periodictable
from ..physical_constants import constants
from ..testing import compare, compare_values
from ..util import deserialize, measure_coordinates, msgpackext_loads, provenance_stamp, which_import
from .basemodels import ProtoModel, qcschema_draft
from .common_models import Provenance, qcschema_molecule_default
from .types import Array

if TYPE_CHECKING:
    from pydantic.typing import ReprArgs

# Rounding quantities for hashing
GEOMETRY_NOISE = 8
MASS_NOISE = 6
CHARGE_NOISE = 4

_extension_map = {
    ".npy": "numpy",
    ".json": "json",
    ".xyz": "xyz",
    ".psimol": "psi4",
    ".psi4": "psi4",
    ".msgpack": "msgpack",
}


def float_prep(array, around):
    """
    Rounds floats to a common value and build positive zeros to prevent hash conflicts.
    """
    if isinstance(array, (list, np.ndarray)):
        # Round array
        array = np.around(array, around)
        # Flip zeros
        array[np.abs(array) < 5 ** (-(around + 1))] = 0

    elif isinstance(array, (float, int)):
        array = round(array, around)
        if array == -0.0:
            array = 0.0
    else:
        raise TypeError("Type '{}' not recognized".format(type(array).__name__))

    return array


class NonnegativeInt(ConstrainedInt):
    ge = 0


class BondOrderFloat(ConstrainedFloat):
    ge = 0
    le = 5


class Identifiers(ProtoModel):
    """Canonical chemical identifiers"""

    molecule_hash: Optional[str] = None
    molecular_formula: Optional[str] = None
    smiles: Optional[str] = None
    inchi: Optional[str] = None
    inchikey: Optional[str] = None
    canonical_explicit_hydrogen_smiles: Optional[str] = None
    canonical_isomeric_explicit_hydrogen_mapped_smiles: Optional[str] = None
    canonical_isomeric_explicit_hydrogen_smiles: Optional[str] = None
    canonical_isomeric_smiles: Optional[str] = None
    canonical_smiles: Optional[str] = None
    pubchem_cid: Optional[str] = Field(None, description="PubChem Compound ID")
    pubchem_sid: Optional[str] = Field(None, description="PubChem Substance ID")
    pubchem_conformerid: Optional[str] = Field(None, description="PubChem Conformer ID")

    class Config(ProtoModel.Config):
        serialize_skip_defaults = True


[docs]class Molecule(ProtoModel): """ The physical Cartesian representation of the molecular system. A QCSchema representation of a Molecule. This model contains data for symbols, geometry, connectivity, charges, fragmentation, etc while also supporting a wide array of I/O and manipulation capabilities. Molecule objects geometry, masses, and charges are truncated to 8, 6, and 4 decimal places respectively to assist with duplicate detection. Notes ----- All arrays are stored flat but must be reshapable into the dimensions in attribute ``shape``, with abbreviations as follows: nat: number of atomic = calcinfo_natom nfr: number of fragments <varies>: irregular dimension not systematically reshapable """ schema_name: constr(strip_whitespace=True, regex="^(qcschema_molecule)$") = Field( # type: ignore qcschema_molecule_default, description=( f"The QCSchema specification to which this model conforms. Explicitly fixed as {qcschema_molecule_default}." ), ) schema_version: int = Field( # type: ignore 2, description="The version number of ``schema_name`` to which this model conforms." ) validated: bool = Field( # type: ignore False, description="A boolean indicator (for speed purposes) that the input Molecule data has been previously checked " "for schema (data layout and type) and physics (e.g., non-overlapping atoms, feasible " "multiplicity) compliance. This should be False in most cases. A ``True`` setting " "should only ever be set by the constructor for this class itself or other trusted sources such as " "a Fractal Server or previously serialized Molecules.", ) # Required data symbols: Array[str] = Field( # type: ignore ..., description="The ordered array of atomic elemental symbols in title case. This field's index " "sets atomic order for all other per-atom fields like ``real`` and the first dimension of " "``geometry``. Ghost/virtual atoms must have an entry here in ``symbols``; ghostedness is " "indicated through the ``real`` field.", shape=["nat"], ) geometry: Array[float] = Field( # type: ignore ..., description="The ordered array for Cartesian XYZ atomic coordinates [a0]. " "Atom ordering is fixed; that is, a consumer who shuffles atoms must not reattach the input " "(pre-shuffling) molecule schema instance to any output (post-shuffling) per-atom results " "(e.g., gradient). Index of the first dimension matches the 0-indexed indices of all other " "per-atom settings like ``symbols`` and ``real``." "\n" "Serialized storage is always flat, (3*nat,), but QCSchema implementations will want to reshape it. " "QCElemental can also accept array-likes which can be mapped to (nat,3) such as a 1-D list of length 3*nat, " "or the serialized version of the array in (3*nat,) shape; all forms will be reshaped to " "(nat,3) for this attribute.", shape=["nat", 3], units="a0", ) # Molecule data name: Optional[str] = Field( # type: ignore None, description="Common or human-readable name to assign to this molecule. This field can be arbitrary; see ``identifiers`` for well-defined labels.", ) identifiers: Optional[Identifiers] = Field( # type: ignore None, description="An optional dictionary of additional identifiers by which this molecule can be referenced, " "such as INCHI, canonical SMILES, etc. See the :class:``Identifiers`` model for more details.", ) comment: Optional[str] = Field( # type: ignore None, description="Additional comments for this molecule. Intended for pure human/user consumption and clarity.", ) molecular_charge: float = Field(0.0, description="The net electrostatic charge of the molecule.") # type: ignore molecular_multiplicity: int = Field(1, description="The total multiplicity of the molecule.") # type: ignore # Atom data masses_: Optional[Array[float]] = Field( # type: ignore None, description="The ordered array of atomic masses. Index order " "matches the 0-indexed indices of all other per-atom fields like ``symbols`` and ``real``. If " "this is not provided, the mass of each atom is inferred from its most common isotope. If this " "is provided, it must be the same length as ``symbols`` but can accept ``None`` entries for " "standard masses to infer from the same index in the ``symbols`` field.", shape=["nat"], units="u", ) real_: Optional[Array[bool]] = Field( # type: ignore None, description="The ordered array indicating if each atom is real (``True``) or " "ghost/virtual (``False``). Index " "matches the 0-indexed indices of all other per-atom settings like ``symbols`` and the first " "dimension of ``geometry``. If this is not provided, all atoms are assumed to be real (``True``)." "If this is provided, the reality or ghostedness of every atom must be specified.", shape=["nat"], ) atom_labels_: Optional[Array[str]] = Field( # type: ignore None, description="Additional per-atom labels as an array of strings. Typical use is in " "model conversions, such as Elemental <-> Molpro and not typically something which should be user " "assigned. See the ``comments`` field for general human-consumable text to affix to the molecule.", shape=["nat"], ) atomic_numbers_: Optional[Array[np.int16]] = Field( # type: ignore None, description="An optional ordered 1-D array-like object of atomic numbers of shape (nat,). Index " "matches the 0-indexed indices of all other per-atom settings like ``symbols`` and ``real``. " "Values are inferred from the ``symbols`` list if not explicitly set. " "Ghostedness should be indicated through ``real`` field, not zeros here.", shape=["nat"], ) mass_numbers_: Optional[Array[np.int16]] = Field( # type: ignore None, description="An optional ordered 1-D array-like object of atomic *mass* numbers of shape (nat). Index " "matches the 0-indexed indices of all other per-atom settings like ``symbols`` and ``real``. " "Values are inferred from the most common isotopes of the ``symbols`` list if not explicitly set. " "If single isotope not (yet) known for an atom, -1 is placeholder.", shape=["nat"], ) # Fragment and connection data connectivity_: Optional[List[Tuple[NonnegativeInt, NonnegativeInt, BondOrderFloat]]] = Field( # type: ignore None, description="A list of bonds within the molecule. Each entry is a tuple " "of ``(atom_index_A, atom_index_B, bond_order)`` where the ``atom_index`` " "matches the 0-indexed indices of all other per-atom settings like ``symbols`` and ``real``. " "Bonds may be freely reordered and inverted.", min_items=1, ) fragments_: Optional[List[Array[np.int32]]] = Field( # type: ignore None, description="List of indices grouping atoms (0-indexed) into molecular fragments within the molecule. " "Each entry in the outer list is a new fragment; index matches the ordering in ``fragment_charges`` and " "``fragment_multiplicities``. Inner lists are 0-indexed atoms which compose the fragment; every atom must " "be in exactly one inner list. Noncontiguous fragments are allowed, though no QM program is known to support them. " "Fragment ordering is fixed; that is, a consumer who shuffles fragments must not reattach the input " "(pre-shuffling) molecule schema instance to any output (post-shuffling) per-fragment results (e.g., n-body energy arrays).", shape=["nfr", "<varies>"], ) fragment_charges_: Optional[List[float]] = Field( # type: ignore None, description="The total charge of each fragment in the ``fragments`` list. The index of this " "list matches the 0-index indices of ``fragments`` list. Will be filled in based on a set of rules " "if not provided (and ``fragments`` are specified).", shape=["nfr"], ) fragment_multiplicities_: Optional[List[int]] = Field( # type: ignore None, description="The multiplicity of each fragment in the ``fragments`` list. The index of this " "list matches the 0-index indices of ``fragments`` list. Will be filled in based on a set of " "rules if not provided (and ``fragments`` are specified).", shape=["nfr"], ) # Orientation fix_com: bool = Field( # type: ignore False, description="Whether translation of geometry is allowed (fix F) or disallowed (fix T)." "When False, QCElemental will pre-process the Molecule object to translate the center of mass " "to (0,0,0) in Euclidean coordinate space, resulting in a different ``geometry`` than the " "one provided. " "guidance: A consumer who translates the geometry must not reattach the input (pre-translation) molecule schema instance to any output (post-translation) origin-sensitive results (e.g., an ordinary energy when EFP present).", ) fix_orientation: bool = Field( # type: ignore False, description="Whether rotation of geometry is allowed (fix F) or disallowed (fix T). " "When False, QCElemental will pre-process the Molecule object to orient via the intertial tensor, " "resulting in a different ``geometry`` than the one provided. " "guidance: A consumer who rotates the geometry must not reattach the input (pre-rotation) molecule schema instance to any output (post-rotation) frame-sensitive results (e.g., molecular vibrations).", ) fix_symmetry: Optional[str] = Field( # type: ignore None, description="Maximal point group symmetry which ``geometry`` should be treated. Lowercase." ) # Extra provenance: Provenance = Field( default_factory=partial(provenance_stamp, __name__), description="The provenance information about how this Molecule (and its attributes) were generated, " "provided, and manipulated.", ) id: Optional[Any] = Field( # type: ignore None, description="A unique identifier for this Molecule object. This field exists primarily for Databases " "(e.g. Fractal's Server) to track and lookup this specific object and should virtually " "never need to be manually set.", ) extras: Dict[str, Any] = Field( # type: ignore None, description="Additional information to bundle with the molecule. Use for schema development and scratch space.", ) class Config(ProtoModel.Config): serialize_skip_defaults = True repr_style = lambda self: [ ("name", self.name), ("formula", self.get_molecular_formula()), ("hash", self.get_hash()[:7]), ] fields = { "masses_": "masses", "real_": "real", "atom_labels_": "atom_labels", "atomic_numbers_": "atomic_numbers", "mass_numbers_": "mass_numbers", "connectivity_": "connectivity", "fragments_": "fragments", "fragment_charges_": "fragment_charges", "fragment_multiplicities_": "fragment_multiplicities", } def schema_extra(schema, model): # below addresses the draft-04 issue until https://github.com/samuelcolvin/pydantic/issues/1478 . schema["$schema"] = qcschema_draft def __init__(self, orient: bool = False, validate: Optional[bool] = None, **kwargs: Any) -> None: """Initializes the molecule object from dictionary-like values. Parameters ---------- orient : bool, optional If True, orientates the Molecule to a common reference frame. validate : Optional[bool], optional If ``None`` validation is always applied unless the ``validated`` flag is set. Otherwise uses the boolean to decide to validate the Molecule or not. **kwargs : Any The values of the Molecule object attributes. """ if validate is None: validate = not kwargs.get("validated", False) geometry_prep = kwargs.pop("_geometry_prep", False) if validate: kwargs["schema_name"] = kwargs.pop("schema_name", "qcschema_molecule") kwargs["schema_version"] = kwargs.pop("schema_version", 2) # original_keys = set(kwargs.keys()) # revive when ready to revisit sparsity nonphysical = kwargs.pop("nonphysical", False) schema = to_schema( from_schema(kwargs, nonphysical=nonphysical), dtype=kwargs["schema_version"], copy=False, np_out=True ) schema = _filter_defaults(schema) kwargs["validated"] = True kwargs = {**kwargs, **schema} # Allow any extra fields validate = True super().__init__(**kwargs) # We are pulling out the values *explicitly* so that the pydantic skip_defaults works as expected # All attributes set below are equivalent to the default set. values = self.__dict__ if validate: values["symbols"] = np.core.defchararray.title(self.symbols) # Title case for consistency if orient: values["geometry"] = float_prep(self._orient_molecule_internal(), GEOMETRY_NOISE) elif validate or geometry_prep: values["geometry"] = float_prep(values["geometry"], GEOMETRY_NOISE) @validator("geometry") def _must_be_3n(cls, v, values, **kwargs): n = len(values["symbols"]) try: v = v.reshape(n, 3) except (ValueError, AttributeError): raise ValueError("Geometry must be castable to shape (N,3)!") return v @validator("masses_", "real_") def _must_be_n(cls, v, values, **kwargs): n = len(values["symbols"]) if len(v) != n: raise ValueError("Masses and Real must be same number of entries as Symbols") return v @validator("real_") def _populate_real(cls, v, values, **kwargs): # Can't use geometry here since its already been validated and not in values n = len(values["symbols"]) if len(v) == 0: v = np.array([True for _ in range(n)]) return v @validator("fragment_charges_", "fragment_multiplicities_") def _must_be_n_frag(cls, v, values, **kwargs): if "fragments_" in values and values["fragments_"] is not None: n = len(values["fragments_"]) if len(v) != n: raise ValueError( "Fragment Charges and Fragment Multiplicities must be same number of entries as Fragments" ) return v @property def hash_fields(self): return [ "symbols", "masses", "molecular_charge", "molecular_multiplicity", "real", "geometry", "fragments", "fragment_charges", "fragment_multiplicities", "connectivity", ] @property def masses(self) -> Array[float]: masses = self.__dict__.get("masses_") if masses is None: masses = np.array([periodictable.to_mass(x) for x in self.symbols]) return masses @property def real(self) -> Array[bool]: real = self.__dict__.get("real_") if real is None: real = np.array([True for x in self.symbols]) return real @property def atom_labels(self) -> Array[str]: atom_labels = self.__dict__.get("atom_labels_") if atom_labels is None: atom_labels = np.array(["" for x in self.symbols]) return atom_labels @property def atomic_numbers(self) -> Array[np.int16]: atomic_numbers = self.__dict__.get("atomic_numbers_") if atomic_numbers is None: atomic_numbers = np.array([periodictable.to_Z(x) for x in self.symbols]) return atomic_numbers @property def mass_numbers(self) -> Array[np.int16]: mass_numbers = self.__dict__.get("mass_numbers_") if mass_numbers is None: mass_numbers = np.array([periodictable.to_A(x) for x in self.symbols]) return mass_numbers @property def connectivity(self) -> List[Tuple[int, int, float]]: connectivity = self.__dict__.get("connectivity_") # default is None, not [] return connectivity @property def fragments(self) -> List[Array[np.int32]]: fragments = self.__dict__.get("fragments_") if fragments is None: fragments = [np.arange(len(self.symbols), dtype=np.int32)] return fragments @property def fragment_charges(self) -> List[float]: fragment_charges = self.__dict__.get("fragment_charges_") if fragment_charges is None: fragment_charges = [self.molecular_charge] return fragment_charges @property def fragment_multiplicities(self) -> List[int]: fragment_multiplicities = self.__dict__.get("fragment_multiplicities_") if fragment_multiplicities is None: fragment_multiplicities = [self.molecular_multiplicity] return fragment_multiplicities ### Non-Pydantic API functions def show(self, ngl_kwargs: Optional[Dict[str, Any]] = None) -> "nglview.NGLWidget": # type: ignore """Creates a 3D representation of a moleucle that can be manipulated in Jupyter Notebooks and exported as images (`.png`). Parameters ---------- ngl_kwargs : Optional[Dict[str, Any]], optional Addition nglview NGLWidget kwargs Returns ------- nglview.NGLWidget A nglview view of the molecule """ if not which_import("nglview", return_bool=True): raise ModuleNotFoundError( f"Python module nglwview not found. Solve by installing it: `conda install -c conda-forge nglview`" ) # pragma: no cover import nglview as nv # type: ignore if ngl_kwargs is None: ngl_kwargs = {} structure = nv.TextStructure(self.to_string("nglview-sdf"), ext="sdf") widget = nv.NGLWidget(structure, **ngl_kwargs) return widget def measure( self, measurements: Union[List[int], List[List[int]]], *, degrees: bool = True ) -> Union[float, List[float]]: """ Takes a measurement of the moleucle from the indicies provided. Parameters ---------- measurements : Union[List[int], List[List[int]]] Either a single list of indices or multiple. Return a distance, angle, or dihedral depending if 2, 3, or 4 indices is provided, respectively. Values are returned in Bohr (distance) or degree. degrees : bool, optional Returns degrees by default, radians otherwise. Returns ------- Union[float, List[float]] Either a value or list of the measured values. """ return measure_coordinates(self.geometry, measurements, degrees=degrees) def orient_molecule(self): """ Centers the molecule and orients via inertia tensor before returning a new Molecule """ return Molecule(orient=True, **self.dict()) def compare(self, other): warnings.warn( "Molecule.compare is deprecated and will be removed in v0.13.0. Use == instead.", DeprecationWarning ) return self == other def __eq__(self, other): """ Checks if two molecules are identical. This is a molecular identity defined by scientific terms, and not programing terms, so it's less rigorous than a programmatic equality or a memory equivalent `is`. """ if isinstance(other, dict): other = Molecule(orient=False, **other) elif isinstance(other, Molecule): pass else: raise TypeError("Comparison molecule not understood of type '{}'.".format(type(other))) return self.get_hash() == other.get_hash() def dict(self, *args, **kwargs): kwargs["by_alias"] = True kwargs["exclude_unset"] = True return super().dict(*args, **kwargs) def pretty_print(self): """Print the molecule in Angstroms. Same as :py:func:`print_out` only always in Angstroms. (method name in libmints is print_in_angstrom) """ text = "" text += """ Geometry (in {0:s}), charge = {1:.1f}, multiplicity = {2:d}:\n\n""".format( "Angstrom", self.molecular_charge, self.molecular_multiplicity ) text += """ Center X Y Z \n""" text += """ ------------ ----------------- ----------------- -----------------\n""" for i in range(len(self.geometry)): text += """ {0:8s}{1:4s} """.format(self.symbols[i], "" if self.real[i] else "(Gh)") for j in range(3): text += """ {0:17.12f}""".format( self.geometry[i][j] * constants.conversion_factor("bohr", "angstroms") ) text += "\n" # text += "\n" return text def get_fragment( self, real: Union[int, List], ghost: Optional[Union[int, List]] = None, orient: bool = False, group_fragments: bool = True, ) -> "Molecule": """Get new Molecule with fragments preserved, dropped, or ghosted. Parameters ---------- real Fragment index or list of indices (0-indexed) to be real atoms in new Molecule. ghost Fragment index or list of indices (0-indexed) to be ghost atoms (basis fns only) in new Molecule. orient Whether or not to align (inertial frame) and phase geometry upon new Molecule instantiation (according to _orient_molecule_internal)? group_fragments Whether or not to group real fragments at the start of the atom list and ghost fragments toward the back. Previous to ``v0.5``, this was always effectively True. True is handy for finding duplicate (atom-order-independent) molecules by hash. False preserves fragment order (though collapsing gaps for absent fragments) like Psi4's ``extract_subsets``. False is handy for gradients where atom order of returned values matters. Returns ------- mol New ``py::class:qcelemental.model.Molecule`` with ``self``\'s fragments present, ghosted, or absent. """ if isinstance(real, int): real = [real] if isinstance(ghost, int): ghost = [ghost] elif ghost is None: ghost = [] constructor_dict: Dict = {} ret_name = (self.name if self.name is not None else "") + " (" + str(real) + "," + str(ghost) + ")" constructor_dict["name"] = ret_name # ret = Molecule(None, name=ret_name) if len(set(real) & set(ghost)): raise TypeError( "Molecule:get_fragment: real and ghost sets are overlapping! ({0}, {1}).".format(str(real), str(ghost)) ) geom_blocks = [] symbols = [] masses = [] real_atoms = [] fragments = [] fragment_charges = [] fragment_multiplicities = [] atom_size = 0 if group_fragments: # Loop through the real blocks frag_start = 0 for frag in real: frag_size = len(self.fragments[frag]) geom_blocks.append(self.geometry[self.fragments[frag]]) for idx in self.fragments[frag]: symbols.append(self.symbols[idx]) real_atoms.append(True) masses.append(self.masses[idx]) fragments.append(list(range(frag_start, frag_start + frag_size))) frag_start += frag_size fragment_charges.append(float(self.fragment_charges[frag])) fragment_multiplicities.append(self.fragment_multiplicities[frag]) # Set charge and multiplicity constructor_dict["molecular_charge"] = sum(fragment_charges) constructor_dict["molecular_multiplicity"] = sum(x - 1 for x in fragment_multiplicities) + 1 # Loop through the ghost blocks for frag in ghost: frag_size = len(self.fragments[frag]) geom_blocks.append(self.geometry[self.fragments[frag]]) for idx in self.fragments[frag]: symbols.append(self.symbols[idx]) real_atoms.append(False) masses.append(self.masses[idx]) fragments.append(list(range(frag_start, frag_start + frag_size))) frag_start += frag_size fragment_charges.append(0) fragment_multiplicities.append(1) else: # List[Array[np.int32]] at2fr: List[Union[int, None]] = [None] * len(self.symbols) for ifr, fr in enumerate(self.fragments): for iat in fr: at2fr[iat] = ifr at2at: List[Union[int, None]] = [None] * len(self.symbols) for iat in range(len(self.symbols)): ifr = at2fr[iat] if ifr in real or ifr in ghost: geom_blocks.append(self.geometry[iat]) symbols.append(self.symbols[iat]) real_atoms.append(ifr in real) masses.append(self.masses[iat]) at2at[iat] = atom_size atom_size += 1 else: at2at[iat] = None for ifr, fr in enumerate(self.fragments): if ifr in real or ifr in ghost: fragments.append([at2at[iat] for iat in fr]) if ifr in real: fragment_charges.append(self.fragment_charges[ifr]) fragment_multiplicities.append(self.fragment_multiplicities[ifr]) elif ifr in ghost: fragment_charges.append(0) fragment_multiplicities.append(1) assert None not in fragments constructor_dict["fragments"] = fragments constructor_dict["fragment_charges"] = fragment_charges constructor_dict["fragment_multiplicities"] = fragment_multiplicities constructor_dict["symbols"] = symbols constructor_dict["geometry"] = np.vstack(geom_blocks) constructor_dict["real"] = real_atoms constructor_dict["masses"] = masses return Molecule(orient=orient, **constructor_dict) def to_string( # type: ignore self, dtype: str, units: str = None, *, atom_format: str = None, ghost_format: str = None, width: int = 17, prec: int = 12, return_data: bool = False, ): """Returns a string that can be used by a variety of programs. Unclear if this will be removed or renamed to "to_psi4_string" in the future Suggest psi4 --> psi4frag and psi4 route to to_string """ molrec = from_schema(self.dict(), nonphysical=True) return to_string( molrec, dtype=dtype, units=units, atom_format=atom_format, ghost_format=ghost_format, width=width, prec=prec, return_data=return_data, ) def get_hash(self): """ Returns the hash of the molecule. """ m = hashlib.sha1() concat = "" np.set_printoptions(precision=16) for field in self.hash_fields: data = getattr(self, field) if field == "geometry": data = float_prep(data, GEOMETRY_NOISE) elif field == "fragment_charges": data = float_prep(data, CHARGE_NOISE) elif field == "molecular_charge": data = float_prep(data, CHARGE_NOISE) elif field == "masses": data = float_prep(data, MASS_NOISE) concat += json.dumps(data, default=lambda x: x.ravel().tolist()) m.update(concat.encode("utf-8")) return m.hexdigest() def get_molecular_formula(self, order: str = "alphabetical") -> str: """ Returns the molecular formula for a molecule. Parameters ---------- order: str, optional Sorting order of the formula. Valid choices are "alphabetical" and "hill". Returns ------- str The molecular formula. Examples -------- >>> methane = qcelemental.models.Molecule(''' ... H 0.5288 0.1610 0.9359 ... C 0.0000 0.0000 0.0000 ... H 0.2051 0.8240 -0.6786 ... H 0.3345 -0.9314 -0.4496 ... H -1.0685 -0.0537 0.1921 ... ''') >>> methane.get_molecular_formula() CH4 >>> hcl = qcelemental.models.Molecule(''' ... H 0.0000 0.0000 0.0000 ... Cl 0.0000 0.0000 1.2000 ... ''') >>> hcl.get_molecular_formula() ClH """ from ..molutil import molecular_formula_from_symbols return molecular_formula_from_symbols(symbols=self.symbols, order=order) ### Constructors @classmethod def from_data( cls, data: Union[str, Dict[str, Any], np.array, bytes], dtype: Optional[str] = None, *, orient: bool = False, validate: bool = None, **kwargs: Dict[str, Any], ) -> "Molecule": """ Constructs a molecule object from a data structure. Parameters ---------- data : Union[str, Dict[str, Any], np.array] Data to construct Molecule from dtype : Optional[str], optional How to interpret the data, if not passed attempts to discover this based on input type. orient : bool, optional Orientates the molecule to a standard frame or not. validate : bool, optional Validates the molecule or not. **kwargs : Dict[str, Any] Additional kwargs to pass to the constructors. kwargs take precedence over data. Returns ------- Molecule A constructed molecule class. """ if dtype is None: if isinstance(data, str): dtype = "string" elif isinstance(data, np.ndarray): dtype = "numpy" elif isinstance(data, dict): dtype = "dict" elif isinstance(dtype, bytes): dtype = "msgpack" else: raise TypeError("Input type not understood, please supply the 'dtype' kwarg.") if dtype in ["string", "psi4", "xyz", "xyz+"]: mol_dict = from_string(data, dtype if dtype != "string" else None) assert isinstance(mol_dict, dict) input_dict = to_schema(mol_dict["qm"], dtype=2, np_out=True) input_dict = _filter_defaults(input_dict) input_dict["validated"] = True input_dict["_geometry_prep"] = True elif dtype == "numpy": data = np.asarray(data) data = { "geom": data[:, 1:], "elez": data[:, 0], "units": kwargs.pop("units", "Angstrom"), "fragment_separators": kwargs.pop("frags", []), } input_dict = to_schema(from_arrays(**data), dtype=2, np_out=True) input_dict = _filter_defaults(input_dict) input_dict["validated"] = True input_dict["_geometry_prep"] = True elif dtype == "msgpack": assert isinstance(data, bytes) input_dict = msgpackext_loads(data) elif dtype == "json": assert isinstance(data, str) input_dict = json.loads(data) elif dtype == "dict": assert isinstance(data, dict) input_dict = data else: raise KeyError("Dtype not understood '{}'.".format(dtype)) input_dict.update(kwargs) # if charge/spin options are given, invalidate charge and spin options that are missing charge_spin_opts = {"molecular_charge", "fragment_charges", "molecular_multiplicity", "fragment_multiplicities"} kwarg_keys = set(kwargs.keys()) if len(charge_spin_opts & kwarg_keys) > 0: for key in charge_spin_opts - kwarg_keys: input_dict.pop(key, None) input_dict.pop("validated", None) return cls(orient=orient, validate=validate, **input_dict) @classmethod def from_file(cls, filename: str, dtype: Optional[str] = None, *, orient: bool = False, **kwargs): """ Constructs a molecule object from a file. Parameters ---------- filename : str The filename to build dtype : Optional[str], optional The type of file to interpret. orient : bool, optional Orientates the molecule to a standard frame or not. **kwargs Any additional keywords to pass to the constructor Returns ------- Molecule A constructed molecule class. """ ext = Path(filename).suffix if dtype is None: if ext in _extension_map: dtype = _extension_map[ext] else: # Let `from_string` try to sort it dtype = "string" # Raw string type, read and pass through if dtype in ["string", "xyz", "xyz+", "psi4"]: with open(filename, "r") as infile: data = infile.read() elif dtype == "numpy": data = np.load(filename) elif dtype == "json": with open(filename, "r") as infile: data = json.load(infile) dtype = "dict" elif dtype == "msgpack": with open(filename, "rb") as infile_bytes: data = deserialize(infile_bytes.read(), encoding="msgpack-ext") dtype = "dict" else: raise KeyError("Dtype not understood '{}'.".format(dtype)) return cls.from_data(data, dtype, orient=orient, **kwargs) def to_file(self, filename: str, dtype: Optional[str] = None) -> None: """Writes the Molecule to a file. Parameters ---------- filename : str The filename to write to dtype : Optional[str], optional The type of file to write, attempts to infer dtype from the filename if not provided. """ ext = Path(filename).suffix if dtype is None: if ext in _extension_map: dtype = _extension_map[ext] else: raise KeyError(f"Could not infer dtype from filename: `{filename}`") flags = "w" if dtype in ["xyz", "xyz+", "psi4"]: stringified = self.to_string(dtype) elif dtype in ["json"]: stringified = self.serialize("json") elif dtype in ["msgpack", "msgpack-ext"]: stringified = self.serialize("msgpack-ext") flags = "wb" elif dtype in ["numpy"]: elements = np.array(self.atomic_numbers).reshape(-1, 1) npmol = np.hstack((elements, self.geometry * constants.conversion_factor("bohr", "angstroms"))) np.save(filename, npmol) return else: raise KeyError(f"Dtype `{dtype}` is not valid") with open(filename, flags) as handle: handle.write(stringified) ### Non-Pydantic internal functions def _orient_molecule_internal(self): """ Centers the molecule and orients via inertia tensor before returning a new set of the molecule geometry """ new_geometry = self.geometry.copy() # Ensure we get a copy # Get the mass as an array # Masses are needed for orientation np_mass = np.array(self.masses) # Center on Mass new_geometry -= np.average(new_geometry, axis=0, weights=np_mass) # Rotate into inertial frame tensor = self._inertial_tensor(new_geometry, weight=np_mass) _, evecs = np.linalg.eigh(tensor) new_geometry = np.dot(new_geometry, evecs) # Phases? Lets do the simplest thing and ensure the first atom in each column # that is not on a plane is positve phase_check = [False, False, False] geom_noise = 10 ** (-GEOMETRY_NOISE) for num in range(new_geometry.shape[0]): for x in range(3): if phase_check[x]: continue val = new_geometry[num, x] if abs(val) < geom_noise: continue phase_check[x] = True if val < 0: new_geometry[:, x] *= -1 if sum(phase_check) == 3: break return new_geometry def __repr_args__(self) -> "ReprArgs": return [("name", self.name), ("formula", self.get_molecular_formula()), ("hash", self.get_hash()[:7])] def _ipython_display_(self, **kwargs) -> None: try: self.show()._ipython_display_(**kwargs) except ModuleNotFoundError: from IPython.display import display display(f"Install nglview for interactive visualization.", f"{repr(self)}") @staticmethod def _inertial_tensor(geom, *, weight): """ Compute the moment inertia tensor for a given geometry. """ # Build inertia tensor tensor = np.zeros((3, 3)) # Diagonal tensor[0][0] = np.sum(weight * (geom[:, 1] ** 2.0 + geom[:, 2] ** 2.0)) tensor[1][1] = np.sum(weight * (geom[:, 0] ** 2.0 + geom[:, 2] ** 2.0)) tensor[2][2] = np.sum(weight * (geom[:, 0] ** 2.0 + geom[:, 1] ** 2.0)) # I(alpha, beta) # Off diagonal tensor[1][0] = tensor[0][1] = -1.0 * np.sum(weight * geom[:, 0] * geom[:, 1]) tensor[2][0] = tensor[0][2] = -1.0 * np.sum(weight * geom[:, 0] * geom[:, 2]) tensor[2][1] = tensor[1][2] = -1.0 * np.sum(weight * geom[:, 1] * geom[:, 2]) return tensor def nuclear_repulsion_energy(self, ifr: int = None) -> float: """Nuclear repulsion energy. Parameters ---------- ifr : int, optional If not `None`, only compute for the `ifr`-th (0-indexed) fragment. Returns ------- Nuclear repulsion energy in entire molecule or in fragment. """ Zeff = [z * int(real) for z, real in zip(cast(Iterable[int], self.atomic_numbers), self.real)] atoms = list(range(self.geometry.shape[0])) if ifr is not None: atoms = self.fragments[ifr] nre = 0.0 for iat1, at1 in enumerate(atoms): for at2 in atoms[:iat1]: dist = np.linalg.norm(self.geometry[at1] - self.geometry[at2]) nre += Zeff[at1] * Zeff[at2] / dist return nre def nelectrons(self, ifr: int = None) -> int: """Number of electrons. Parameters ---------- ifr : int, optional If not `None`, only compute for the `ifr`-th (0-indexed) fragment. Returns ------- Number of electrons in entire molecule or in fragment. """ Zeff = [z * int(real) for z, real in zip(cast(Iterable[int], self.atomic_numbers), self.real)] if ifr is None: nel = sum(Zeff) - self.molecular_charge else: nel = sum([zf for iat, zf in enumerate(Zeff) if iat in self.fragments[ifr]]) - self.fragment_charges[ifr] return int(nel) def align( self, ref_mol: "Molecule", *, do_plot: bool = False, verbose: int = 0, atoms_map: bool = False, run_resorting: bool = False, mols_align: Union[bool, float] = False, run_to_completion: bool = False, uno_cutoff: float = 1.0e-3, run_mirror: bool = False, ): """Finds shift, rotation, and atom reordering of `concern_mol` (self) that best aligns with `ref_mol`. Wraps :py:func:`qcel.molutil.B787` for :py:class:`qcel.models.Molecule`. Employs the Kabsch, Hungarian, and Uno algorithms to exhaustively locate the best alignment for non-oriented, non-ordered structures. Parameters ---------- ref_mol : qcel.models.Molecule Molecule to match. atoms_map : bool, optional Whether atom1 of `ref_mol` corresponds to atom1 of `concern_mol`, etc. If true, specifying `True` can save much time. mols_align : bool or float, optional Whether ref_mol and concern_mol have identical geometries (barring orientation or atom mapping) and expected final RMSD = 0. If `True`, procedure is truncated when RMSD condition met, saving time. If float, RMSD tolerance at which search for alignment stops. If provided, the alignment routine will throw an error if it fails to align the molecule within the specified RMSD tolerance. do_plot : bool, optional Pops up a mpl plot showing before, after, and ref geometries. run_to_completion : bool, optional Run reorderings to completion (past RMSD = 0) even if unnecessary because `mols_align=True`. Used to test worst-case timings. run_resorting : bool, optional Run the resorting machinery even if unnecessary because `atoms_map=True`. uno_cutoff : float, optional TODO run_mirror : bool, optional Run alternate geometries potentially allowing best match to `ref_mol` from mirror image of `concern_mol`. Only run if system confirmed to be nonsuperimposable upon mirror reflection. verbose : int, optional Print level. Returns ------- Molecule, data Molecule is internal geometry of `self` optimally aligned and atom-ordered to `ref_mol`. Presently all fragment information is discarded. `data['rmsd']` is RMSD [A] between `ref_mol` and the optimally aligned geometry computed. `data['mill']` is a AlignmentMill with fields (shift, rotation, atommap, mirror) that prescribe the transformation from `concern_mol` and the optimally aligned geometry. """ from ..molutil.align import B787 rgeom = np.array(ref_mol.geometry) runiq = np.asarray( [ hashlib.sha1((sym + str(mas)).encode("utf-8")).hexdigest() for sym, mas in zip(cast(Iterable[str], ref_mol.symbols), ref_mol.masses) ] ) concern_mol = self cgeom = np.array(concern_mol.geometry) cmass = np.array(concern_mol.masses) celem = np.array(concern_mol.symbols) celez = np.array(concern_mol.atomic_numbers) cuniq = np.asarray( [ hashlib.sha1((sym + str(mas)).encode("utf-8")).hexdigest() for sym, mas in zip(cast(Iterable[str], concern_mol.symbols), concern_mol.masses) ] ) rmsd, solution = B787( cgeom=cgeom, rgeom=rgeom, cuniq=cuniq, runiq=runiq, do_plot=do_plot, verbose=verbose, atoms_map=atoms_map, run_resorting=run_resorting, mols_align=mols_align, run_to_completion=run_to_completion, run_mirror=run_mirror, uno_cutoff=uno_cutoff, ) ageom, amass, aelem, aelez, _ = solution.align_system(cgeom, cmass, celem, celez, cuniq, reverse=False) adict = from_arrays( geom=ageom, mass=amass, elem=aelem, elez=aelez, units="Bohr", molecular_charge=concern_mol.molecular_charge, molecular_multiplicity=concern_mol.molecular_multiplicity, fix_com=True, fix_orientation=True, ) amol = Molecule(validate=False, **to_schema(adict, dtype=2)) # TODO -- can probably do more with fragments in amol now that # Mol is something with non-contig frags. frags now discarded. assert compare_values( concern_mol.nuclear_repulsion_energy(), amol.nuclear_repulsion_energy(), "Q: concern_mol-->returned_mol NRE uncorrupted", atol=1.0e-4, quiet=(verbose > 1), ) if mols_align: assert compare_values( ref_mol.nuclear_repulsion_energy(), amol.nuclear_repulsion_energy(), "Q: concern_mol-->returned_mol NRE matches ref_mol", atol=1.0e-4, quiet=(verbose > 1), ) assert compare( True, np.allclose(ref_mol.geometry, amol.geometry, atol=4), "Q: concern_mol-->returned_mol geometry matches ref_mol", quiet=(verbose > 1), ) return amol, {"rmsd": rmsd, "mill": solution} def scramble( self, *, do_shift: bool = True, do_rotate=True, do_resort=True, deflection=1.0, do_mirror=False, do_plot=False, do_test=False, run_to_completion=False, run_resorting=False, verbose=0, ): """Generate a Molecule with random or directed translation, rotation, and atom shuffling. Optionally, check that the aligner returns the opposite transformation. Parameters ---------- ref_mol : qcel.models.Molecule Molecule to perturb. do_shift : bool or array-like, optional Whether to generate a random atom shift on interval [-3, 3) in each dimension (`True`) or leave at current origin. To shift by a specified vector, supply a 3-element list. do_rotate : bool or array-like, optional Whether to generate a random 3D rotation according to algorithm of Arvo. To rotate by a specified matrix, supply a 9-element list of lists. do_resort : bool or array-like, optional Whether to shuffle atoms (`True`) or leave 1st atom 1st, etc. (`False`). To specify shuffle, supply a nat-element list of indices. deflection : float, optional If `do_rotate`, how random a rotation: 0.0 is no change, 0.1 is small perturbation, 1.0 is completely random. do_mirror : bool, optional Whether to construct the mirror image structure by inverting y-axis. do_plot : bool, optional Pops up a mpl plot showing before, after, and ref geometries. do_test : bool, optional Additionally, run the aligner on the returned Molecule and check that opposite transformations obtained. run_to_completion : bool, optional By construction, scrambled systems are fully alignable (final RMSD=0). Even so, `True` turns off the mechanism to stop when RMSD reaches zero and instead proceed to worst possible time. run_resorting : bool, optional Even if atoms not shuffled, test the resorting machinery. verbose : int, optional Print level. Returns ------- Molecule, data Molecule is scrambled copy of `ref_mol` (self). `data['rmsd']` is RMSD [A] between `ref_mol` and the scrambled geometry. `data['mill']` is a AlignmentMill with fields (shift, rotation, atommap, mirror) that prescribe the transformation from `ref_mol` to the returned geometry. Raises ------ AssertionError If `do_test=True` and aligner sanity check fails for any of the reverse transformations. """ from ..molutil.align import compute_scramble ref_mol = self rgeom = np.array(ref_mol.geometry) rmass = np.array(ref_mol.masses) relem = np.array(ref_mol.symbols) relez = np.array(ref_mol.atomic_numbers) runiq = np.asarray( [ hashlib.sha1((sym + str(mas)).encode("utf-8")).hexdigest() for sym, mas in zip(cast(Iterable[str], ref_mol.symbols), ref_mol.masses) ] ) nat = rgeom.shape[0] perturbation = compute_scramble( rgeom.shape[0], do_shift=do_shift, do_rotate=do_rotate, deflection=deflection, do_resort=do_resort, do_mirror=do_mirror, ) cgeom, cmass, celem, celez, cuniq = perturbation.align_system(rgeom, rmass, relem, relez, runiq, reverse=True) cmolrec = from_arrays( geom=cgeom, mass=cmass, elem=celem, elez=celez, units="Bohr", molecular_charge=ref_mol.molecular_charge, molecular_multiplicity=ref_mol.molecular_multiplicity, # copying fix_* vals rather than outright True. neither way great fix_com=ref_mol.fix_com, fix_orientation=ref_mol.fix_orientation, ) cmol = Molecule(validate=False, **to_schema(cmolrec, dtype=2)) rmsd = np.linalg.norm(cgeom - rgeom) * constants.bohr2angstroms / np.sqrt(nat) if verbose >= 1: print("Start RMSD = {:8.4f} [A]".format(rmsd)) if do_test: _, data = cmol.align( ref_mol, do_plot=do_plot, atoms_map=(not do_resort), run_resorting=run_resorting, mols_align=True, run_to_completion=run_to_completion, run_mirror=do_mirror, verbose=verbose, ) solution = data["mill"] assert compare( True, np.allclose(solution.shift, perturbation.shift, atol=6), "shifts equiv", quiet=(verbose > 1) ) if not do_resort: assert compare( True, np.allclose(solution.rotation.T, perturbation.rotation), "rotations transpose", quiet=(verbose > 1), ) if solution.mirror: assert compare(True, do_mirror, "mirror allowed", quiet=(verbose > 1)) return cmol, {"rmsd": rmsd, "mill": perturbation}
def _filter_defaults(dicary): nat = len(dicary["symbols"]) default_mass = np.array([periodictable.to_mass(e) for e in dicary["symbols"]]) dicary.pop("atomic_numbers") if np.allclose(default_mass, dicary["masses"]): dicary.pop("mass_numbers") dicary.pop("masses") if all(dicary["real"]): dicary.pop("real") if dicary["atom_labels"].tolist() == nat * [""]: dicary.pop("atom_labels") if dicary.get("connectivity", "N/A") is None: dicary.pop("connectivity") if dicary["fragments"] == [list(np.arange(nat))]: dicary.pop("fragments") dicary.pop("fragment_charges") dicary.pop("fragment_multiplicities") return dicary # auto_gen_docs_on_demand(Molecule)