Source code for structure_data_extractor

"""
Module: structure_data_extractor.py

Parallel extraction of raw CSD data into an HDF5 container.

This module defines:
- `_extract_one`: Retrieve and package raw crystal, molecule, contact, and hydrogen-bond data for a single refcode.
- `StructureDataExtractor`: Manage parallel batch extraction of multiple refcodes into an HDF5 file.

Dependencies
------------
numpy
h5py
ccdc
hdf5_utils
"""
import logging
import os
from pathlib import Path
from typing import List, Dict, Any, Union, Tuple

import numpy as np
import h5py
from ccdc import io
from concurrent.futures import ProcessPoolExecutor, as_completed

from hdf5_utils import initialize_hdf5_file

logger = logging.getLogger(__name__)

[docs] def _extract_one( refcode: str, filters: Dict[str, Any], center_molecule: bool ) -> Tuple[str, Dict[str, Any]]: """ Retrieve raw fields for one CSD refcode. Parameters ---------- refcode : str CSD identifier (e.g., "ABCDEF12"). filters : Dict[str, Any] ExtractionConfig.filters may include: - 'target_z_prime_values' - 'target_space_groups' - 'center_molecule' center_molecule : bool If True, recenter the crystal before extracting coordinates. Returns ------- Tuple[str, Dict[str, Any]] refcode : str Same as input. raw_data : Dict[str, Any] Contains: - 'crystal_data': dict of crystal properties (identifier, space_group, z_value, z_prime, cell_lengths, cell_angles, cell_volume, cell_density, packing_coefficient, symmetry_operators). - 'molecule_data': dict with: - 'atoms': mapping label → dict of atomic_symbol, coordinates, fractional_coordinates, atomic_weight, atomic_number, partial_charge, neighbour_labels, sybyl_type. - 'bonds': mapping id → dict of atom1, atom2, bond_type, is_cyclic, is_rotatable, length. - 'intermolecular_contacts', 'intramolecular_contacts', 'intermolecular_hbonds', 'intramolecular_hbonds': dicts of corresponding data (if any). Raises ------ Exception If any CCDC call fails (entry not found, missing coordinates, etc.). """ mode = filters.get("structure_list", ["csd-unique"])[0].lower() if mode == "cif": # refcode here is actually the filename (e.g. "foo.cif") cif_dir = Path(filters["structure_list"][1]) cif_path = cif_dir / refcode # read crystal + molecule from CIF crystal = io.CrystalReader(str(cif_path))[0] molecule = io.MoleculeReader(str(cif_path))[0] else: # default CSD behavior reader = io.EntryReader("CSD") entry = reader.entry(refcode) crystal = entry.crystal molecule = entry.molecule if center_molecule: crystal.centre_molecule() molecule.assign_bond_types() molecule.add_hydrogens(mode='missing', add_sites=True) molecule.assign_partial_charges() if not molecule.all_atoms_have_sites: return None # --- Crystal-level data --- crystal_data = { "identifier": crystal.identifier, "space_group": crystal.spacegroup_symbol, "z_value": crystal.z_value, "z_prime": crystal.z_prime, "cell_lengths": list(crystal.cell_lengths), "cell_angles": list(crystal.cell_angles), "cell_volume": crystal.cell_volume, "cell_density": crystal.calculated_density, "packing_coefficient": crystal.packing_coefficient, "symmetry_operators": crystal.symmetry_operators } # --- Molecule-level data --- atoms_dict: Dict[str, Any] = {} for atom in molecule.atoms: atoms_dict[atom.label] = { "atomic_symbol": atom.atomic_symbol, "coordinates": [atom.coordinates.x, atom.coordinates.y, atom.coordinates.z], "fractional_coordinates": [ atom.fractional_coordinates.x, atom.fractional_coordinates.y, atom.fractional_coordinates.z ], "atomic_weight": atom.atomic_weight, "atomic_number": atom.atomic_number, "partial_charge": atom.partial_charge, "neighbour_labels": [nbr.label for nbr in atom.neighbours], "sybyl_type": atom.sybyl_type } bonds_dict: Dict[str, Any] = {} for bond in molecule.bonds: key = f"{bond.atoms[0].label}-{bond.atoms[1].label}" bonds_dict[key] = { "atom1": bond.atoms[0].label, "atom2": bond.atoms[1].label, "bond_type": str(bond.bond_type), "is_cyclic": bond.is_cyclic, "is_rotatable": bond.is_rotatable, "length": bond.length } molecule_data = { "atoms": atoms_dict, "bonds": bonds_dict } # --- Contacts --- inter_contact_data: Dict[str, Any] = {} intra_contact_data: Dict[str, Any] = {} for contact in crystal.contacts(intermolecular="Any", distance_range=(-3.0, 0.6)): cid = f"{contact.atoms[0].label}-{contact.atoms[1].label}" c_dict = { "central_atom": contact.atoms[0].label, "central_atom_coordinates": [ contact.atoms[0].coordinates.x, contact.atoms[0].coordinates.y, contact.atoms[0].coordinates.z ], "central_atom_fractional_coordinates": [ contact.atoms[0].fractional_coordinates.x, contact.atoms[0].fractional_coordinates.y, contact.atoms[0].fractional_coordinates.z ], "contact_atom": contact.atoms[1].label, "contact_atom_coordinates": [ contact.atoms[1].coordinates.x, contact.atoms[1].coordinates.y, contact.atoms[1].coordinates.z ], "contact_atom_fractional_coordinates": [ contact.atoms[1].fractional_coordinates.x, contact.atoms[1].fractional_coordinates.y, contact.atoms[1].fractional_coordinates.z ], "length": contact.length, "strength": contact.strength, "is_intermolecular": contact.intermolecular, "is_in_line_of_sight": contact.is_in_line_of_sight, "symmetry_operator": contact.symmetry_operators[1] } if contact.intermolecular: inter_contact_data[cid] = c_dict else: intra_contact_data[cid] = c_dict # --- Hydrogen bonds --- inter_hbond_data: Dict[str, Any] = {} intra_hbond_data: Dict[str, Any] = {} for hbond in crystal.hbonds(intermolecular="Any", distance_range=(-3.0, 0.6)): hid = f"{hbond.atoms[0].label}-{hbond.atoms[1].label}-{hbond.atoms[2].label}" hb_dict = { "central_atom": hbond.atoms[0].label, "central_atom_coordinates": [ hbond.atoms[0].coordinates.x, hbond.atoms[0].coordinates.y, hbond.atoms[0].coordinates.z ], "central_atom_fractional_coordinates": [ hbond.atoms[0].fractional_coordinates.x, hbond.atoms[0].fractional_coordinates.y, hbond.atoms[0].fractional_coordinates.z ], "hydrogen_atom": hbond.atoms[1].label, "hydrogen_atom_coordinates": [ hbond.atoms[1].coordinates.x, hbond.atoms[1].coordinates.y, hbond.atoms[1].coordinates.z ], "hydrogen_atom_fractional_coordinates": [ hbond.atoms[1].fractional_coordinates.x, hbond.atoms[1].fractional_coordinates.y, hbond.atoms[1].fractional_coordinates.z ], "contact_atom": hbond.atoms[2].label, "contact_atom_coordinates": [ hbond.atoms[2].coordinates.x, hbond.atoms[2].coordinates.y, hbond.atoms[2].coordinates.z ], "contact_atom_fractional_coordinates": [ hbond.atoms[2].fractional_coordinates.x, hbond.atoms[2].fractional_coordinates.y, hbond.atoms[2].fractional_coordinates.z ], "length": hbond.da_distance, "angle": hbond.angle, "is_intermolecular": hbond.intermolecular, "is_in_line_of_sight": hbond.is_in_line_of_sight, "symmetry_operator": hbond.symmetry_operators[2] } if hbond.intermolecular: inter_hbond_data[hid] = hb_dict else: intra_hbond_data[hid] = hb_dict raw_data: Dict[str, Any] = { "crystal_data": crystal_data, "molecule_data": molecule_data, "intermolecular_contacts": inter_contact_data, "intermolecular_hbonds": inter_hbond_data, "intramolecular_contacts": intra_contact_data, "intramolecular_hbonds": intra_hbond_data } return refcode, raw_data
[docs] class StructureDataExtractor: """ Manage parallel extraction of raw CSD data into an HDF5 container. This class: - Loads a list of refcodes from CSV (clustered or unique). - Initializes or overwrites an HDF5 file with a '/structures' group. - Batches refcodes and invokes `_extract_one()` in parallel. - Writes each structure's raw fields into its own HDF5 subgroup. Attributes ---------- hdf5_path : Path File path for the output HDF5 container. filters : Dict[str, Any] ExtractionConfig.filters dictionary (e.g., data_directory, data_prefix, center_molecule, etc.). batch_size : int Number of refcodes to process concurrently per batch. reader : io.EntryReader CCDC EntryReader instance used by `_extract_one`. Methods ------- run : () -> None Execute the full extraction pipeline: overwrite HDF5, initialize, load refcode list, and process each batch. _load_refcodes : () -> List[str] Read the CSV of refcodes to extract. _process_batch : (batch: List[str], h5: h5py.File) -> None Extract raw data for each batch of refcodes and write to HDF5. """
[docs] def __init__( self, hdf5_path: Union[str, Path], filters: Dict[str, Any], batch_size: int ): """ Initialize a StructureDataExtractor. Parameters ---------- hdf5_path : Union[str, Path] Path for the HDF5 file to create or overwrite. filters : Dict[str, Any] ExtractionConfig.filters, containing: - 'data_directory' - 'data_prefix' - 'center_molecule' and other keys. batch_size : int Number of structures to extract concurrently per batch. """ self.hdf5_path = Path(hdf5_path) self.filters = filters self.batch_size = batch_size self.reader = io.EntryReader("CSD")
[docs] def run(self) -> None: """ Perform the full raw-data extraction for all refcodes into HDF5. This method: - Deletes the existing HDF5 file at `hdf5_path` if it exists. - Calls `initialize_hdf5_file` to create the '/structures' group. - Loads all refcodes via `_load_refcodes()` and writes the '/refcode_list' dataset. - Processes refcodes in batches of size `batch_size`: - Submits each refcode to `ProcessPoolExecutor` running `_extract_one()`. - Collects `(refcode, raw_data)` tuples. - Writes raw fields under `/structures/<refcode>/...` as typed datasets. - Closes the HDF5 file. Raises ------ FileNotFoundError If the refcode list CSV is missing. Exception If any CCDC call or HDF5 write operation fails. """ # Overwrite existing file if self.hdf5_path.exists(): logger.info(f"Overwriting existing HDF5 file: {self.hdf5_path}") self.hdf5_path.unlink() # Initialize file and group h5 = initialize_hdf5_file( self.hdf5_path, compression="gzip", chunk_size=self.batch_size ) # Load and store refcode list refcodes = self._load_refcodes() dt = h5py.string_dtype(encoding="utf-8") h5.create_dataset( "refcode_list", data=np.array(refcodes, dtype=object), dtype=dt ) total = len(refcodes) logger.info(f"{total} structures to extract (batch size {self.batch_size})") # Process in batches for start in range(0, total, self.batch_size): batch = refcodes[start : start + self.batch_size] logger.info( f"Extracting batch {start//self.batch_size + 1} (size {len(batch)})" ) self._process_batch(batch, h5) h5.close() logger.info("Raw data extraction complete; HDF5 file closed.")
[docs] def _load_refcodes(self) -> List[str]: """ Read the list of refcodes to extract from a CSV file. The CSV filename is chosen based on `filters['structure_list'][0]`: - "csd-unique" → "{data_prefix}_refcode_families_unique.csv" - otherwise → "{data_prefix}_refcode_families_clustered.csv" If filters["structure_list"][0] == "cif", returns all .cif filenames from the directory in filters["structure_list"][1]. Returns ------- List[str] Refcode strings to process. Raises ------ FileNotFoundError If the CSV file does not exist. """ mode = self.filters.get("structure_list", ["csd-unique"])[0].lower() if mode == "cif": cif_dir = Path(self.filters["structure_list"][1]) if not cif_dir.is_dir(): raise FileNotFoundError(f"CIF directory {cif_dir} not found") # return filenames (with .cif) so refcode==filename return [p.name for p in sorted(cif_dir.glob("*.cif"))] base = Path(self.filters["data_directory"]) prefix = self.filters["data_prefix"] mode = self.filters.get("structure_list", ["csd-unique"])[0] fname = ( f"{prefix}_refcode_families_unique.csv" if mode == "csd-unique" else f"{prefix}_refcode_families_clustered.csv" ) path = base / fname if not path.is_file(): raise FileNotFoundError(f"Refcode list {path} not found") import pandas as pd df = pd.read_csv(path) return df["refcode"].astype(str).tolist()
[docs] def _process_batch(self, batch: List[str], h5: h5py.File) -> None: """ Extract raw data for a batch of refcodes and write to HDF5. This method: - Submits each refcode in `batch` to `ProcessPoolExecutor` calling `_extract_one(refcode, filters, center_flag)`. - Collects `(refcode, raw_data)` tuples for successful extractions. - For each tuple, writes datasets under `/structures/<refcode>/`: - Crystal-level data: identifier, space_group, z_value, etc. - Atom data: atom_label, atom_coords, atom_frac_coords, etc. - Bond data: bond_id, bond_atom1, bond_atom2, etc. - Contact and hydrogen-bond datasets for intermolecular and intramolecular interactions. Parameters ---------- batch : List[str] Sub-list of refcodes to process. h5 : h5py.File Open HDF5 file returned by `initialize_hdf5_file()`. Raises ------ Exception If any error occurs during extraction or dataset writing. """ # 1) Parallel raw extraction center_flag = self.filters.get('center_molecule', False) max_workers = min(len(batch), (os.cpu_count() or 1) - 1) results: List[Tuple[str, Dict[str, Any]]] = [] with ProcessPoolExecutor(max_workers=max_workers) as exe: futures = { exe.submit(_extract_one, ref, self.filters, center_flag): ref for ref in batch } for fut in as_completed(futures): out = fut.result() if out: results.append(out) if not results: return # 2) Write each structure’s raw fields as typed datasets for refcode, raw in results: grp = h5["structures"].require_group(refcode) # --- Crystal-level data --- cd = raw["crystal_data"] grp.create_dataset("identifier", data=cd["identifier"], dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("space_group", data=cd["space_group"], dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("z_value", data=cd["z_value"]) grp.create_dataset("z_prime", data=cd["z_prime"]) grp.create_dataset("cell_volume", data=cd["cell_volume"]) grp.create_dataset("cell_density", data=cd["cell_density"]) grp.create_dataset("packing_coefficient", data=cd["packing_coefficient"]) grp.create_dataset("cell_lengths", data=np.array(cd["cell_lengths"], dtype=np.float32)) grp.create_dataset("cell_angles", data=np.array(cd["cell_angles"], dtype=np.float32)) grp.create_dataset("symmetry_operators", data=np.array(cd["symmetry_operators"], dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) # --- Molecule atoms --- atoms = raw["molecule_data"]["atoms"] labels = list(atoms.keys()) N = len(labels) coords = np.zeros((N, 3), dtype=np.float32) frac_coords = np.zeros((N, 3), dtype=np.float32) weights = np.zeros((N,), dtype=np.float32) numbers = np.zeros((N,), dtype=np.int32) charges = np.zeros((N,), dtype=np.float32) symbols = [] sybyl = [] neighbours = [] for i, lbl in enumerate(labels): p = atoms[lbl] coords[i] = p["coordinates"] frac_coords[i] = p["fractional_coordinates"] weights[i] = p["atomic_weight"] numbers[i] = p["atomic_number"] charges[i] = p["partial_charge"] symbols.append(p["atomic_symbol"]) sybyl.append(p["sybyl_type"]) neighbours.append(",".join(p["neighbour_labels"])) grp.create_dataset("atom_label", data=np.array(labels, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("atom_symbol", data=np.array(symbols, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("atom_coords", data=coords) grp.create_dataset("atom_frac_coords", data=frac_coords) grp.create_dataset("atom_weight", data=weights) grp.create_dataset("atom_number", data=numbers) grp.create_dataset("atom_charge", data=charges) grp.create_dataset("atom_sybyl_type", data=np.array(sybyl, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("atom_neighbour_list", data=np.array(neighbours, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) # --- Molecule bonds --- bonds = raw["molecule_data"]["bonds"] bids = list(bonds.keys()) M = len(bonds) atom1_idx = np.zeros((M,), dtype=np.int32) atom2_idx = np.zeros((M,), dtype=np.int32) cyclic = np.zeros((M,), dtype=bool) rotat = np.zeros((M,), dtype=bool) lengths = np.zeros((M,), dtype=np.float32) atom1 = [] atom2 = [] types = [] for j, (bid, bp) in enumerate(bonds.items()): atom1_idx[j] = labels.index(bp["atom1"]) atom2_idx[j] = labels.index(bp["atom2"]) cyclic[j] = bp["is_cyclic"] rotat[j] = bp["is_rotatable"] lengths[j] = bp["length"] atom1.append(bp["atom1"]) atom2.append(bp["atom2"]) types.append(bp["bond_type"]) grp.create_dataset("bond_id", data=np.array(bids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("bond_atom1", data=np.array(atom1, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("bond_atom2", data=np.array(atom2, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("bond_atom1_idx", data=atom1_idx) grp.create_dataset("bond_atom2_idx", data=atom2_idx) grp.create_dataset("bond_type", data=np.array(types, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("bond_is_cyclic", data=cyclic) grp.create_dataset("bond_is_rotatable", data=rotat) grp.create_dataset("bond_length", data=lengths) # --- Contacts --- contacts_inter = raw.get("intermolecular_contacts", {}) contacts_intra = raw.get("intramolecular_contacts", {}) # Intermolecular contacts cids = list(contacts_inter.keys()) if len(cids) == 0: # no contacts → empty 1D arrays for labels/indices and (0,3) for coords central = np.empty((0,), dtype=object) contact = np.empty((0,), dtype=object) central_idx = np.empty((0,), dtype=np.int32) contact_idx = np.empty((0,), dtype=np.int32) lengths = np.empty((0,), dtype=np.float32) strengths = np.empty((0,), dtype=np.float32) in_line = np.empty((0,), dtype=bool) sym = np.empty((0,), dtype=object) central_coords = np.zeros((0, 3), dtype=np.float32) contact_coords = np.zeros((0, 3), dtype=np.float32) central_frac_coords = np.zeros((0, 3), dtype=np.float32) contact_frac_coords = np.zeros((0, 3), dtype=np.float32) else: central = np.array([contacts_inter[c]["central_atom"] for c in cids], dtype=object) contact = np.array([contacts_inter[c]["contact_atom"] for c in cids], dtype=object ) central_idx = np.array([labels.index(contacts_inter[c]["central_atom"]) for c in cids], dtype=np.int32) contact_idx = np.array([labels.index(contacts_inter[c]["contact_atom"]) for c in cids], dtype=np.int32) lengths = np.array([contacts_inter[c]["length"] for c in cids], dtype=np.float32) strengths = np.array([contacts_inter[c]["strength"] for c in cids], dtype=np.float32) in_line = np.array([contacts_inter[c]["is_in_line_of_sight"] for c in cids], dtype=bool) sym = np.array([contacts_inter[c]["symmetry_operator"] for c in cids], dtype=object) central_coords = np.array([contacts_inter[c]["central_atom_coordinates"] for c in cids], dtype=np.float32) contact_coords = np.array([contacts_inter[c]["contact_atom_coordinates"] for c in cids], dtype=np.float32) central_frac_coords = np.array([contacts_inter[c]["central_atom_fractional_coordinates"] for c in cids], dtype=np.float32) contact_frac_coords = np.array([contacts_inter[c]["contact_atom_fractional_coordinates"] for c in cids], dtype=np.float32) grp.create_dataset("inter_cc_id", data=np.array(cids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_cc_central_atom", data=central, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_cc_contact_atom", data=contact, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_cc_central_atom_idx", data=central_idx) grp.create_dataset("inter_cc_contact_atom_idx", data=contact_idx) grp.create_dataset("inter_cc_length", data=lengths) grp.create_dataset("inter_cc_strength", data=strengths) grp.create_dataset("inter_cc_in_los", data=in_line) grp.create_dataset("inter_cc_symmetry", data=sym, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_cc_central_atom_coords", data=central_coords) grp.create_dataset("inter_cc_contact_atom_coords", data=contact_coords) grp.create_dataset("inter_cc_central_atom_frac_coords", data=central_frac_coords) grp.create_dataset("inter_cc_contact_atom_frac_coords", data=contact_frac_coords) # Intramolecular contacts cids = list(contacts_intra.keys()) if len(cids) == 0: # no contacts → empty 1D arrays for labels/indices and (0,3) for coords central = np.empty((0,), dtype=object) contact = np.empty((0,), dtype=object) central_idx = np.empty((0,), dtype=np.int32) contact_idx = np.empty((0,), dtype=np.int32) lengths = np.empty((0,), dtype=np.float32) strengths = np.empty((0,), dtype=np.float32) in_line = np.empty((0,), dtype=bool) central_coords = np.zeros((0, 3), dtype=np.float32) contact_coords = np.zeros((0, 3), dtype=np.float32) central_frac_coords = np.zeros((0, 3), dtype=np.float32) contact_frac_coords = np.zeros((0, 3), dtype=np.float32) else: central = np.array([contacts_intra[c]["central_atom"] for c in cids], dtype=object) contact = np.array([contacts_intra[c]["contact_atom"] for c in cids], dtype=object ) central_idx = np.array([labels.index(contacts_intra[c]["central_atom"]) for c in cids], dtype=np.int32) contact_idx = np.array([labels.index(contacts_intra[c]["contact_atom"]) for c in cids], dtype=np.int32) lengths = np.array([contacts_intra[c]["length"] for c in cids], dtype=np.float32) strengths = np.array([contacts_intra[c]["strength"] for c in cids], dtype=np.float32) in_line = np.array([contacts_intra[c]["is_in_line_of_sight"] for c in cids], dtype=bool) central_coords = np.array([contacts_intra[c]["central_atom_coordinates"] for c in cids], dtype=np.float32) contact_coords = np.array([contacts_intra[c]["contact_atom_coordinates"] for c in cids], dtype=np.float32) central_frac_coords = np.array([contacts_intra[c]["central_atom_fractional_coordinates"] for c in cids], dtype=np.float32) contact_frac_coords = np.array([contacts_intra[c]["contact_atom_fractional_coordinates"] for c in cids], dtype=np.float32) grp.create_dataset("intra_cc_id", data=np.array(cids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("intra_cc_central_atom", data=central, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("intra_cc_contact_atom", data=contact, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("intra_cc_central_atom_idx", data=central_idx) grp.create_dataset("intra_cc_contact_atom_idx", data=contact_idx) grp.create_dataset("intra_cc_length", data=lengths) grp.create_dataset("intra_cc_strength", data=strengths) grp.create_dataset("intra_cc_in_los", data=in_line) grp.create_dataset("intra_cc_central_atom_coords", data=central_coords) grp.create_dataset("intra_cc_contact_atom_coords", data=contact_coords) grp.create_dataset("intra_cc_central_atom_frac_coords", data=central_frac_coords) grp.create_dataset("intra_cc_contact_atom_frac_coords", data=contact_frac_coords) # --- Hydrogen bonds --- hbonds_inter = raw.get("intermolecular_hbonds", {}) hbonds_intra = raw.get("intramolecular_hbonds", {}) # Intermolecular hydrogen bonds hids = list(hbonds_inter.keys()) if len(hids) == 0: # no contacts → empty 1D arrays for labels/indices and (0,3) for coords central = np.empty((0,), dtype=object) hydrogen = np.empty((0,), dtype=object) contact = np.empty((0,), dtype=object) central_idx = np.empty((0,), dtype=np.int32) hydrogen_idx = np.empty((0,), dtype=np.int32) contact_idx = np.empty((0,), dtype=np.int32) lengths = np.empty((0,), dtype=np.float32) angles = np.empty((0,), dtype=np.float32) in_line = np.empty((0,), dtype=bool) sym = np.empty((0,), dtype=object) central_coords = np.zeros((0, 3), dtype=np.float32) hydrogen_coords = np.zeros((0, 3), dtype=np.float32) contact_coords = np.zeros((0, 3), dtype=np.float32) central_frac_coords = np.zeros((0, 3), dtype=np.float32) hydrogen_frac_coords = np.zeros((0, 3), dtype=np.float32) contact_frac_coords = np.zeros((0, 3), dtype=np.float32) else: central = np.array([hbonds_inter[h]["central_atom"] for h in hids], dtype=object) hydrogen = np.array([hbonds_inter[h]["hydrogen_atom"] for h in hids], dtype=object) contact = np.array([hbonds_inter[h]["contact_atom"] for h in hids], dtype=object) central_idx = np.array([labels.index(hbonds_inter[h]["central_atom"]) for h in hids], dtype=np.int32) hydrogen_idx = np.array([labels.index(hbonds_inter[h]["hydrogen_atom"]) for h in hids], dtype=np.int32) contact_idx = np.array([labels.index(hbonds_inter[h]["contact_atom"]) for h in hids], dtype=np.int32) lengths = np.array([hbonds_inter[h]["length"] for h in hids], dtype=np.float32) angles = np.array([hbonds_inter[h]["angle"] for h in hids], dtype=np.float32) in_line = np.array([hbonds_inter[h]["is_in_line_of_sight"] for h in hids], dtype=bool) sym = np.array([hbonds_inter[h]["symmetry_operator"] for h in hids], dtype=object) central_coords = np.array([hbonds_inter[h]["central_atom_coordinates"] for h in hids], dtype=np.float32) hydrogen_coords = np.array([hbonds_inter[h]["hydrogen_atom_coordinates"] for h in hids], dtype=np.float32) contact_coords = np.array([hbonds_inter[h]["contact_atom_coordinates"] for h in hids], dtype=np.float32) central_frac_coords = np.array([hbonds_inter[h]["central_atom_fractional_coordinates"] for h in hids], dtype=np.float32) hydrogen_frac_coords = np.array([hbonds_inter[h]["hydrogen_atom_fractional_coordinates"] for h in hids], dtype=np.float32) contact_frac_coords = np.array([hbonds_inter[h]["contact_atom_fractional_coordinates"] for h in hids], dtype=np.float32) grp.create_dataset("inter_hb_id", data=np.array(hids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_hb_central_atom", data=central, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_hb_hydrogen_atom", data=hydrogen, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_hb_contact_atom", data=contact, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_hb_central_atom_idx", data=central_idx) grp.create_dataset("inter_hb_hydrogen_atom_idx", data=hydrogen_idx) grp.create_dataset("inter_hb_contact_atom_idx", data=contact_idx) grp.create_dataset("inter_hb_length", data=lengths) grp.create_dataset("inter_hb_angle", data=angles) grp.create_dataset("inter_hb_in_los", data=in_line) grp.create_dataset("inter_hb_symmetry", data=sym, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("inter_hb_central_atom_coords", data=central_coords) grp.create_dataset("inter_hb_hydrogen_atom_coords", data=hydrogen_coords) grp.create_dataset("inter_hb_contact_atom_coords", data=contact_coords) grp.create_dataset("inter_hb_central_atom_frac_coords", data=central_frac_coords) grp.create_dataset("inter_hb_hydrogen_atom_frac_coords", data=hydrogen_frac_coords) grp.create_dataset("inter_hb_contact_atom_frac_coords", data=contact_frac_coords) # Intramolecular hydrogen bonds hids = list(hbonds_intra.keys()) if len(hids) == 0: # no contacts → empty 1D arrays for labels/indices and (0,3) for coords central = np.empty((0,), dtype=object) hydrogen = np.empty((0,), dtype=object) contact = np.empty((0,), dtype=object) central_idx = np.empty((0,), dtype=np.int32) hydrogen_idx = np.empty((0,), dtype=np.int32) contact_idx = np.empty((0,), dtype=np.int32) lengths = np.empty((0,), dtype=np.float32) angles = np.empty((0,), dtype=np.float32) in_line = np.empty((0,), dtype=bool) central_coords = np.zeros((0, 3), dtype=np.float32) hydrogen_coords = np.zeros((0, 3), dtype=np.float32) contact_coords = np.zeros((0, 3), dtype=np.float32) central_frac_coords = np.zeros((0, 3), dtype=np.float32) hydrogen_frac_coords = np.zeros((0, 3), dtype=np.float32) contact_frac_coords = np.zeros((0, 3), dtype=np.float32) else: central = np.array([hbonds_intra[h]["central_atom"] for h in hids], dtype=object) hydrogen = np.array([hbonds_intra[h]["hydrogen_atom"] for h in hids], dtype=object) contact = np.array([hbonds_intra[h]["contact_atom"] for h in hids], dtype=object) central_idx = np.array([labels.index(hbonds_intra[h]["central_atom"]) for h in hids], dtype=np.int32) hydrogen_idx = np.array([labels.index(hbonds_intra[h]["hydrogen_atom"]) for h in hids], dtype=np.int32) contact_idx = np.array([labels.index(hbonds_intra[h]["contact_atom"]) for h in hids], dtype=np.int32) lengths = np.array([hbonds_intra[h]["length"] for h in hids], dtype=np.float32) angles = np.array([hbonds_intra[h]["angle"] for h in hids], dtype=np.float32) in_line = np.array([hbonds_intra[h]["is_in_line_of_sight"] for h in hids], dtype=bool) central_coords = np.array([hbonds_intra[h]["central_atom_coordinates"] for h in hids], dtype=np.float32) hydrogen_coords = np.array([hbonds_intra[h]["hydrogen_atom_coordinates"] for h in hids], dtype=np.float32) contact_coords = np.array([hbonds_intra[h]["contact_atom_coordinates"] for h in hids], dtype=np.float32) central_frac_coords = np.array([hbonds_intra[h]["central_atom_fractional_coordinates"] for h in hids], dtype=np.float32) hydrogen_frac_coords = np.array([hbonds_intra[h]["hydrogen_atom_fractional_coordinates"] for h in hids], dtype=np.float32) contact_frac_coords = np.array([hbonds_intra[h]["contact_atom_fractional_coordinates"] for h in hids], dtype=np.float32) grp.create_dataset("intra_hb_id", data=np.array(hids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("intra_hb_central_atom", data=central, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("intra_hb_hydrogen_atom", data=hydrogen, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("intra_hb_contact_atom", data=contact, dtype=h5py.string_dtype(encoding="utf-8")) grp.create_dataset("intra_hb_central_atom_idx", data=central_idx) grp.create_dataset("intra_hb_hydrogen_atom_idx", data=hydrogen_idx) grp.create_dataset("intra_hb_contact_atom_idx", data=contact_idx) grp.create_dataset("intra_hb_length", data=lengths) grp.create_dataset("intra_hb_angle", data=angles) grp.create_dataset("intra_hb_in_los", data=in_line) grp.create_dataset("intra_hb_central_atom_coords", data=central_coords) grp.create_dataset("intra_hb_hydrogen_atom_coords", data=hydrogen_coords) grp.create_dataset("intra_hb_contact_atom_coords", data=contact_coords) grp.create_dataset("intra_hb_central_atom_frac_coords", data=central_frac_coords) grp.create_dataset("intra_hb_hydrogen_atom_frac_coords", data=hydrogen_frac_coords) grp.create_dataset("intra_hb_contact_atom_frac_coords", data=contact_frac_coords)