Source code for hdf5_utils

"""
Module: hdf5_utils.py

Utilities to initialize and update HDF5 files for storing batched CSD structure data.

Dependencies
------------
pathlib
h5py
"""
from pathlib import Path
import h5py

def initialize_hdf5_file(
    hdf5_path: Path,
    compression: str = "gzip",
    chunk_size: int = 64
) -> h5py.File:
    """
    Create or open an HDF5 file and ensure the top-level '/structures' group exists.

    Parameters
    ----------
    hdf5_path : Path
        Path to the HDF5 file. If it does not exist, it will be created.
    compression : str, default="gzip"
        Compression algorithm intended for future datasets. Currently not
        read by this function (groups are not compressed).
    chunk_size : int, default=64
        Suggested chunk size for future dataset creation. Currently not
        read by this function (placeholder).

    Returns
    -------
    h5py.File
        An open HDF5 file handle in mode "a" (append). The caller is
        responsible for closing it.

    Side Effects
    ------------
    - Creates the file at hdf5_path if it does not exist.
    - Adds a group named '/structures' if it is missing.

    Raises
    ------
    OSError
        If the file cannot be opened or created.

    Notes
    -----
    If ensuring the '/structures' group fails after the file has been
    opened, the handle is closed before the exception propagates so the
    file is not left open (and potentially locked) by a failed call.
    """
    f = h5py.File(str(hdf5_path), "a")
    try:
        if "structures" not in f:
            f.create_group("structures")
    except BaseException:
        # The caller never receives the handle on failure, so nobody else
        # could close it; release it here and let the error propagate.
        f.close()
        raise
    return f
def write_structure_group(
    h5: h5py.File,
    refcode: str,
    raw_json: str
) -> None:
    """
    Write a structure's raw JSON to '/structures/<refcode>/raw_json'.

    Any 'raw_json' dataset already present under that subgroup is removed
    first, so the call overwrites rather than appends.

    Parameters
    ----------
    h5 : h5py.File
        An open HDF5 file that already contains the '/structures' group.
    refcode : str
        CSD refcode; used as the name of the subgroup under '/structures'.
    raw_json : str
        Serialized JSON text holding the raw fields for this structure.

    Side Effects
    ------------
    - Gets or creates the group '/structures/<refcode>'.
    - Deletes an existing 'raw_json' dataset in that group, if any.
    - Creates a new 'raw_json' dataset with a UTF-8 string dtype.

    Raises
    ------
    KeyError
        If the '/structures' group is missing from h5 (should not happen
        when the file came from initialize_hdf5_file).
    OSError
        If the dataset cannot be created.
    """
    dataset_name = "raw_json"
    structure_group = h5["structures"].require_group(refcode)

    # HDF5 datasets cannot be overwritten in place under the same name,
    # so drop the previous one before writing the replacement.
    if dataset_name in structure_group:
        del structure_group[dataset_name]

    utf8_string_dtype = h5py.string_dtype(encoding="utf-8")
    structure_group.create_dataset(
        dataset_name,
        data=raw_json,
        dtype=utf8_string_dtype,
    )