Source code for structure_data_extractor
"""
Module: structure_data_extractor.py
Parallel extraction of raw CSD data into an HDF5 container.
This module defines:
- `_extract_one`: Retrieve and package raw crystal, molecule, contact, and hydrogen-bond data for a single refcode.
- `StructureDataExtractor`: Manage parallel batch extraction of multiple refcodes into an HDF5 file.
Dependencies
------------
numpy
h5py
ccdc
hdf5_utils
"""
import logging
import os
from pathlib import Path
from typing import List, Dict, Any, Union, Tuple
import numpy as np
import h5py
from ccdc import io
from concurrent.futures import ProcessPoolExecutor, as_completed
from hdf5_utils import initialize_hdf5_file
logger = logging.getLogger(__name__)
[docs]
def _extract_one(
refcode: str,
filters: Dict[str, Any],
center_molecule: bool
) -> Tuple[str, Dict[str, Any]]:
"""
Retrieve raw fields for one CSD refcode.
Parameters
----------
refcode : str
CSD identifier (e.g., "ABCDEF12").
filters : Dict[str, Any]
ExtractionConfig.filters may include:
- 'target_z_prime_values'
- 'target_space_groups'
- 'center_molecule'
center_molecule : bool
If True, recenter the crystal before extracting coordinates.
Returns
-------
Tuple[str, Dict[str, Any]]
refcode : str
Same as input.
raw_data : Dict[str, Any]
Contains:
- 'crystal_data': dict of crystal properties (identifier, space_group,
z_value, z_prime, cell_lengths, cell_angles, cell_volume,
cell_density, packing_coefficient, symmetry_operators).
- 'molecule_data': dict with:
- 'atoms': mapping label → dict of atomic_symbol, coordinates,
fractional_coordinates, atomic_weight, atomic_number,
partial_charge, neighbour_labels, sybyl_type.
- 'bonds': mapping id → dict of atom1, atom2, bond_type,
is_cyclic, is_rotatable, length.
- 'intermolecular_contacts', 'intramolecular_contacts',
'intermolecular_hbonds', 'intramolecular_hbonds': dicts of
corresponding data (if any).
Raises
------
Exception
If any CCDC call fails (entry not found, missing coordinates, etc.).
"""
mode = filters.get("structure_list", ["csd-unique"])[0].lower()
if mode == "cif":
# refcode here is actually the filename (e.g. "foo.cif")
cif_dir = Path(filters["structure_list"][1])
cif_path = cif_dir / refcode
# read crystal + molecule from CIF
crystal = io.CrystalReader(str(cif_path))[0]
molecule = io.MoleculeReader(str(cif_path))[0]
else:
# default CSD behavior
reader = io.EntryReader("CSD")
entry = reader.entry(refcode)
crystal = entry.crystal
molecule = entry.molecule
if center_molecule:
crystal.centre_molecule()
molecule.assign_bond_types()
molecule.add_hydrogens(mode='missing', add_sites=True)
molecule.assign_partial_charges()
if not molecule.all_atoms_have_sites:
return None
# --- Crystal-level data ---
crystal_data = {
"identifier": crystal.identifier,
"space_group": crystal.spacegroup_symbol,
"z_value": crystal.z_value,
"z_prime": crystal.z_prime,
"cell_lengths": list(crystal.cell_lengths),
"cell_angles": list(crystal.cell_angles),
"cell_volume": crystal.cell_volume,
"cell_density": crystal.calculated_density,
"packing_coefficient": crystal.packing_coefficient,
"symmetry_operators": crystal.symmetry_operators
}
# --- Molecule-level data ---
atoms_dict: Dict[str, Any] = {}
for atom in molecule.atoms:
atoms_dict[atom.label] = {
"atomic_symbol": atom.atomic_symbol,
"coordinates": [atom.coordinates.x, atom.coordinates.y, atom.coordinates.z],
"fractional_coordinates": [
atom.fractional_coordinates.x,
atom.fractional_coordinates.y,
atom.fractional_coordinates.z
],
"atomic_weight": atom.atomic_weight,
"atomic_number": atom.atomic_number,
"partial_charge": atom.partial_charge,
"neighbour_labels": [nbr.label for nbr in atom.neighbours],
"sybyl_type": atom.sybyl_type
}
bonds_dict: Dict[str, Any] = {}
for bond in molecule.bonds:
key = f"{bond.atoms[0].label}-{bond.atoms[1].label}"
bonds_dict[key] = {
"atom1": bond.atoms[0].label,
"atom2": bond.atoms[1].label,
"bond_type": str(bond.bond_type),
"is_cyclic": bond.is_cyclic,
"is_rotatable": bond.is_rotatable,
"length": bond.length
}
molecule_data = {
"atoms": atoms_dict,
"bonds": bonds_dict
}
# --- Contacts ---
inter_contact_data: Dict[str, Any] = {}
intra_contact_data: Dict[str, Any] = {}
for contact in crystal.contacts(intermolecular="Any", distance_range=(-3.0, 0.6)):
cid = f"{contact.atoms[0].label}-{contact.atoms[1].label}"
c_dict = {
"central_atom": contact.atoms[0].label,
"central_atom_coordinates": [
contact.atoms[0].coordinates.x,
contact.atoms[0].coordinates.y,
contact.atoms[0].coordinates.z
],
"central_atom_fractional_coordinates": [
contact.atoms[0].fractional_coordinates.x,
contact.atoms[0].fractional_coordinates.y,
contact.atoms[0].fractional_coordinates.z
],
"contact_atom": contact.atoms[1].label,
"contact_atom_coordinates": [
contact.atoms[1].coordinates.x,
contact.atoms[1].coordinates.y,
contact.atoms[1].coordinates.z
],
"contact_atom_fractional_coordinates": [
contact.atoms[1].fractional_coordinates.x,
contact.atoms[1].fractional_coordinates.y,
contact.atoms[1].fractional_coordinates.z
],
"length": contact.length,
"strength": contact.strength,
"is_intermolecular": contact.intermolecular,
"is_in_line_of_sight": contact.is_in_line_of_sight,
"symmetry_operator": contact.symmetry_operators[1]
}
if contact.intermolecular:
inter_contact_data[cid] = c_dict
else:
intra_contact_data[cid] = c_dict
# --- Hydrogen bonds ---
inter_hbond_data: Dict[str, Any] = {}
intra_hbond_data: Dict[str, Any] = {}
for hbond in crystal.hbonds(intermolecular="Any", distance_range=(-3.0, 0.6)):
hid = f"{hbond.atoms[0].label}-{hbond.atoms[1].label}-{hbond.atoms[2].label}"
hb_dict = {
"central_atom": hbond.atoms[0].label,
"central_atom_coordinates": [
hbond.atoms[0].coordinates.x,
hbond.atoms[0].coordinates.y,
hbond.atoms[0].coordinates.z
],
"central_atom_fractional_coordinates": [
hbond.atoms[0].fractional_coordinates.x,
hbond.atoms[0].fractional_coordinates.y,
hbond.atoms[0].fractional_coordinates.z
],
"hydrogen_atom": hbond.atoms[1].label,
"hydrogen_atom_coordinates": [
hbond.atoms[1].coordinates.x,
hbond.atoms[1].coordinates.y,
hbond.atoms[1].coordinates.z
],
"hydrogen_atom_fractional_coordinates": [
hbond.atoms[1].fractional_coordinates.x,
hbond.atoms[1].fractional_coordinates.y,
hbond.atoms[1].fractional_coordinates.z
],
"contact_atom": hbond.atoms[2].label,
"contact_atom_coordinates": [
hbond.atoms[2].coordinates.x,
hbond.atoms[2].coordinates.y,
hbond.atoms[2].coordinates.z
],
"contact_atom_fractional_coordinates": [
hbond.atoms[2].fractional_coordinates.x,
hbond.atoms[2].fractional_coordinates.y,
hbond.atoms[2].fractional_coordinates.z
],
"length": hbond.da_distance,
"angle": hbond.angle,
"is_intermolecular": hbond.intermolecular,
"is_in_line_of_sight": hbond.is_in_line_of_sight,
"symmetry_operator": hbond.symmetry_operators[2]
}
if hbond.intermolecular:
inter_hbond_data[hid] = hb_dict
else:
intra_hbond_data[hid] = hb_dict
raw_data: Dict[str, Any] = {
"crystal_data": crystal_data,
"molecule_data": molecule_data,
"intermolecular_contacts": inter_contact_data,
"intermolecular_hbonds": inter_hbond_data,
"intramolecular_contacts": intra_contact_data,
"intramolecular_hbonds": intra_hbond_data
}
return refcode, raw_data
[docs]
class StructureDataExtractor:
"""
Manage parallel extraction of raw CSD data into an HDF5 container.
This class:
- Loads a list of refcodes from CSV (clustered or unique).
- Initializes or overwrites an HDF5 file with a '/structures' group.
- Batches refcodes and invokes `_extract_one()` in parallel.
- Writes each structure's raw fields into its own HDF5 subgroup.
Attributes
----------
hdf5_path : Path
File path for the output HDF5 container.
filters : Dict[str, Any]
ExtractionConfig.filters dictionary (e.g., data_directory, data_prefix,
center_molecule, etc.).
batch_size : int
Number of refcodes to process concurrently per batch.
reader : io.EntryReader
CCDC EntryReader instance used by `_extract_one`.
Methods
-------
run : () -> None
Execute the full extraction pipeline: overwrite HDF5, initialize,
load refcode list, and process each batch.
_load_refcodes : () -> List[str]
Read the CSV of refcodes to extract.
_process_batch : (batch: List[str], h5: h5py.File) -> None
Extract raw data for each batch of refcodes and write to HDF5.
"""
[docs]
def __init__(
self,
hdf5_path: Union[str, Path],
filters: Dict[str, Any],
batch_size: int
):
"""
Initialize a StructureDataExtractor.
Parameters
----------
hdf5_path : Union[str, Path]
Path for the HDF5 file to create or overwrite.
filters : Dict[str, Any]
ExtractionConfig.filters, containing:
- 'data_directory'
- 'data_prefix'
- 'center_molecule'
and other keys.
batch_size : int
Number of structures to extract concurrently per batch.
"""
self.hdf5_path = Path(hdf5_path)
self.filters = filters
self.batch_size = batch_size
self.reader = io.EntryReader("CSD")
[docs]
def run(self) -> None:
"""
Perform the full raw-data extraction for all refcodes into HDF5.
This method:
- Deletes the existing HDF5 file at `hdf5_path` if it exists.
- Calls `initialize_hdf5_file` to create the '/structures' group.
- Loads all refcodes via `_load_refcodes()` and writes the '/refcode_list' dataset.
- Processes refcodes in batches of size `batch_size`:
- Submits each refcode to `ProcessPoolExecutor` running `_extract_one()`.
- Collects `(refcode, raw_data)` tuples.
- Writes raw fields under `/structures/<refcode>/...` as typed datasets.
- Closes the HDF5 file.
Raises
------
FileNotFoundError
If the refcode list CSV is missing.
Exception
If any CCDC call or HDF5 write operation fails.
"""
# Overwrite existing file
if self.hdf5_path.exists():
logger.info(f"Overwriting existing HDF5 file: {self.hdf5_path}")
self.hdf5_path.unlink()
# Initialize file and group
h5 = initialize_hdf5_file(
self.hdf5_path, compression="gzip", chunk_size=self.batch_size
)
# Load and store refcode list
refcodes = self._load_refcodes()
dt = h5py.string_dtype(encoding="utf-8")
h5.create_dataset(
"refcode_list",
data=np.array(refcodes, dtype=object),
dtype=dt
)
total = len(refcodes)
logger.info(f"{total} structures to extract (batch size {self.batch_size})")
# Process in batches
for start in range(0, total, self.batch_size):
batch = refcodes[start : start + self.batch_size]
logger.info(
f"Extracting batch {start//self.batch_size + 1} (size {len(batch)})"
)
self._process_batch(batch, h5)
h5.close()
logger.info("Raw data extraction complete; HDF5 file closed.")
[docs]
def _load_refcodes(self) -> List[str]:
"""
Read the list of refcodes to extract from a CSV file.
The CSV filename is chosen based on `filters['structure_list'][0]`:
- "csd-unique" → "{data_prefix}_refcode_families_unique.csv"
- otherwise → "{data_prefix}_refcode_families_clustered.csv"
If filters["structure_list"][0] == "cif", returns all .cif filenames
from the directory in filters["structure_list"][1].
Returns
-------
List[str]
Refcode strings to process.
Raises
------
FileNotFoundError
If the CSV file does not exist.
"""
mode = self.filters.get("structure_list", ["csd-unique"])[0].lower()
if mode == "cif":
cif_dir = Path(self.filters["structure_list"][1])
if not cif_dir.is_dir():
raise FileNotFoundError(f"CIF directory {cif_dir} not found")
# return filenames (with .cif) so refcode==filename
return [p.name for p in sorted(cif_dir.glob("*.cif"))]
base = Path(self.filters["data_directory"])
prefix = self.filters["data_prefix"]
mode = self.filters.get("structure_list", ["csd-unique"])[0]
fname = (
f"{prefix}_refcode_families_unique.csv"
if mode == "csd-unique"
else f"{prefix}_refcode_families_clustered.csv"
)
path = base / fname
if not path.is_file():
raise FileNotFoundError(f"Refcode list {path} not found")
import pandas as pd
df = pd.read_csv(path)
return df["refcode"].astype(str).tolist()
[docs]
def _process_batch(self, batch: List[str], h5: h5py.File) -> None:
"""
Extract raw data for a batch of refcodes and write to HDF5.
This method:
- Submits each refcode in `batch` to `ProcessPoolExecutor` calling
`_extract_one(refcode, filters, center_flag)`.
- Collects `(refcode, raw_data)` tuples for successful extractions.
- For each tuple, writes datasets under `/structures/<refcode>/`:
- Crystal-level data: identifier, space_group, z_value, etc.
- Atom data: atom_label, atom_coords, atom_frac_coords, etc.
- Bond data: bond_id, bond_atom1, bond_atom2, etc.
- Contact and hydrogen-bond datasets for intermolecular and intramolecular interactions.
Parameters
----------
batch : List[str]
Sub-list of refcodes to process.
h5 : h5py.File
Open HDF5 file returned by `initialize_hdf5_file()`.
Raises
------
Exception
If any error occurs during extraction or dataset writing.
"""
# 1) Parallel raw extraction
center_flag = self.filters.get('center_molecule', False)
max_workers = min(len(batch), (os.cpu_count() or 1) - 1)
results: List[Tuple[str, Dict[str, Any]]] = []
with ProcessPoolExecutor(max_workers=max_workers) as exe:
futures = {
exe.submit(_extract_one, ref, self.filters, center_flag): ref
for ref in batch
}
for fut in as_completed(futures):
out = fut.result()
if out:
results.append(out)
if not results:
return
# 2) Write each structure’s raw fields as typed datasets
for refcode, raw in results:
grp = h5["structures"].require_group(refcode)
# --- Crystal-level data ---
cd = raw["crystal_data"]
grp.create_dataset("identifier", data=cd["identifier"], dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("space_group", data=cd["space_group"], dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("z_value", data=cd["z_value"])
grp.create_dataset("z_prime", data=cd["z_prime"])
grp.create_dataset("cell_volume", data=cd["cell_volume"])
grp.create_dataset("cell_density", data=cd["cell_density"])
grp.create_dataset("packing_coefficient", data=cd["packing_coefficient"])
grp.create_dataset("cell_lengths", data=np.array(cd["cell_lengths"], dtype=np.float32))
grp.create_dataset("cell_angles", data=np.array(cd["cell_angles"], dtype=np.float32))
grp.create_dataset("symmetry_operators", data=np.array(cd["symmetry_operators"], dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
# --- Molecule atoms ---
atoms = raw["molecule_data"]["atoms"]
labels = list(atoms.keys())
N = len(labels)
coords = np.zeros((N, 3), dtype=np.float32)
frac_coords = np.zeros((N, 3), dtype=np.float32)
weights = np.zeros((N,), dtype=np.float32)
numbers = np.zeros((N,), dtype=np.int32)
charges = np.zeros((N,), dtype=np.float32)
symbols = []
sybyl = []
neighbours = []
for i, lbl in enumerate(labels):
p = atoms[lbl]
coords[i] = p["coordinates"]
frac_coords[i] = p["fractional_coordinates"]
weights[i] = p["atomic_weight"]
numbers[i] = p["atomic_number"]
charges[i] = p["partial_charge"]
symbols.append(p["atomic_symbol"])
sybyl.append(p["sybyl_type"])
neighbours.append(",".join(p["neighbour_labels"]))
grp.create_dataset("atom_label", data=np.array(labels, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("atom_symbol", data=np.array(symbols, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("atom_coords", data=coords)
grp.create_dataset("atom_frac_coords", data=frac_coords)
grp.create_dataset("atom_weight", data=weights)
grp.create_dataset("atom_number", data=numbers)
grp.create_dataset("atom_charge", data=charges)
grp.create_dataset("atom_sybyl_type", data=np.array(sybyl, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("atom_neighbour_list", data=np.array(neighbours, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
# --- Molecule bonds ---
bonds = raw["molecule_data"]["bonds"]
bids = list(bonds.keys())
M = len(bonds)
atom1_idx = np.zeros((M,), dtype=np.int32)
atom2_idx = np.zeros((M,), dtype=np.int32)
cyclic = np.zeros((M,), dtype=bool)
rotat = np.zeros((M,), dtype=bool)
lengths = np.zeros((M,), dtype=np.float32)
atom1 = []
atom2 = []
types = []
for j, (bid, bp) in enumerate(bonds.items()):
atom1_idx[j] = labels.index(bp["atom1"])
atom2_idx[j] = labels.index(bp["atom2"])
cyclic[j] = bp["is_cyclic"]
rotat[j] = bp["is_rotatable"]
lengths[j] = bp["length"]
atom1.append(bp["atom1"])
atom2.append(bp["atom2"])
types.append(bp["bond_type"])
grp.create_dataset("bond_id", data=np.array(bids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("bond_atom1", data=np.array(atom1, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("bond_atom2", data=np.array(atom2, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("bond_atom1_idx", data=atom1_idx)
grp.create_dataset("bond_atom2_idx", data=atom2_idx)
grp.create_dataset("bond_type", data=np.array(types, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("bond_is_cyclic", data=cyclic)
grp.create_dataset("bond_is_rotatable", data=rotat)
grp.create_dataset("bond_length", data=lengths)
# --- Contacts ---
contacts_inter = raw.get("intermolecular_contacts", {})
contacts_intra = raw.get("intramolecular_contacts", {})
# Intermolecular contacts
cids = list(contacts_inter.keys())
if len(cids) == 0:
# no contacts → empty 1D arrays for labels/indices and (0,3) for coords
central = np.empty((0,), dtype=object)
contact = np.empty((0,), dtype=object)
central_idx = np.empty((0,), dtype=np.int32)
contact_idx = np.empty((0,), dtype=np.int32)
lengths = np.empty((0,), dtype=np.float32)
strengths = np.empty((0,), dtype=np.float32)
in_line = np.empty((0,), dtype=bool)
sym = np.empty((0,), dtype=object)
central_coords = np.zeros((0, 3), dtype=np.float32)
contact_coords = np.zeros((0, 3), dtype=np.float32)
central_frac_coords = np.zeros((0, 3), dtype=np.float32)
contact_frac_coords = np.zeros((0, 3), dtype=np.float32)
else:
central = np.array([contacts_inter[c]["central_atom"] for c in cids], dtype=object)
contact = np.array([contacts_inter[c]["contact_atom"] for c in cids], dtype=object )
central_idx = np.array([labels.index(contacts_inter[c]["central_atom"]) for c in cids], dtype=np.int32)
contact_idx = np.array([labels.index(contacts_inter[c]["contact_atom"]) for c in cids], dtype=np.int32)
lengths = np.array([contacts_inter[c]["length"] for c in cids], dtype=np.float32)
strengths = np.array([contacts_inter[c]["strength"] for c in cids], dtype=np.float32)
in_line = np.array([contacts_inter[c]["is_in_line_of_sight"] for c in cids], dtype=bool)
sym = np.array([contacts_inter[c]["symmetry_operator"] for c in cids], dtype=object)
central_coords = np.array([contacts_inter[c]["central_atom_coordinates"] for c in cids], dtype=np.float32)
contact_coords = np.array([contacts_inter[c]["contact_atom_coordinates"] for c in cids], dtype=np.float32)
central_frac_coords = np.array([contacts_inter[c]["central_atom_fractional_coordinates"] for c in cids], dtype=np.float32)
contact_frac_coords = np.array([contacts_inter[c]["contact_atom_fractional_coordinates"] for c in cids], dtype=np.float32)
grp.create_dataset("inter_cc_id", data=np.array(cids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_cc_central_atom", data=central, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_cc_contact_atom", data=contact, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_cc_central_atom_idx", data=central_idx)
grp.create_dataset("inter_cc_contact_atom_idx", data=contact_idx)
grp.create_dataset("inter_cc_length", data=lengths)
grp.create_dataset("inter_cc_strength", data=strengths)
grp.create_dataset("inter_cc_in_los", data=in_line)
grp.create_dataset("inter_cc_symmetry", data=sym, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_cc_central_atom_coords", data=central_coords)
grp.create_dataset("inter_cc_contact_atom_coords", data=contact_coords)
grp.create_dataset("inter_cc_central_atom_frac_coords", data=central_frac_coords)
grp.create_dataset("inter_cc_contact_atom_frac_coords", data=contact_frac_coords)
# Intramolecular contacts
cids = list(contacts_intra.keys())
if len(cids) == 0:
# no contacts → empty 1D arrays for labels/indices and (0,3) for coords
central = np.empty((0,), dtype=object)
contact = np.empty((0,), dtype=object)
central_idx = np.empty((0,), dtype=np.int32)
contact_idx = np.empty((0,), dtype=np.int32)
lengths = np.empty((0,), dtype=np.float32)
strengths = np.empty((0,), dtype=np.float32)
in_line = np.empty((0,), dtype=bool)
central_coords = np.zeros((0, 3), dtype=np.float32)
contact_coords = np.zeros((0, 3), dtype=np.float32)
central_frac_coords = np.zeros((0, 3), dtype=np.float32)
contact_frac_coords = np.zeros((0, 3), dtype=np.float32)
else:
central = np.array([contacts_intra[c]["central_atom"] for c in cids], dtype=object)
contact = np.array([contacts_intra[c]["contact_atom"] for c in cids], dtype=object )
central_idx = np.array([labels.index(contacts_intra[c]["central_atom"]) for c in cids], dtype=np.int32)
contact_idx = np.array([labels.index(contacts_intra[c]["contact_atom"]) for c in cids], dtype=np.int32)
lengths = np.array([contacts_intra[c]["length"] for c in cids], dtype=np.float32)
strengths = np.array([contacts_intra[c]["strength"] for c in cids], dtype=np.float32)
in_line = np.array([contacts_intra[c]["is_in_line_of_sight"] for c in cids], dtype=bool)
central_coords = np.array([contacts_intra[c]["central_atom_coordinates"] for c in cids], dtype=np.float32)
contact_coords = np.array([contacts_intra[c]["contact_atom_coordinates"] for c in cids], dtype=np.float32)
central_frac_coords = np.array([contacts_intra[c]["central_atom_fractional_coordinates"] for c in cids], dtype=np.float32)
contact_frac_coords = np.array([contacts_intra[c]["contact_atom_fractional_coordinates"] for c in cids], dtype=np.float32)
grp.create_dataset("intra_cc_id", data=np.array(cids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("intra_cc_central_atom", data=central, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("intra_cc_contact_atom", data=contact, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("intra_cc_central_atom_idx", data=central_idx)
grp.create_dataset("intra_cc_contact_atom_idx", data=contact_idx)
grp.create_dataset("intra_cc_length", data=lengths)
grp.create_dataset("intra_cc_strength", data=strengths)
grp.create_dataset("intra_cc_in_los", data=in_line)
grp.create_dataset("intra_cc_central_atom_coords", data=central_coords)
grp.create_dataset("intra_cc_contact_atom_coords", data=contact_coords)
grp.create_dataset("intra_cc_central_atom_frac_coords", data=central_frac_coords)
grp.create_dataset("intra_cc_contact_atom_frac_coords", data=contact_frac_coords)
# --- Hydrogen bonds ---
hbonds_inter = raw.get("intermolecular_hbonds", {})
hbonds_intra = raw.get("intramolecular_hbonds", {})
# Intermolecular hydrogen bonds
hids = list(hbonds_inter.keys())
if len(hids) == 0:
# no contacts → empty 1D arrays for labels/indices and (0,3) for coords
central = np.empty((0,), dtype=object)
hydrogen = np.empty((0,), dtype=object)
contact = np.empty((0,), dtype=object)
central_idx = np.empty((0,), dtype=np.int32)
hydrogen_idx = np.empty((0,), dtype=np.int32)
contact_idx = np.empty((0,), dtype=np.int32)
lengths = np.empty((0,), dtype=np.float32)
angles = np.empty((0,), dtype=np.float32)
in_line = np.empty((0,), dtype=bool)
sym = np.empty((0,), dtype=object)
central_coords = np.zeros((0, 3), dtype=np.float32)
hydrogen_coords = np.zeros((0, 3), dtype=np.float32)
contact_coords = np.zeros((0, 3), dtype=np.float32)
central_frac_coords = np.zeros((0, 3), dtype=np.float32)
hydrogen_frac_coords = np.zeros((0, 3), dtype=np.float32)
contact_frac_coords = np.zeros((0, 3), dtype=np.float32)
else:
central = np.array([hbonds_inter[h]["central_atom"] for h in hids], dtype=object)
hydrogen = np.array([hbonds_inter[h]["hydrogen_atom"] for h in hids], dtype=object)
contact = np.array([hbonds_inter[h]["contact_atom"] for h in hids], dtype=object)
central_idx = np.array([labels.index(hbonds_inter[h]["central_atom"]) for h in hids], dtype=np.int32)
hydrogen_idx = np.array([labels.index(hbonds_inter[h]["hydrogen_atom"]) for h in hids], dtype=np.int32)
contact_idx = np.array([labels.index(hbonds_inter[h]["contact_atom"]) for h in hids], dtype=np.int32)
lengths = np.array([hbonds_inter[h]["length"] for h in hids], dtype=np.float32)
angles = np.array([hbonds_inter[h]["angle"] for h in hids], dtype=np.float32)
in_line = np.array([hbonds_inter[h]["is_in_line_of_sight"] for h in hids], dtype=bool)
sym = np.array([hbonds_inter[h]["symmetry_operator"] for h in hids], dtype=object)
central_coords = np.array([hbonds_inter[h]["central_atom_coordinates"] for h in hids], dtype=np.float32)
hydrogen_coords = np.array([hbonds_inter[h]["hydrogen_atom_coordinates"] for h in hids], dtype=np.float32)
contact_coords = np.array([hbonds_inter[h]["contact_atom_coordinates"] for h in hids], dtype=np.float32)
central_frac_coords = np.array([hbonds_inter[h]["central_atom_fractional_coordinates"] for h in hids], dtype=np.float32)
hydrogen_frac_coords = np.array([hbonds_inter[h]["hydrogen_atom_fractional_coordinates"] for h in hids], dtype=np.float32)
contact_frac_coords = np.array([hbonds_inter[h]["contact_atom_fractional_coordinates"] for h in hids], dtype=np.float32)
grp.create_dataset("inter_hb_id", data=np.array(hids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_hb_central_atom", data=central, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_hb_hydrogen_atom", data=hydrogen, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_hb_contact_atom", data=contact, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_hb_central_atom_idx", data=central_idx)
grp.create_dataset("inter_hb_hydrogen_atom_idx", data=hydrogen_idx)
grp.create_dataset("inter_hb_contact_atom_idx", data=contact_idx)
grp.create_dataset("inter_hb_length", data=lengths)
grp.create_dataset("inter_hb_angle", data=angles)
grp.create_dataset("inter_hb_in_los", data=in_line)
grp.create_dataset("inter_hb_symmetry", data=sym, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("inter_hb_central_atom_coords", data=central_coords)
grp.create_dataset("inter_hb_hydrogen_atom_coords", data=hydrogen_coords)
grp.create_dataset("inter_hb_contact_atom_coords", data=contact_coords)
grp.create_dataset("inter_hb_central_atom_frac_coords", data=central_frac_coords)
grp.create_dataset("inter_hb_hydrogen_atom_frac_coords", data=hydrogen_frac_coords)
grp.create_dataset("inter_hb_contact_atom_frac_coords", data=contact_frac_coords)
# Intramolecular hydrogen bonds
hids = list(hbonds_intra.keys())
if len(hids) == 0:
# no contacts → empty 1D arrays for labels/indices and (0,3) for coords
central = np.empty((0,), dtype=object)
hydrogen = np.empty((0,), dtype=object)
contact = np.empty((0,), dtype=object)
central_idx = np.empty((0,), dtype=np.int32)
hydrogen_idx = np.empty((0,), dtype=np.int32)
contact_idx = np.empty((0,), dtype=np.int32)
lengths = np.empty((0,), dtype=np.float32)
angles = np.empty((0,), dtype=np.float32)
in_line = np.empty((0,), dtype=bool)
central_coords = np.zeros((0, 3), dtype=np.float32)
hydrogen_coords = np.zeros((0, 3), dtype=np.float32)
contact_coords = np.zeros((0, 3), dtype=np.float32)
central_frac_coords = np.zeros((0, 3), dtype=np.float32)
hydrogen_frac_coords = np.zeros((0, 3), dtype=np.float32)
contact_frac_coords = np.zeros((0, 3), dtype=np.float32)
else:
central = np.array([hbonds_intra[h]["central_atom"] for h in hids], dtype=object)
hydrogen = np.array([hbonds_intra[h]["hydrogen_atom"] for h in hids], dtype=object)
contact = np.array([hbonds_intra[h]["contact_atom"] for h in hids], dtype=object)
central_idx = np.array([labels.index(hbonds_intra[h]["central_atom"]) for h in hids], dtype=np.int32)
hydrogen_idx = np.array([labels.index(hbonds_intra[h]["hydrogen_atom"]) for h in hids], dtype=np.int32)
contact_idx = np.array([labels.index(hbonds_intra[h]["contact_atom"]) for h in hids], dtype=np.int32)
lengths = np.array([hbonds_intra[h]["length"] for h in hids], dtype=np.float32)
angles = np.array([hbonds_intra[h]["angle"] for h in hids], dtype=np.float32)
in_line = np.array([hbonds_intra[h]["is_in_line_of_sight"] for h in hids], dtype=bool)
central_coords = np.array([hbonds_intra[h]["central_atom_coordinates"] for h in hids], dtype=np.float32)
hydrogen_coords = np.array([hbonds_intra[h]["hydrogen_atom_coordinates"] for h in hids], dtype=np.float32)
contact_coords = np.array([hbonds_intra[h]["contact_atom_coordinates"] for h in hids], dtype=np.float32)
central_frac_coords = np.array([hbonds_intra[h]["central_atom_fractional_coordinates"] for h in hids], dtype=np.float32)
hydrogen_frac_coords = np.array([hbonds_intra[h]["hydrogen_atom_fractional_coordinates"] for h in hids], dtype=np.float32)
contact_frac_coords = np.array([hbonds_intra[h]["contact_atom_fractional_coordinates"] for h in hids], dtype=np.float32)
grp.create_dataset("intra_hb_id", data=np.array(hids, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("intra_hb_central_atom", data=central, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("intra_hb_hydrogen_atom", data=hydrogen, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("intra_hb_contact_atom", data=contact, dtype=h5py.string_dtype(encoding="utf-8"))
grp.create_dataset("intra_hb_central_atom_idx", data=central_idx)
grp.create_dataset("intra_hb_hydrogen_atom_idx", data=hydrogen_idx)
grp.create_dataset("intra_hb_contact_atom_idx", data=contact_idx)
grp.create_dataset("intra_hb_length", data=lengths)
grp.create_dataset("intra_hb_angle", data=angles)
grp.create_dataset("intra_hb_in_los", data=in_line)
grp.create_dataset("intra_hb_central_atom_coords", data=central_coords)
grp.create_dataset("intra_hb_hydrogen_atom_coords", data=hydrogen_coords)
grp.create_dataset("intra_hb_contact_atom_coords", data=contact_coords)
grp.create_dataset("intra_hb_central_atom_frac_coords", data=central_frac_coords)
grp.create_dataset("intra_hb_hydrogen_atom_frac_coords", data=hydrogen_frac_coords)
grp.create_dataset("intra_hb_contact_atom_frac_coords", data=contact_frac_coords)