Source code for data_writer

"""
Module: data_writer.py

Provides RawDataWriter and ComputedDataWriter to write raw and computed
datasets into a processed HDF5 file. All per‐structure data is written
slice‐by‐slice, with variable‐length (vlen) datasets for atoms, bonds,
contacts, and H‐bonds.

Dependencies
------------
h5py
numpy
torch
"""
import h5py
import numpy as np
import torch
from typing import Dict, Any

[docs] class RawDataWriter: """ Write raw crystal, atom, bond, intramolecular and intermolecular contact, and H-bond data into the output HDF5 file, slice-by-slice. Attributes ---------- h5_out : h5py.File Open HDF5 file for writing processed data. Methods ------- write_raw_crystal_data(start, crystal_parameters) Write raw crystal parameters into the HDF5 datasets. write_raw_atom_data(start, atom_parameters) Write raw per-atom data into the HDF5 datasets. write_raw_bond_data(start, bond_parameters) Write raw per-bond data into the HDF5 datasets. write_raw_intramolecular_contact_data(start, intra_cc_parameters) Write raw intra-molecular contact data into the HDF5 datasets. write_raw_intramolecular_hbond_data(start, intra_hb_parameters) Write raw intra-molecular H-bond data into the HDF5 datasets. """
[docs] def __init__(self, h5_out: h5py.File): """ Parameters ---------- h5_out : h5py.File Open HDF5 file for writing processed data. """ self.h5_out = h5_out
[docs] def write_raw_crystal_data( self, start: int, crystal_parameters: Dict[str, Any] ) -> None: """ Write raw crystal parameters into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. crystal_parameters : Dict[str, Any] Dictionary of raw crystal-level arrays or tensors: 'cell_lengths', 'cell_angles', and any scalar metrics. Returns ------- None """ # 1) Convert any torch tensors to numpy on CPU arrs: Dict[str, np.ndarray] = {} for key, vals in crystal_parameters.items(): if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) # 2) Determine batch‐size B B = next(iter(arrs.values())).shape[0] # 3) Slice each dataset appropriately for key, arr in arrs.items(): ds = self.h5_out[key] if arr.ndim == 1: ds[start:start+B] = arr elif arr.ndim == 2: ds[start:start+B, :] = arr elif arr.ndim == 3: ds[start:start+B, :, :] = arr else: raise ValueError(f"Unsupported ndim {arr.ndim} for raw crystal field '{key}'")
[docs] def write_raw_atom_data( self, start: int, atom_parameters: Dict[str, np.ndarray] ) -> None: """ Write raw per‐atom data into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. atom_parameters : Dict[str, Any] Dictionary containing per-atom raw data: 'atom_label', 'atom_symbol', 'atom_number', 'atom_coords', 'atom_frac_coords', 'atom_weight', 'atom_charge', 'atom_sybyl_type', 'atom_neighbour_list', and 'atom_mask'. Returns ------- None """ B, N_max = atom_parameters['atom_number'].shape # 1) Define your key groups int_keys = ['atom_number'] flat3_keys = ['atom_coords', 'atom_frac_coords'] float1_keys = ['atom_weight', 'atom_charge'] vlen_str_keys = ['atom_label', 'atom_symbol', 'atom_sybyl_type', 'atom_neighbour_list'] # 2) Pre-convert all numeric/coord tensors → NumPy on CPU arrs: Dict[str, np.ndarray] = {} for key in int_keys + float1_keys + flat3_keys + ['atom_mask']: vals = atom_parameters[key] if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) for i in range(B): idx = start + i na = int(arrs['atom_mask'][i].sum()) # 3) number of atoms self.h5_out['n_atoms'][idx] = na # 4) integer arrays for key in int_keys: seq = arrs[key][i, :na].astype(np.int32) self.h5_out[key][idx] = seq # 5) flattened coords for key in flat3_keys: block = arrs[key][i, :na, :] # shape (na,3) self.h5_out[key][idx] = block.reshape(-1) # 6) 1-D float arrays for key in float1_keys: seq = arrs[key][i, :na].astype(np.float32) self.h5_out[key][idx] = seq # 7) vlen-string lists (never force into a 2D array) for key in vlen_str_keys: vals = atom_parameters[key] if torch.is_tensor(vals): # unlikely, but handle it seq = vals[i, :na].detach().cpu().numpy().tolist() elif isinstance(vals, np.ndarray): seq = vals[i, :na].tolist() else: # Python list of lists seq = vals[i][:na] # write as list of str self.h5_out[key][idx] = [str(x) for x in seq]
[docs] def write_raw_bond_data( self, start: int, bond_parameters: Dict[str, Any] ) -> None: """ Write raw per‐bond data into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. bond_parameters : Dict[str, Any] Dictionary containing per-bond raw data: 'n_bonds', 'bond_atom1_idx', 'bond_atom2_idx', 'bond_atom1', 'bond_atom2', 'bond_type', 'bond_is_rotatable_raw', 'bond_is_cyclic', and 'bond_length'. Returns ------- None """ # 1) Key groups count_key = 'n_bonds' int_keys = ['bond_atom1_idx', 'bond_atom2_idx'] bool_keys = ['bond_is_cyclic'] vlen_str_keys = ['bond_atom1', 'bond_atom2', 'bond_type'] vlen_bool_keys = ['bond_is_rotatable_raw'] vlen_float_keys= ['bond_length'] # 2) Pre-convert fixed-shape arrays → NumPy on CPU arrs: Dict[str, np.ndarray] = {} for key in (count_key, *int_keys, *bool_keys): vals = bond_parameters[key] if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) B = int(arrs[count_key].shape[0]) # 3) Write each structure for i in range(B): idx = start + i nb = int(arrs[count_key][i]) # 3a) bond count self.h5_out[count_key][idx] = nb # 3b) fixed-length ints for key in int_keys: seq = arrs[key][i, :nb].astype(np.int32) self.h5_out[key][idx] = seq # 3c) fixed-length bools for key in bool_keys: seq = arrs[key][i, :nb].astype(bool) self.h5_out[key][idx] = seq # 3d) vlen strings for key in vlen_str_keys: vals = bond_parameters[key] if torch.is_tensor(vals): row = vals[i, :nb].detach().cpu().numpy().tolist() elif isinstance(vals, np.ndarray): row = vals[i, :nb].tolist() else: row = vals[i][:nb] self.h5_out[key][idx] = [str(x) for x in row] # 3e) vlen bools for key in vlen_bool_keys: vals = bond_parameters[key] if torch.is_tensor(vals): row = vals[i, :nb].detach().cpu().numpy().astype(bool) elif isinstance(vals, np.ndarray): row = vals[i, :nb].astype(bool) else: row = np.array(vals[i][:nb], dtype=bool) self.h5_out[key][idx] = row # 3f) vlen floats for key in vlen_float_keys: vals = bond_parameters[key] if torch.is_tensor(vals): row = vals[i, :nb].detach().cpu().numpy().astype(np.float32) elif isinstance(vals, np.ndarray): row = vals[i, :nb].astype(np.float32) else: row = np.array(vals[i][:nb], dtype=np.float32) self.h5_out[key][idx] = row
[docs] def write_raw_intramolecular_contact_data( self, start: int, intra_cc_parameters: Dict[str, Any] ) -> None: """ Write raw intramolecular contact data into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. intra_cc_parameters : Dict[str, Any] Dictionary containing raw intra-molecular contact data: 'intra_cc_id', 'intra_cc_central_atom', 'intra_cc_contact_atom', 'intra_cc_central_atom_idx', 'intra_cc_contact_atom_idx', 'intra_cc_central_atom_coords', 'intra_cc_contact_atom_coords', 'intra_cc_central_atom_frac_coords', 'intra_cc_contact_atom_frac_coords', 'intra_cc_length', 'intra_cc_strength', and 'intra_cc_in_los'. Returns ------- None """ flat3_keys = ['intra_cc_central_atom_coords', 'intra_cc_contact_atom_coords'] flat3_frac_keys = ['intra_cc_central_atom_frac_coords', 'intra_cc_contact_atom_frac_coords'] float1_keys = ['intra_cc_length', 'intra_cc_strength'] int_keys = ['intra_cc_central_atom_idx', 'intra_cc_contact_atom_idx'] bool_keys = ['intra_cc_in_los'] arrs: Dict[str, np.ndarray] = {} for key in flat3_keys + flat3_frac_keys + float1_keys + bool_keys + int_keys: vals = intra_cc_parameters[key] if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) # 2) String‐list inputs (keep as Python lists or object‐arrays) labels_cl = intra_cc_parameters['intra_cc_central_atom'] str_keys = ['intra_cc_id', 'intra_cc_central_atom','intra_cc_contact_atom'] B = len(labels_cl) for i in range(B): idx = start + i nC = len(labels_cl[i]) # 4) count self.h5_out['intra_n_contacts'][idx] = nC # 5) vlen‐string fields for key in str_keys: seq = intra_cc_parameters[key] if torch.is_tensor(seq): row = seq[i, :nC].detach().cpu().numpy().tolist() elif isinstance(seq, np.ndarray): row = seq[i, :nC].tolist() else: row = seq[i][:nC] self.h5_out[key][idx] = [str(x) for x in row] # 6) vlen‐int fields for key in int_keys: vals = intra_cc_parameters[key] if torch.is_tensor(vals): row = vals[i, :nC].detach().cpu().numpy().astype(np.int32) elif isinstance(vals, np.ndarray): row = vals[i, :nC].astype(np.int32) else: row = np.array(vals[i][:nC], dtype=np.int32) self.h5_out[key][idx] = row # 7) flattened Cartesian coords for key in flat3_keys: block = arrs[key][i, :nC, :] self.h5_out[key][idx] = block.reshape(-1) # 8) flattened fractional coords for key in flat3_frac_keys: block = arrs[key][i, :nC, :] self.h5_out[key][idx] = block.reshape(-1) # 9) 1‐D floats for key in float1_keys: seq = arrs[key][i, :nC].astype(np.float32) self.h5_out[key][idx] = seq # 10) 1‐D bools for key in bool_keys: seq = arrs[key][i, :nC].astype(bool) self.h5_out[key][idx] = seq
[docs] def write_raw_intramolecular_hbond_data( self, start: int, intra_hb_parameters: Dict[str, Any] ) -> None: """ Write raw intramolecular H‐bond data into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. intra_hb_parameters : Dict[str, Any] Dictionary containing raw intra-molecular H-bond data: 'intra_hb_id', 'intra_hb_central_atom', 'intra_hb_hydrogen_atom', 'intra_hb_contact_atom', 'intra_hb_central_atom_idx', 'intra_hb_hydrogen_atom_idx', 'intra_hb_contact_atom_idx', 'intra_hb_central_atom_coords', 'intra_hb_hydrogen_atom_coords', 'intra_hb_contact_atom_coords', 'intra_hb_central_atom_frac_coords', 'intra_hb_hydrogen_atom_frac_coords', 'intra_hb_contact_atom_frac_coords', 'intra_hb_length', 'intra_hb_angle', and 'intra_hb_in_los'. Returns ------- None """ # 1) Prepare NumPy arrays for all fixed‐shape fields flat3_keys = [ 'intra_hb_central_atom_coords', 'intra_hb_hydrogen_atom_coords', 'intra_hb_contact_atom_coords' ] flat3_frac_keys = [ 'intra_hb_central_atom_frac_coords', 'intra_hb_hydrogen_atom_frac_coords', 'intra_hb_contact_atom_frac_coords' ] float1_keys = ['intra_hb_length', 'intra_hb_angle'] int_keys = [ 'intra_hb_central_atom_idx', 'intra_hb_hydrogen_atom_idx', 'intra_hb_contact_atom_idx' ] bool_keys = ['intra_hb_in_los'] arrs: Dict[str, np.ndarray] = {} for key in flat3_keys + flat3_frac_keys + float1_keys + bool_keys + int_keys: vals = intra_hb_parameters[key] if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) # 2) vlen‐string inputs labels_cl = intra_hb_parameters['intra_hb_central_atom'] str_keys = [ 'intra_hb_id', 'intra_hb_central_atom', 'intra_hb_hydrogen_atom', 'intra_hb_contact_atom' ] B = len(labels_cl) for i in range(B): idx = start + i nH = len(labels_cl[i]) # 4) number of H-bonds self.h5_out['intra_n_hbonds'][idx] = nH # 5) vlen‐string fields for key in str_keys: seq = intra_hb_parameters[key] if torch.is_tensor(seq): row = seq[i, :nH].detach().cpu().numpy().tolist() elif isinstance(seq, np.ndarray): row = seq[i, :nH].tolist() else: row = seq[i][:nH] self.h5_out[key][idx] = [str(x) for x in row] # 6) vlen‐int fields for key in int_keys: vals = intra_hb_parameters[key] if torch.is_tensor(vals): row = vals[i, :nH].detach().cpu().numpy().astype(np.int32) elif isinstance(vals, np.ndarray): row = vals[i, :nH].astype(np.int32) else: row = np.array(vals[i][:nH], dtype=np.int32) self.h5_out[key][idx] = row # 7) flattened Cartesian coords for key in flat3_keys: block = arrs[key][i, :nH, :] # (nH,3) self.h5_out[key][idx] = block.reshape(-1) # 8) flattened fractional coords for key in flat3_frac_keys: block = arrs[key][i, :nH, :] self.h5_out[key][idx] = block.reshape(-1) # 9) 1‐D floats for key in float1_keys: seq = arrs[key][i, :nH].astype(np.float32) self.h5_out[key][idx] = seq # 10) 1‐D bools for key in bool_keys: seq = arrs[key][i, :nH].astype(bool) self.h5_out[key][idx] = seq
[docs] class ComputedDataWriter: """ Write computed crystal, atom, bond, molecule, and contact/H-bond features into the output HDF5 file. Attributes ---------- h5_out : h5py.File Open HDF5 file for writing processed data. Methods ------- write_computed_crystal_data(start: int, crystal_parameters: Dict[str, Any]) -> None Write computed crystal parameters ('scaled_cell', 'cell_matrix') into the HDF5 datasets. write_computed_atom_data(start: int, atom_parameters: Dict[str, Any]) -> None Write computed atom-level features ('atom_fragment_id', 'atom_dist_to_special_planes') into the HDF5 datasets. write_computed_bond_data(start: int, bond_parameters: Dict[str, Any]) -> None Write computed bond-level features ('bond_is_rotatable', 'bond_vector_angles_to_special_planes') into the HDF5 datasets. write_computed_molecule_data(start: int, molecule_parameters: Dict[str, Any]) -> None Write computed intra-molecular bond angles and torsion features into the HDF5 datasets. write_computed_intermolecular_contact_data(start: int, inter_cc_parameters: Dict[str, Any]) -> None Write computed intermolecular contact features (IDs, indices, coords, lengths, strengths, h-bond flags, fragment mappings, vectors) into the HDF5 datasets. write_computed_intermolecular_hbond_data(start: int, inter_hb_parameters: Dict[str, Any]) -> None Write computed intermolecular H-bond features (IDs, donor/acceptor labels, indices, coords, lengths, angles, masks, symmetry ops) into the HDF5 datasets. """
[docs] def __init__(self, h5_out: h5py.File): """ Initialize ComputedDataWriter. Parameters ---------- h5_out : h5py.File Open HDF5 file for writing processed data. """ self.h5_out = h5_out
[docs] def write_computed_crystal_data( self, start: int, crystal_parameters: Dict[str, Any] ) -> None: """ Write computed crystal parameters into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. crystal_parameters : Dict[str, Any] Dictionary containing computed crystal data: 'scaled_cell' (shape (B, 6)) and 'cell_matrix' (shape (B, 3, 3)). Returns ------- None """ # First convert any torch.Tensors to numpy arrays arrs: Dict[str, np.ndarray] = {} for key, vals in crystal_parameters.items(): if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) # Batch‐size B = next(iter(arrs.values())).shape[0] for key, arr in arrs.items(): ds = self.h5_out[key] if arr.ndim == 2: ds[start:start+B, :] = arr elif arr.ndim == 3: ds[start:start+B, :, :] = arr else: # fallback—e.g. someone might pass a 1‐D array of scalars ds[start:start+B] = arr
[docs] def write_computed_atom_data( self, start: int, atom_parameters: Dict[str, Any] ) -> None: """ Write computed atom-level features into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. atom_parameters : Dict[str, Any] Dictionary containing computed atom data: 'atom_fragment_id' (list or array of shape (B, N)), 'atom_dist_to_special_planes' (shape (B, N, P)). Returns ------- None """ B = len(atom_parameters['atom_fragment_id']) # Pre-extract & convert the distances array once dist_arr = atom_parameters['atom_dist_to_special_planes'] if torch.is_tensor(dist_arr): dist_arr = dist_arr.detach().cpu().numpy() # Pre-extract & convert the fragment-id container once frag_container = atom_parameters['atom_fragment_id'] # If it's a single Tensor of shape (B, N_max), convert to numpy if torch.is_tensor(frag_container): frag_container = frag_container.detach().cpu().numpy().tolist() # Otherwise assume it's already a list of lists for i in range(B): idx = start + i # 1) number of real atoms na = int(self.h5_out['n_atoms'][idx]) # 2) fragment IDs # frag_container[i] is now a Python list all_ids = np.array(frag_container[i], dtype=np.int32) frag_ids = all_ids[:na] frag_ids = frag_ids[frag_ids >= 0] self.h5_out['atom_fragment_id'][idx] = frag_ids # 3) distances: slice then flatten block = dist_arr[i, :na, :] # shape (na, 26) flattened = block.reshape(-1) # length = na * 26 self.h5_out['atom_distances_to_special_planes'][idx] = flattened
[docs] def write_computed_bond_data( self, start: int, bond_parameters: Dict[str, Any] ) -> None: """ Write computed bond-level features into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. bond_parameters : Dict[str, Any] Dictionary containing computed bond data: 'bond_is_rotatable' (shape (B, M)) and 'bond_vector_angles_to_special_planes' (shape (B, M, K)). Returns ------- None """ # 1) Extract & convert the full arrays once rot_arr = bond_parameters['bond_is_rotatable'] if torch.is_tensor(rot_arr): rot_arr = rot_arr.detach().cpu().numpy() ang_arr = bond_parameters['bond_vector_angles_to_special_planes'] if torch.is_tensor(ang_arr): ang_arr = ang_arr.detach().cpu().numpy() # 2) Loop over structures B, M_max = rot_arr.shape for i in range(B): idx = start + i # how many real bonds? nb = int(self.h5_out['n_bonds'][idx]) # slice off the first nb flags (shape (nb,)) comp_rot = rot_arr[i, :nb] self.h5_out['bond_is_rotatable'][idx] = comp_rot # slice & flatten the angles (shape (nb,13) → (nb*13,)) angle_block = ang_arr[i, :nb, :] flat_angles = angle_block.reshape(-1) self.h5_out['bond_vector_angles_to_special_planes'][idx] = flat_angles
[docs] def write_computed_molecule_data( self, start: int, molecule_parameters: Dict[str, Any] ) -> None: """ Write computed intra-molecular bond angles and torsion features. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. molecule_parameters : Dict[str, Any] Dictionary containing computed molecule data: 'bond_angle_id', 'bond_angle', 'bond_angle_mask', 'bond_angle_atom_idx', 'torsion_id', 'torsion', 'torsion_mask', and 'torsion_atom_idx'. Returns ------- None """ # 1) Convert any torch.Tensor → numpy array for numeric & boolean keys float_keys = ['bond_angle', 'torsion'] bool_keys = ['bond_angle_mask', 'torsion_mask'] int_keys = ['bond_angle_atom_idx', 'torsion_atom_idx'] arrs: Dict[str, np.ndarray] = {} for key in float_keys + bool_keys + int_keys: vals = molecule_parameters[key] if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) # 2) String‐ID keys stay as Python lists or numpy arrays of strings str_keys = ['bond_angle_id', 'torsion_id'] # 3) Number of structures in this batch B = len(molecule_parameters[str_keys[0]]) for i in range(B): idx = start + i # --- vlen strings --- for key in str_keys: vals = molecule_parameters[key] if torch.is_tensor(vals): row = vals[i].detach().cpu().numpy().tolist() elif isinstance(vals, np.ndarray): row = vals[i].tolist() else: row = list(vals[i]) self.h5_out[key][idx] = row # --- vlen floats --- for key in float_keys: arr = arrs[key] # length = number of IDs for this field n_items = len(molecule_parameters[f"{key}_id"][i]) row = arr[i, :n_items] if isinstance(arr, np.ndarray) else arr[i] self.h5_out[key][idx] = np.array(row, dtype=np.float32) # --- vlen bools --- for key in bool_keys: arr = arrs[key] n_items = len(molecule_parameters[key.replace('_mask', '_id')][i]) row = arr[i, :n_items] if isinstance(arr, np.ndarray) else arr[i] self.h5_out[key][idx] = np.array(row, dtype=bool) # --- vlen ints (flatten last axis) --- for key in int_keys: arr = arrs[key] # pick the right ID list to get length id_key = 'bond_angle_id' if 'angle' in key else 'torsion_id' n_items = len(molecule_parameters[id_key][i]) if isinstance(arr, np.ndarray): block = arr[i, :n_items, :] flat = block.reshape(-1) else: # list of tuples → flatten flat = [atom for group in molecule_parameters[key][i] for atom in group] self.h5_out[key][idx] = np.array(flat, dtype=np.int32)
[docs] def write_computed_intermolecular_contact_data( self, start: int, inter_cc_parameters: Dict[str, Any] ) -> None: """ Write computed intermolecular contact features into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. inter_cc_parameters : Dict[str, Any] Dictionary containing computed intermolecular contact data, including IDs, atom labels, indices, coords, frac_coords, lengths, strengths, masks, symmetry ops, hbond flags, and fragment‐mapping and vector fields. Returns ------- None """ # Define groups of keys by how to write them float3_keys = [ 'inter_cc_central_atom_coords', 'inter_cc_contact_atom_coords', 'inter_cc_central_atom_frac_coords', 'inter_cc_contact_atom_frac_coords', 'inter_cc_contact_atom_to_fragment_com_vec', 'inter_cc_contact_atom_to_fragment_com_frac_vec', 'inter_cc_symmetry_T', 'inter_cc_symmetry_T_inv' ] float1_keys = [ 'inter_cc_length', 'inter_cc_strength', 'inter_cc_contact_atom_to_fragment_com_dist', 'inter_cc_contact_atom_to_fragment_com_frac_dist', ] int_keys = [ 'inter_cc_central_atom_idx', 'inter_cc_contact_atom_idx', 'inter_cc_central_atom_fragment_idx', 'inter_cc_contact_atom_fragment_idx', ] matrix9_keys = ['inter_cc_symmetry_A', 'inter_cc_symmetry_A_inv'] bool_keys = ['inter_cc_in_los', 'inter_cc_is_hbond'] arrs: Dict[str, np.ndarray] = {} for key in float1_keys + float3_keys + int_keys + matrix9_keys + bool_keys: vals = inter_cc_parameters[key] if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) str_keys = ['inter_cc_central_atom', 'inter_cc_contact_atom'] labels_cl = inter_cc_parameters['inter_cc_central_atom'] labels_ct = inter_cc_parameters['inter_cc_contact_atom'] B = len(labels_cl) for i in range(B): idx = start + i nC = len(labels_cl[i]) # 1) number of contacts self.h5_out['inter_cc_n_contacts'][idx] = nC # 2) IDs for each contact pair ids = [f"{labels_cl[i][j]}-{labels_ct[i][j]}" for j in range(nC)] self.h5_out['inter_cc_id'][idx] = np.array(ids) # 3) variable-length string arrays for key in str_keys: seq = inter_cc_parameters[key] vals = seq[i] if isinstance(seq, list) else seq[i][:nC] self.h5_out[key][idx] = np.array([str(x) for x in vals]) # 4) flattened 3-D vectors → (3*nC,) for key in float3_keys: block = arrs[key][i][:nC] flat = block.reshape(-1) self.h5_out[key][idx] = flat # 5) 1-D float arrays → (nC,) for key in float1_keys: row = arrs[key][i][:nC] self.h5_out[key][idx] = row.astype(np.float32) # 6) integer arrays → (nC,) for key in int_keys: row = arrs[key][i][:nC] self.h5_out[key][idx] = row.astype(np.int32) # 7) boolean arrays → (nC,) for key in bool_keys: row = arrs[key][i][:nC] self.h5_out[key][idx] = row.astype(bool) # 8) symmetry matrices (3×3 → 9 floats per contact) → (9*nC,) for key in matrix9_keys: block = arrs[key][i][:nC] flat = block.reshape(-1) self.h5_out[key][idx] = flat.astype(np.float32)
[docs] def write_computed_intermolecular_hbond_data( self, start: int, inter_hb_parameters: Dict[str, Any] ) -> None: """ Write computed intermolecular H‐bond features into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. inter_hb_parameters : Dict[str, Any] Dictionary containing computed intermolecular H‐bond data, including IDs, donor/acceptor labels, indices, coords, frac_coords, lengths, angles, masks, symmetry ops, etc. Returns ------- None """ # Define groups of keys by how to write them float3_keys = [ 'inter_hb_central_atom_coords', 'inter_hb_hydrogen_atom_coords', 'inter_hb_contact_atom_coords', 'inter_hb_central_atom_frac_coords', 'inter_hb_hydrogen_atom_frac_coords', 'inter_hb_contact_atom_frac_coords', 'inter_hb_symmetry_T', 'inter_hb_symmetry_T_inv' ] float1_keys = [ 'inter_hb_length', 'inter_hb_angle', ] int_keys = [ 'inter_hb_central_atom_idx', 'inter_hb_hydrogen_atom_idx', 'inter_hb_contact_atom_idx', ] matrix9_keys = ['inter_hb_symmetry_A', 'inter_hb_symmetry_A_inv'] bool_keys = ['inter_hb_in_los'] arrs: Dict[str, np.ndarray] = {} for key in float1_keys + float3_keys + int_keys + matrix9_keys + bool_keys: vals = inter_hb_parameters[key] if torch.is_tensor(vals): arrs[key] = vals.detach().cpu().numpy() else: arrs[key] = np.asarray(vals) str_keys = [ 'inter_hb_central_atom', 'inter_hb_hydrogen_atom', 'inter_hb_contact_atom' ] labels_cl = inter_hb_parameters['inter_hb_central_atom'] labels_h = inter_hb_parameters['inter_hb_hydrogen_atom'] labels_ct = inter_hb_parameters['inter_hb_contact_atom'] B = len(labels_cl) for i in range(B): idx = start + i nC = len(labels_cl[i]) # 1) number of contacts self.h5_out['inter_hb_n_hbonds'][idx] = nC # 2) IDs for each contact pair ids = [f"{labels_cl[i][j]}-{labels_h[i][j]}-{labels_ct[i][j]}" for j in range(nC)] self.h5_out['inter_hb_id'][idx] = np.array(ids) # 3) variable-length string arrays for key in str_keys: seq = inter_hb_parameters[key] vals = seq[i] if isinstance(seq, list) else seq[i][:nC] self.h5_out[key][idx] = np.array([str(x) for x in vals]) # 4) flattened 3-D vectors → (3*nC,) for key in float3_keys: block = arrs[key][i][:nC] flat = block.reshape(-1) self.h5_out[key][idx] = flat # 5) 1-D float arrays → (nC,) for key in float1_keys: row = arrs[key][i][:nC] self.h5_out[key][idx] = row.astype(np.float32) # 6) integer arrays → (nC,) for key in int_keys: row = arrs[key][i][:nC] self.h5_out[key][idx] = row.astype(np.int32) # 7) boolean arrays → (nC,) for key in bool_keys: row = arrs[key][i][:nC] self.h5_out[key][idx] = row.astype(bool) # 8) symmetry matrices (3×3 → 9 floats per contact) → (9*nC,) for key in matrix9_keys: block = arrs[key][i][:nC] flat = block.reshape(-1) self.h5_out[key][idx] = flat.astype(np.float32)
[docs] def write_computed_fragment_data( self, start: int, fragment_parameters: Dict[str, Any] ) -> None: """ Write computed fragment-level properties into the HDF5 datasets. Parameters ---------- start : int Index offset in the output datasets corresponding to this batch. fragment_parameters : Dict[str, Any] Dictionary containing computed fragment data: - 'n_fragments' : (B,) or list of ints - 'fragment_local_id' : (B, nF) or list of lists of ints - 'fragment_formula' : list of lists of str - 'fragment_n_atoms' : (B, nF) or list of lists of ints - all other keys as numpy/Tensor arrays: 'fragment_com_coords', 'fragment_com_frac_coords', 'fragment_cen_coords', 'fragment_cen_frac_coords', 'fragment_inertia_tensors', 'fragment_inertia_eigvals', 'fragment_inertia_eigvecs', 'fragment_inertia_quaternions', 'fragment_quadrupole_tensors', 'fragment_quadrupole_eigvals', 'fragment_quadrupole_eigvecs', 'fragment_quadrupole_quaternions', 'fragment_atom_to_com_dist', 'fragment_atom_to_com_frac_dist', 'fragment_atom_to_com_vec', 'fragment_atom_to_com_frac_vec', 'fragment_Ql', 'fragment_plane_centroid', 'fragment_plane_normal', 'fragment_planarity_rmsd', 'fragment_planarity_max_dev', 'fragment_planarity_score' Returns ------- None """ # 1) Convert all numeric / tensor fields to numpy arrays # now including fragment_structure_id so we can group by structure int_keys = [ 'fragment_structure_id', 'fragment_local_id', 'fragment_n_atoms' ] float_keys = [ 'fragment_com_coords', 'fragment_com_frac_coords', 'fragment_cen_coords', 'fragment_cen_frac_coords', 'fragment_inertia_tensors', 'fragment_inertia_eigvals', 'fragment_inertia_eigvecs', 'fragment_inertia_quaternions', 'fragment_quadrupole_tensors', 'fragment_quadrupole_eigvals', 'fragment_quadrupole_eigvecs', 'fragment_quadrupole_quaternions', 'fragment_atom_to_com_dist', 'fragment_atom_to_com_frac_dist', 'fragment_atom_to_com_vec', 'fragment_atom_to_com_frac_vec', 'fragment_Ql', 'fragment_plane_centroid', 'fragment_plane_normal', 'fragment_planarity_rmsd', 'fragment_planarity_max_dev', 'fragment_planarity_score' ] arrs_int = {} arrs_flt = {} for key in int_keys: vals = fragment_parameters[key] arrs_int[key] = ( vals.detach().cpu().numpy() if torch.is_tensor(vals) else np.asarray(vals) ) for key in float_keys: vals = fragment_parameters[key] arrs_flt[key] = ( vals.detach().cpu().numpy() if torch.is_tensor(vals) else np.asarray(vals) ) # 2) Extract formulas formulas = fragment_parameters['fragment_formula'] # 3) Book-keeping B = int(fragment_parameters['n_fragments'].shape[0]) struct_ids = arrs_int['fragment_structure_id'] # shape (F,) for i in range(B): idx = start + i nF = int(fragment_parameters['n_fragments'][i]) # write scalar count self.h5_out['n_fragments'][idx] = nF # build a mask to select the F_i fragments of structure i mask = (struct_ids == i) # --- integer‐vlen fields --- for key in ['fragment_local_id', 'fragment_n_atoms']: data = arrs_int[key][mask] # shape (F_i,) row = data[:nF].astype(np.int32) self.h5_out[key][idx] = row # --- string‐vlen field: formulas --- seq = formulas[i] if torch.is_tensor(seq): seq = seq.detach().cpu().numpy().tolist() elif isinstance(seq, np.ndarray): seq = seq.tolist() self.h5_out['fragment_formula'][idx] = [str(x) for x in seq[:nF]] # --- float‐vlen fields --- for key, arr in arrs_flt.items(): block = arr[mask] # shape (F_i, …) block = block[:nF] # drop any extra padding flat = block.reshape(-1) # flatten to 1D self.h5_out[key][idx] = flat.astype(np.float32)