# Source code for dimension_scanner

"""
Module: dimension_scanner.py

Scan a raw HDF5 file to determine maximum ragged-array dimensions required
for atoms, bonds, inter-/intra-molecular contacts, and inter-/intra-molecular H-bonds.

Dependencies
------------
h5py
"""
import h5py
from typing import List, Dict

def scan_max_dimensions(h5_in: "h5py.File", refcodes: List[str]) -> Dict[str, int]:
    """
    Compute the maximum sizes needed to pad ragged arrays across all structures.

    For every refcode the leading dimension (``shape[0]``) of a fixed set of
    datasets under ``/structures/<refcode>`` is read, and the largest value
    seen for each category is returned.

    Parameters
    ----------
    h5_in : h5py.File
        Open HDF5 file containing `/structures/<refcode>` groups produced by
        the raw-data extractor.
    refcodes : List[str]
        List of all structure refcodes to scan.

    Returns
    -------
    Dict[str, int]
        Dictionary mapping, in this key order:
          - 'atoms'          : maximum length of `atom_label`
          - 'bonds'          : maximum length of `bond_atom1_idx`
          - 'contacts_inter' : maximum length of `inter_cc_id`
          - 'contacts_intra' : maximum length of `intra_cc_id`
          - 'hbonds_inter'   : maximum length of `inter_hb_id`
          - 'hbonds_intra'   : maximum length of `intra_hb_id`
          - 'fragments'      : recommended maximum fragments (equal to 'atoms')

        Every value is 0 when `refcodes` is empty. A category whose dataset
        is absent from every scanned group is also reported as 0.

    Raises
    ------
    KeyError
        If `structures`, the group for a refcode, or that group's
        `atom_label` dataset is missing. The bond, contact and H-bond
        datasets are optional: a group that lacks one is skipped for that
        category instead of raising.
    """
    # Result key -> optional dataset whose leading dimension is measured.
    # Insertion order fixes the key order of the returned dictionary.
    optional_datasets = {
        'bonds': 'bond_atom1_idx',
        'contacts_inter': 'inter_cc_id',
        'contacts_intra': 'intra_cc_id',
        'hbonds_inter': 'inter_hb_id',
        'hbonds_intra': 'intra_hb_id',
    }

    max_atoms = 0
    maxima = dict.fromkeys(optional_datasets, 0)

    for ref in refcodes:
        # Looked up per structure (not hoisted) so that an empty `refcodes`
        # list never touches the file at all.
        grp = h5_in['structures'][ref]

        # `atom_label` is the only mandatory dataset; a missing one raises.
        max_atoms = max(max_atoms, grp['atom_label'].shape[0])

        for key, dataset_name in optional_datasets.items():
            if dataset_name in grp:
                maxima[key] = max(maxima[key], grp[dataset_name].shape[0])

    # A structure cannot have more fragments than atoms, so the atom maximum
    # doubles as the fragment bound.
    return {'atoms': max_atoms, **maxima, 'fragments': max_atoms}