Source code for csa_config
"""
Module: csa_config.py
Configuration objects and loader for the Crystal Structure Analysis pipeline.
This module defines:
- ExtractionConfig: dataclass controlling extraction parameters.
- load_config: utility to construct ExtractionConfig from a JSON file.
"""
from dataclasses import dataclass
from typing import Dict, Any, Union
import json
import logging
from pathlib import Path
from inspect import signature
logger = logging.getLogger(__name__)
[docs]
@dataclass
class ExtractionConfig:
"""
Configuration settings for the data-extraction pipeline.
Parameters
----------
data_directory : Path
Directory under which all raw and intermediate extraction outputs will be stored.
Subdirectories (e.g. “structures/”, “csv/”) are created automatically.
data_prefix : str
Prefix used when naming output files, for example
``"{data_prefix}_refcode_families.csv"``.
actions : Dict[str, bool]
Flags to enable or skip individual extraction substeps:
- ``get_refcode_families``
- ``cluster_refcode_families``
- ``get_unique_structures``
- ``get_structure_data``
- ``post_extraction_process``
filters : Dict[str, Any]
Criteria for filtering CSD entries, for example:
- ``elements`` (List[str]): only structures containing these elements
- ``min_resolution`` (float): only structures with resolution ≤ this value
- ``space_groups`` (List[str]): only structures in these space groups
extraction_batch_size : int
Number of structures or refcode families to process per batch during extraction
post_extraction_batch_size : int
Number of structures to process per batch during post-extraction
Methods
-------
from_json(cls, json_path)
Load and validate fields from the “extraction” section of a JSON file.
"""
data_directory: Path
data_prefix: str
actions: Dict[str, bool]
filters: Dict[str, Any]
extraction_batch_size: int
post_extraction_batch_size: int
[docs]
@classmethod
def from_json(cls, json_path: Union[str, Path]) -> 'ExtractionConfig':
"""
Load an ExtractionConfig from a JSON file.
Parameters
----------
json_path : Union[str, Path]
Path to the JSON configuration file.
Returns
-------
ExtractionConfig
Instance populated from the “extraction” section.
Raises
------
FileNotFoundError
If the file does not exist.
KeyError
If the “extraction” section is missing.
json.JSONDecodeError
If the file contains invalid JSON.
"""
json_path = Path(json_path)
if not json_path.is_file():
logger.error(f"Config file not found: {json_path}")
raise FileNotFoundError(f"Config file not found: {json_path}")
raw = json.loads(json_path.read_text())
try:
config = raw['extraction']
except KeyError:
logger.error(f"'extraction' section missing in {json_path}")
raise
# Keep only keys valid for this dataclass
valid_keys = set(signature(cls).parameters)
cleaned = {k: v for k, v in config.items() if k in valid_keys}
# Convert data_directory to Path
if 'data_directory' in cleaned:
cleaned['data_directory'] = Path(cleaned['data_directory'])
return cls(**cleaned)
[docs]
def load_config(config_path: Union[str, Path]) -> ExtractionConfig:
"""
Read a JSON configuration file and return an ExtractionConfig instance.
Parameters
----------
config_path : Union[str, Path]
Path to the JSON configuration file.
Returns
-------
ExtractionConfig
Dataclass instance loaded from the “extraction” section.
Raises
------
FileNotFoundError
If the file does not exist.
KeyError
If the “extraction” section is missing.
json.JSONDecodeError
If the file contains invalid JSON.
"""
config_path = Path(config_path)
extraction_config = ExtractionConfig.from_json(config_path)
return extraction_config