Source code for csd_structure_validator

"""
Module: structure_validator.py

High-level validation of CSD crystal structures and molecules against filter criteria.

This module defines:
- StructureValidationResult: Container for validation outcomes.
- StructureValidator: Applies a sequence of checks to CCDC Crystal and Molecule objects.

Dependencies
------------
ccdc
"""

from dataclasses import dataclass
from typing import Optional, Set
from ccdc.molecule import Molecule
from ccdc.crystal import Crystal

@dataclass
class StructureValidationResult:
    """
    Result of validating a CSD Crystal and Molecule pair.

    Attributes
    ----------
    is_valid : bool
        True if all checks pass; False otherwise.
    failure_reason : Optional[str]
        Reason for failure when `is_valid` is False. Defaults to None,
        which is what a successful result carries.
    """

    is_valid: bool
    failure_reason: Optional[str] = None
class StructureValidator:
    """
    Validate a CSD Crystal and Molecule against specified criteria.

    This class executes a pipeline of validation steps, stopping at the
    first one that fails:

    - Z-prime value check
    - Space group check
    - Molecule preprocessing (bonds, hydrogens, charges)
    - Atom existence and coordinate validation
    - Crystal type determination
    - Molecular properties validation (charge, weight)
    - Species inclusion check

    Attributes
    ----------
    filters : dict
        Mapping of filter criteria. Supported keys:

        - target_z_prime_values : List[int]
        - target_space_groups : List[str]
        - crystal_type : List[str]
        - molecule_formal_charges : List[int]
        - molecule_weight_limit : float
        - target_species : List[str]

        An empty ``target_space_groups`` or ``target_species`` disables the
        corresponding check. The other keys are always enforced.
    """

    def __init__(self, filters: dict):
        self.filters = filters

    def validate(self, crystal: Crystal, molecule: Molecule) -> StructureValidationResult:
        """
        Run full validation pipeline on a crystal and molecule pair.

        Parameters
        ----------
        crystal : Crystal
            CCDC Crystal object to validate.
        molecule : Molecule
            CCDC Molecule object to validate. It is modified in place by the
            preprocessing step (see `_process_molecule`).

        Returns
        -------
        StructureValidationResult
            Outcome of the validation, with `failure_reason` set when invalid.

        Raises
        ------
        KeyError
            If a required key is missing from `filters`.
        """
        # Z prime check
        if crystal.z_prime not in self.filters['target_z_prime_values']:
            return StructureValidationResult(False, "Invalid Z prime value")

        # Space group check (skipped when no target space groups are given)
        if (self.filters['target_space_groups'] and
                crystal.spacegroup_symbol not in self.filters['target_space_groups']):
            return StructureValidationResult(False, "Invalid space group")

        # Process molecule
        if not self._process_molecule(molecule):
            return StructureValidationResult(False, "Failed to process molecule")

        # Validate atoms
        atoms_result = self._validate_atoms(molecule)
        if not atoms_result.is_valid:
            return atoms_result

        # Crystal type validation
        crystal_type = self._determine_crystal_type(molecule)
        if crystal_type not in self.filters['crystal_type']:
            return StructureValidationResult(False, f"Invalid crystal type: {crystal_type}")

        # Molecular properties validation
        mol_result = self._validate_molecular_properties(molecule, crystal_type)
        if not mol_result.is_valid:
            return mol_result

        # Species validation
        if not self._validate_species(crystal):
            return StructureValidationResult(False, "Invalid chemical species")

        return StructureValidationResult(True)

    def _process_molecule(self, molecule: Molecule) -> bool:
        """
        Assign bond types, add hydrogens, and compute partial charges.

        Parameters
        ----------
        molecule : Molecule
            CCDC Molecule to process (mutated in place).

        Returns
        -------
        bool
            True if processing succeeds; False otherwise.
        """
        try:
            molecule.assign_bond_types()
            molecule.add_hydrogens(mode='missing')
            molecule.assign_partial_charges()
            return True
        except Exception:
            # Deliberately best-effort: any failure in the toolkit calls is
            # reported to the caller as "could not process" rather than raised.
            return False

    def _validate_atoms(self, molecule: Molecule) -> StructureValidationResult:
        """
        Ensure the molecule has atoms and each atom has coordinates.

        Parameters
        ----------
        molecule : Molecule
            CCDC Molecule to check.

        Returns
        -------
        StructureValidationResult
            is_valid=False if no atoms or missing coordinates, or if the
            atomic data cannot be accessed; otherwise True.
        """
        try:
            atoms = molecule.atoms
            if not atoms:
                return StructureValidationResult(False, "No atoms found")
            if any(at.coordinates is None for at in atoms):
                return StructureValidationResult(False, "Missing atomic coordinates")
            return StructureValidationResult(True)
        except Exception:
            return StructureValidationResult(False, "Failed to access atomic data")

    def _determine_crystal_type(self, molecule: Molecule) -> str:
        """
        Classify a molecule as 'homomolecular', 'hydrate', or 'co-crystal'.

        Parameters
        ----------
        molecule : Molecule
            CCDC Molecule containing one or more components.

        Returns
        -------
        str
            One of {'homomolecular', 'hydrate', 'co-crystal'}.

        Notes
        -----
        - 'homomolecular' if all component formulas are identical
          (this includes the case of no components at all).
        - 'hydrate' if any component formula equals 'H2 O1'.
        - 'co-crystal' otherwise.
        """
        components = [c.formula for c in molecule.components]
        if all(item == components[0] for item in components):
            return 'homomolecular'
        return 'hydrate' if 'H2 O1' in components else 'co-crystal'

    def _validate_molecular_properties(self, molecule: Molecule,
                                       crystal_type: str) -> StructureValidationResult:
        """
        Enforce formal charge and molecular weight limits for each component.

        Parameters
        ----------
        molecule : Molecule
            CCDC Molecule containing components.
        crystal_type : str
            Classification from `_determine_crystal_type`.

        Returns
        -------
        StructureValidationResult
            is_valid=False if any component violates charge or weight limits;
            otherwise True.
        """
        # Formal charge check for homomolecular crystals
        if crystal_type == 'homomolecular':
            if any(c.formal_charge not in self.filters['molecule_formal_charges']
                   for c in molecule.components):
                return StructureValidationResult(False, "Invalid formal charge")

        # Molecular weight check (applies to every crystal type)
        if any(c.molecular_weight > self.filters['molecule_weight_limit']
               for c in molecule.components):
            return StructureValidationResult(False, "Molecular weight exceeds limit")

        return StructureValidationResult(True)

    def _validate_species(self, crystal: Crystal) -> bool:
        """
        Check that all unique atomic species in the crystal formula are allowed.

        Parameters
        ----------
        crystal : Crystal
            CCDC Crystal object to check.

        Returns
        -------
        bool
            True if all species are in `filters['target_species']` or if that
            list is empty.
        """
        allowed = self.filters['target_species']
        if not allowed:
            return True
        species = self._get_unique_species(crystal.formula)
        return all(s in allowed for s in species)

    @staticmethod
    def _get_unique_species(formula: str) -> Set[str]:
        """
        Parse a chemical formula into unique element symbols.

        Parameters
        ----------
        formula : str
            Chemical formula string (e.g., "C6H12O6").

        Returns
        -------
        Set[str]
            Unique element symbols extracted from the formula.

        Notes
        -----
        - Element symbols start with an uppercase letter, optionally followed
          by lowercase letters.
        - Numeric characters and other non-letter characters end the current
          symbol and are otherwise ignored.
        - A lowercase letter that does not follow an open symbol (for example
          the trailing ``n`` in "(C2 H4)n") is not an element and is skipped.
        """
        species = set()
        current = ''
        for char in formula:
            if char.isupper():
                # A new symbol begins; flush the one being built, if any.
                if current:
                    species.add(current)
                current = char
            elif char.islower():
                # Only extend a symbol that an uppercase letter has opened;
                # a stray lowercase letter must not start a symbol by itself.
                if current:
                    current += char
            elif current:
                species.add(current)
                current = ''
        if current:
            species.add(current)
        return species