Examples
Ready-to-run code examples demonstrating CSA capabilities. These practical examples show how to accomplish specific tasks and can be directly adapted for your own analyses.
Note
All examples include complete, executable code with sample data. Copy, paste, and run them to see immediate results.
Quick Start Examples
Complete pipeline execution with minimal configuration.
Powerful single-command workflows for common tasks.
Real-world configuration files for different scenarios.
Simple plots and charts for crystal structure data.
Data Access Examples
Efficient methods for accessing CSA datasets.
Select subsets of structures based on criteria.
Export CSA data to various formats (CSV, JSON, etc.).
Convert between different data representations.
Analysis Examples
Calculate distances, angles, and geometric descriptors.
Analyze molecular fragments and rigid bodies.
Study intermolecular interactions and packing.
Apply statistical methods to crystal structure data.
Visualization Examples
Histograms, box plots, and distribution analysis.
Visualize relationships between crystal properties.
Interactive 3D visualization of crystal structures.
Hydrogen bonding networks and contact graphs.
Integration Examples
Work with CSA data using pandas DataFrames.
Machine learning workflows with crystal structure data.
Efficient numerical computing with CSA datasets.
Interactive analysis workflows in notebooks.
Performance Examples
Leverage GPU computing for large-scale analysis.
Efficient processing of large datasets.
Minimize memory usage for large analyses.
Monitor and optimize analysis performance.
Example Categories
Basic Data Operations
Loading and Exploring Data
# Load processed HDF5 file
import h5py
import numpy as np
import pandas as pd
# Open read-only; the context manager closes the file when the block exits.
with h5py.File('analysis_processed.h5', 'r') as f:
    # Read every refcode into memory and convert the entries to str.
    refcodes = f['refcode_list'][...].astype(str)
    n_structures = len(refcodes)
    print(f"Dataset contains {n_structures} structures")

    # Quick data overview (must run while the file is still open)
    print(f"Available datasets: {list(f.keys())}")
Simple Property Analysis
# Analyze cell volumes
import matplotlib.pyplot as plt
with h5py.File('analysis_processed.h5', 'r') as f:
    # [...] reads the whole dataset into memory, so the file can be closed
    # before any plotting happens.
    volumes = f['cell_volume'][...]

plt.hist(volumes, bins=50)
plt.xlabel('Cell Volume (Ų)')
plt.ylabel('Frequency')
plt.title('Cell Volume Distribution')
plt.show()

print(f"Mean volume: {np.mean(volumes):.1f} Ų")
print(f"Volume range: {np.min(volumes):.1f} - {np.max(volumes):.1f} Ų")
Advanced Workflows
Fragment Shape Analysis
# Analyze molecular shapes using inertia tensors
def analyze_fragment_shapes(f):
    """Compute shape descriptors for every fragment from its inertia eigenvalues.

    Args:
        f: Open ``h5py.File`` (or any mapping of array-likes) providing
            'refcode_list', 'n_fragments' and 'fragment_inertia_eigvals'.
            Each 'fragment_inertia_eigvals' entry must hold exactly
            ``3 * n_fragments`` values for its structure.

    Returns:
        pandas.DataFrame with one row per fragment and the columns
        'refcode', 'fragment_id', 'asphericity' and 'acylindricity'.
        The columns are present even when no fragment is found, so
        downstream column access never fails on an empty result.
    """
    columns = ['refcode', 'fragment_id', 'asphericity', 'acylindricity']

    # Read the per-structure datasets once instead of issuing a separate
    # read for every loop iteration (and every fragment).
    refcodes = f['refcode_list'][...]
    fragment_counts = f['n_fragments'][...]
    eigvals = f['fragment_inertia_eigvals']

    shape_data = []
    for i, raw_refcode in enumerate(refcodes):
        n_frags = int(fragment_counts[i])
        if n_frags <= 0:
            continue

        refcode = raw_refcode.decode()  # decode once per structure
        inertia = eigvals[i].reshape(n_frags, 3)
        for j, row in enumerate(inertia):
            # Sort so the descriptors do not depend on the stored order.
            i_min, i_mid, i_max = sorted(row)
            shape_data.append({
                'refcode': refcode,
                'fragment_id': j,
                'asphericity': i_max - 0.5 * (i_min + i_mid),
                'acylindricity': i_mid - i_min,
            })

    return pd.DataFrame(shape_data, columns=columns)
# Usage
with h5py.File('analysis_processed.h5', 'r') as f:
    shapes_df = analyze_fragment_shapes(f)

# Plot shape distribution (the DataFrame is fully in memory, so the file
# no longer needs to be open here)
plt.scatter(shapes_df['asphericity'], shapes_df['acylindricity'], alpha=0.6)
plt.xlabel('Asphericity')
plt.ylabel('Acylindricity')
plt.title('Molecular Shape Distribution')
plt.show()
Contact Network Analysis
# Build hydrogen bonding networks
import networkx as nx
def build_hbond_network(f, structure_idx):
    """Build hydrogen bonding network for a single structure.

    Args:
        f: Open ``h5py.File`` (or mapping) providing 'inter_hb_n_hbonds',
            'inter_hb_central_atom', 'inter_hb_contact_atom' and
            'inter_hb_length'.
        structure_idx: Position of the structure within those datasets.

    Returns:
        ``networkx.Graph`` with one edge per hydrogen bond, joining the
        central-atom label to the contact-atom label and carrying the bond
        length as the ``distance`` edge attribute; ``None`` when the
        structure has no hydrogen bonds.
    """
    n_hbonds = int(f['inter_hb_n_hbonds'][structure_idx])
    if n_hbonds == 0:
        # Return before reading any per-bond dataset.
        return None

    # Get H-bond data
    donors = f['inter_hb_central_atom'][structure_idx].astype(str)
    acceptors = f['inter_hb_contact_atom'][structure_idx].astype(str)
    distances = f['inter_hb_length'][structure_idx]

    # Build network. Only the first n_hbonds entries are used; an array
    # shorter than n_hbonds raises IndexError rather than dropping bonds.
    G = nx.Graph()
    for i in range(n_hbonds):
        G.add_edge(donors[i], acceptors[i], distance=distances[i])
    return G
# Example usage
with h5py.File('analysis_processed.h5', 'r') as f:
    network = build_hbond_network(f, 0)

# build_hbond_network returns None when the structure has no H-bonds;
# test for that explicitly instead of relying on graph truthiness.
if network is not None:
    print(f"Network has {network.number_of_nodes()} atoms")
    print(f"Network has {network.number_of_edges()} H-bonds")
Configuration Templates
Common Configuration Patterns
High-Throughput Screening
{
"extraction": {
"data_directory": "./hts_analysis",
"data_prefix": "screening_run",
"filters": {
"target_z_prime_values": [1],
"crystal_type": ["homomolecular"],
"molecule_weight_limit": 600.0
},
"extraction_batch_size": 128,
"post_extraction_batch_size": 64
}
}
Pharmaceutical Focus
{
"extraction": {
"data_directory": "./pharma_analysis",
"data_prefix": "drug_molecules",
"filters": {
"target_z_prime_values": [1, 2],
"crystal_type": ["homomolecular"],
"molecule_formal_charges": [0, 1, -1],
"target_species": ["C", "H", "N", "O", "S", "F", "Cl"],
"molecule_weight_limit": 800.0
}
}
}
Materials Science
{
"extraction": {
"data_directory": "./materials_analysis",
"data_prefix": "metal_organic",
"filters": {
"target_z_prime_values": [1, 2, 4],
"crystal_type": ["organometallic", "homomolecular"],
"target_species": ["C", "H", "N", "O", "Fe", "Cu", "Zn"],
"exclude_disorder": true
}
}
}
Utility Functions
Commonly Used Helper Functions
Data Loading Utilities
def load_structure_summary(hdf5_file):
    """Load basic structure information into a pandas DataFrame.

    Args:
        hdf5_file: Path to the processed HDF5 file.

    Returns:
        pandas.DataFrame with one row per structure and the columns
        'refcode', 'space_group', 'z_prime', 'cell_volume', 'cell_density',
        'n_atoms', 'n_fragments' and 'n_contacts'.
    """
    with h5py.File(hdf5_file, 'r') as f:
        data = {
            'refcode': f['refcode_list'][...].astype(str),
            # One bulk read, then decode in memory, instead of one HDF5
            # read per structure.
            'space_group': [sg.decode() for sg in f['space_group'][...]],
            'z_prime': f['z_prime'][...],
            'cell_volume': f['cell_volume'][...],
            'cell_density': f['cell_density'][...],
            'n_atoms': f['n_atoms'][...],
            'n_fragments': f['n_fragments'][...],
            'n_contacts': f['inter_cc_n_contacts'][...],
        }
    # Every column was copied into memory above, so the file is already closed.
    return pd.DataFrame(data)
Filtering Utilities
def filter_structures(df, criteria):
    """Apply multiple filtering criteria to structure DataFrame.

    Args:
        df: DataFrame with 'cell_volume', 'space_group' and 'z_prime'
            columns (only the columns named by the given criteria are read).
        criteria: Mapping with any of these optional keys:
            'min_volume' / 'max_volume' (inclusive bounds on 'cell_volume'),
            'space_groups' (allowed 'space_group' values),
            'z_prime_values' (allowed 'z_prime' values).
            Unknown keys are ignored; all given criteria must hold.

    Returns:
        The rows of ``df`` that satisfy every criterion, original index kept.
    """
    # Start with every row selected and narrow the mask per criterion.
    mask = pd.Series(True, index=df.index)

    if 'min_volume' in criteria:
        mask &= df['cell_volume'] >= criteria['min_volume']
    if 'max_volume' in criteria:
        mask &= df['cell_volume'] <= criteria['max_volume']
    if 'space_groups' in criteria:
        mask &= df['space_group'].isin(criteria['space_groups'])
    if 'z_prime_values' in criteria:
        mask &= df['z_prime'].isin(criteria['z_prime_values'])

    return df[mask]
Visualization Utilities
def plot_property_comparison(df, x_col, y_col, color_col=None):
    """Create scatter plot comparing two properties.

    Args:
        df: DataFrame holding the columns to plot.
        x_col: Column plotted on the x axis (also used as its label).
        y_col: Column plotted on the y axis (also used as its label).
        color_col: Optional column used to colour the points; when given,
            a colorbar labelled with the column name is added.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(10, 6))

    if color_col:
        scatter = plt.scatter(df[x_col], df[y_col], c=df[color_col], alpha=0.6)
        plt.colorbar(scatter, label=color_col)
    else:
        plt.scatter(df[x_col], df[y_col], alpha=0.6)

    # Labels and title apply to both branches.
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f'{y_col} vs {x_col}')
    plt.show()
Interactive Examples
Jupyter Notebook Examples
Example 1: Quick Dataset Exploration
# Cell 1: Load and explore
import h5py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Build the per-structure summary table and preview its first rows
df = load_structure_summary('my_analysis_processed.h5')
df.head()

# Cell 2: Basic statistics
df.describe()

# Cell 3: Visualizations
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Name the four panels of the 2x2 grid for readability
(ax_volume, ax_density), (ax_z_prime, ax_space_group) = axes

# Top-left: volume distribution
ax_volume.hist(df['cell_volume'], bins=30)
ax_volume.set_title('Cell Volume Distribution')

# Top-right: density vs volume
ax_density.scatter(df['cell_volume'], df['cell_density'], alpha=0.6)
ax_density.set_xlabel('Volume')
ax_density.set_ylabel('Density')

# Bottom-left: Z' distribution
z_prime_counts = df['z_prime'].value_counts()
z_prime_counts.plot.bar(ax=ax_z_prime)
ax_z_prime.set_title("Z' Distribution")

# Bottom-right: ten most frequent space groups
top_space_groups = df['space_group'].value_counts().head(10)
top_space_groups.plot.barh(ax=ax_space_group)
ax_space_group.set_title('Top Space Groups')

plt.tight_layout()
plt.show()
Command Line Examples
Quick Analysis Commands
# Run with the default configuration file (default.json)
python src/csa_main.py --config default.json
# Run with a custom batch size (64) passed on the command line
python src/csa_main.py --config analysis.json --batch-size 64
# Run only specific stages (here stages 4 and 5, comma-separated)
python src/csa_main.py --config analysis.json --stages 4,5
# Resume from checkpoint
python src/csa_main.py --config analysis.json --resume
Data Export Commands
# Export to CSV, writing into the analysis_data/ directory
python scripts/export_data.py structures_processed.h5 --format csv --output analysis_data/
# Export filtered subset (only entries matching z_prime==1) into subset/
python scripts/export_data.py structures_processed.h5 --filter "z_prime==1" --output subset/
# Export specific properties (comma-separated list of property names)
python scripts/export_data.py structures_processed.h5 --properties cell_volume,cell_density
Troubleshooting Examples
Common Issues and Solutions
Memory Management
# Problem: Out of memory when loading large datasets
# Solution: Use chunked loading
def load_data_chunked(hdf5_file, chunk_size=1000):
    """Load large datasets in chunks to manage memory.

    Chunking bounds the size of each individual HDF5 read. The returned
    DataFrame still holds every row, so the final result must fit in memory.

    Args:
        hdf5_file: Path to the processed HDF5 file.
        chunk_size: Number of structures read per chunk; must be positive.

    Returns:
        pandas.DataFrame with 'refcode' and 'cell_volume' columns, one row
        per structure, indexed 0..n-1. Empty (with both columns) when the
        file contains no structures.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    results = []
    with h5py.File(hdf5_file, 'r') as f:
        n_structures = len(f['refcode_list'])
        for start in range(0, n_structures, chunk_size):
            end = min(start + chunk_size, n_structures)
            chunk_data = {
                'refcode': f['refcode_list'][start:end].astype(str),
                'cell_volume': f['cell_volume'][start:end],
            }
            results.append(pd.DataFrame(chunk_data))

    if not results:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=['refcode', 'cell_volume'])
    return pd.concat(results, ignore_index=True)
Performance Optimization
# Problem: Slow data access
# Solution: Use indexing and caching
def create_index_cache(hdf5_file):
    """Create an index for faster data access.

    Args:
        hdf5_file: Path to the processed HDF5 file.

    Returns:
        dict mapping each refcode (str) to its position in 'refcode_list'.
        If a refcode occurs more than once, its last position wins.
    """
    with h5py.File(hdf5_file, 'r') as f:
        # Single bulk read of all refcodes.
        refcodes = f['refcode_list'][...].astype(str)

    # The mapping is built in memory after the file has been closed.
    return {refcode: i for i, refcode in enumerate(refcodes)}
# Usage
# Build the refcode -> position mapping once and reuse it for later lookups.
index = create_index_cache('structures_processed.h5')
# Plain dict indexing: raises KeyError if 'AABHTZ' is not in the index.
target_idx = index['AABHTZ']
Example Downloads
All examples are available for download:
# Download example files (a zip archive) and unpack them
wget https://csa-examples.readthedocs.io/downloads/examples.zip
unzip examples.zip
cd csa_examples/
# Run example scripts from inside the csa_examples/ directory
python basic_analysis_example.py
python fragment_analysis_example.py
python visualization_example.py