"""Simplified HDF5 storage for data persistence."""

import os
from pathlib import Path
from typing import Optional

import pandas as pd

from src.data.config import get_config


class Storage:
    """HDF5 storage manager for saving and loading data.

    Every dataset is kept in its own file, ``<base_path>/<name>.h5``, and is
    written under the HDF5 key ``name`` in ``table`` format.
    """

    # Columns that together identify one row when merging in append mode.
    _KEY_COLUMNS = ("ts_code", "trade_date")

    def __init__(self, path: Optional[Path] = None):
        """Initialize storage and make sure the base directory exists.

        Args:
            path: Base path for data storage. When omitted (or falsy), the
                path is taken from ``get_config().data_path_resolved``.
        """
        # The config is only consulted when no explicit path was supplied,
        # so an explicit path works even without a usable configuration.
        if path:
            self.base_path = Path(path)
        else:
            self.base_path = get_config().data_path_resolved
        self.base_path.mkdir(parents=True, exist_ok=True)

    def _get_file_path(self, name: str) -> Path:
        """Get full path for an HDF5 file."""
        return self.base_path / f"{name}.h5"

    @classmethod
    def _merge_rows(cls, existing: pd.DataFrame, new: pd.DataFrame) -> pd.DataFrame:
        """Concatenate ``existing`` and ``new``, letting newer rows win.

        When both key columns are present, rows are de-duplicated on that
        key pair. Otherwise only exact duplicate rows are removed, so that
        datasets without those columns can still be appended to.
        """
        combined = pd.concat([existing, new], ignore_index=True)
        if all(col in combined.columns for col in cls._KEY_COLUMNS):
            return combined.drop_duplicates(subset=list(cls._KEY_COLUMNS), keep="last")
        return combined.drop_duplicates(keep="last")

    def save(self, name: str, data: pd.DataFrame, mode: str = "append") -> dict:
        """Save data to HDF5 file.

        Args:
            name: Dataset name (also used as filename)
            data: DataFrame to save
            mode: 'replace' overwrites the stored dataset; any other value
                (including the default 'append') merges with existing rows.

        Returns:
            Dict with save result: ``status`` is 'skipped' (empty input),
            'success' (with ``rows`` and ``path``) or 'error' (with
            ``error``). Exceptions are reported through the dict, not raised.
        """
        if data.empty:
            return {"status": "skipped", "rows": 0}

        file_path = self._get_file_path(name)
        try:
            with pd.HDFStore(file_path, mode="a") as store:
                # Use ``name in store`` rather than ``store.keys()``: keys()
                # yields absolute paths ("/name"), so a bare name would never
                # match and existing data would always be overwritten.
                if mode != "replace" and name in store:
                    to_write = self._merge_rows(store[name], data)
                else:
                    to_write = data
                store.put(name, to_write, format="table")
            print(f"[Storage] Saved {len(data)} rows to {file_path}")
            return {"status": "success", "rows": len(data), "path": str(file_path)}
        except Exception as e:
            # Best-effort API: callers inspect the returned status.
            print(f"[Storage] Error saving {name}: {e}")
            return {"status": "error", "error": str(e)}

    def load(
        self,
        name: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        ts_code: Optional[str] = None,
    ) -> pd.DataFrame:
        """Load data from HDF5 file.

        Args:
            name: Dataset name
            start_date: Start date filter (YYYYMMDD), inclusive
            end_date: End date filter (YYYYMMDD), inclusive
            ts_code: Stock code filter

        Returns:
            DataFrame with loaded data. An empty DataFrame is returned when
            the file or dataset is missing or when loading fails. Filters
            are skipped when the column they apply to is absent.
        """
        file_path = self._get_file_path(name)
        if not file_path.exists():
            print(f"[Storage] File not found: {file_path}")
            return pd.DataFrame()

        try:
            with pd.HDFStore(file_path, mode="r") as store:
                # ``name in store`` accepts the key with or without the
                # leading "/" that store.keys() always includes.
                if name not in store:
                    return pd.DataFrame()
                data = store[name]

            # Each date bound is applied independently, so passing only one
            # of start_date / end_date still filters.
            if "trade_date" in data.columns:
                if start_date:
                    data = data[data["trade_date"] >= start_date]
                if end_date:
                    data = data[data["trade_date"] <= end_date]

            if ts_code and "ts_code" in data.columns:
                data = data[data["ts_code"] == ts_code]

            return data
        except Exception as e:
            # Best-effort API: failures are logged and yield an empty frame.
            print(f"[Storage] Error loading {name}: {e}")
            return pd.DataFrame()

    def get_last_date(self, name: str) -> Optional[str]:
        """Get the latest date in storage.

        Args:
            name: Dataset name

        Returns:
            Maximum ``trade_date`` value as a string, or None when the
            dataset is empty/unavailable or has no ``trade_date`` column.
        """
        data = self.load(name)
        if data.empty or "trade_date" not in data.columns:
            return None
        return str(data["trade_date"].max())

    def exists(self, name: str) -> bool:
        """Check if the file backing the dataset exists."""
        return self._get_file_path(name).exists()

    def delete(self, name: str) -> bool:
        """Delete a dataset by removing its file.

        Args:
            name: Dataset name

        Returns:
            True if a file was deleted, False if there was none.
        """
        file_path = self._get_file_path(name)
        try:
            # EAFP: avoids the race between an exists() check and unlink().
            file_path.unlink()
        except FileNotFoundError:
            return False
        print(f"[Storage] Deleted {file_path}")
        return True