"""Tests for DuckDB storage validation.

Validates two key points:
1. Stocks from stock_basic.csv are saved in the daily table
   (at least 80% coverage is required for the test period).
2. No abnormal data with very few data points (< 5 rows per stock).

The tests run against 3 months of real data (January-March 2024).
"""

import pytest
import pandas as pd
from datetime import datetime, timedelta
from src.data.storage import Storage
from src.data.api_wrappers.api_stock_basic import _get_csv_path


class TestDailyStorageValidation:
    """Test daily table storage integrity and completeness."""

    # Date range of the test data: 3 months.
    TEST_START_DATE = "20240101"
    TEST_END_DATE = "20240331"

    @pytest.fixture
    def storage(self):
        """Create storage instance."""
        return Storage()

    @pytest.fixture
    def stock_basic_df(self):
        """Load stock basic data from CSV, skipping when the file is absent."""
        csv_path = _get_csv_path()
        if not csv_path.exists():
            pytest.skip(f"stock_basic.csv not found at {csv_path}")
        return pd.read_csv(csv_path)

    @pytest.fixture
    def daily_df(self, storage):
        """Load daily data from DuckDB (3 months).

        Skips the requesting test when the ``daily`` table does not exist or
        holds no rows for the test period.
        """
        if not storage.exists("daily"):
            pytest.skip("daily table not found in DuckDB")

        # Load the 3-month window from DuckDB.
        df = storage.load(
            "daily",
            start_date=self.TEST_START_DATE,
            end_date=self.TEST_END_DATE,
        )
        if df.empty:
            pytest.skip(
                f"No data found for period {self.TEST_START_DATE} to {self.TEST_END_DATE}"
            )
        return df

    def test_duckdb_connection(self, storage):
        """Test DuckDB connection and basic operations."""
        # The result is deliberately not asserted: the table may legitimately
        # be absent. The call completing without raising is the actual check.
        storage.exists("daily")
        print("[TEST] DuckDB connection successful")

    def test_load_3months_data(self, storage):
        """Test loading 3 months of data from DuckDB."""
        df = storage.load(
            "daily",
            start_date=self.TEST_START_DATE,
            end_date=self.TEST_END_DATE,
        )
        if df.empty:
            pytest.skip("No data available for testing period")

        # Report the date range actually covered by the loaded rows.
        dates = df["trade_date"].astype(str)
        min_date = dates.min()
        max_date = dates.max()
        print(f"[TEST] Loaded {len(df)} rows from {min_date} to {max_date}")

        assert len(df) > 0, "Should have data in the 3-month period"

    def test_all_stocks_saved(self, storage, stock_basic_df, daily_df):
        """Verify stocks from stock_basic are saved in daily table.

        This test checks data completeness. Missing stocks are reported as a
        warning; the test fails only when fewer than 80% of the stocks listed
        in stock_basic have rows in the daily table for the test period.
        """
        if daily_df.empty:
            pytest.fail("daily table is empty for test period")

        # Get unique stock codes from both sources.
        expected_codes = set(stock_basic_df["ts_code"].dropna().unique())
        actual_codes = set(daily_df["ts_code"].dropna().unique())

        missing_codes = expected_codes - actual_codes
        if not missing_codes:
            print(
                f"[TEST] All {len(expected_codes)} stocks from stock_basic are present in daily table"
            )
            return

        missing_list = sorted(missing_codes)
        # Show the first 20 missing stocks as a sample.
        sample = missing_list[:20]
        msg = f"Found {len(missing_codes)} stocks missing from daily table:\n"
        msg += f"Sample missing: {sample}\n"
        if len(missing_list) > 20:
            msg += f"... and {len(missing_list) - 20} more"

        # For a 3-month window some stocks may be missing (e.g. new or not yet
        # listed), so this is only a warning.
        print(f"[WARNING] {msg}")

        # Coverage counts only the expected codes that are present; codes that
        # exist in daily but not in stock_basic must not inflate the ratio.
        # expected_codes is non-empty here because missing_codes is non-empty.
        covered = len(expected_codes) - len(missing_codes)
        coverage = covered / len(expected_codes) * 100
        assert coverage >= 80, (
            f"Stock coverage {coverage:.1f}% is below 80% threshold"
        )

    def test_no_stock_with_insufficient_data(self, storage, daily_df):
        """Verify no stock has abnormally few data points (< 5 rows in 3 months).

        Stocks with very few data points may indicate sync failures, delisted
        stocks not properly handled, or data corruption. Up to 5% of stocks
        may fall below the threshold (reported as a warning); beyond that the
        test fails.
        """
        if daily_df.empty:
            pytest.fail("daily table is empty for test period")

        # Count rows per stock.
        stock_counts = daily_df.groupby("ts_code").size()

        # Find stocks with less than 5 data points in 3 months.
        insufficient_stocks = stock_counts[stock_counts < 5]
        if insufficient_stocks.empty:
            print("[TEST] All stocks have sufficient data (>= 5 rows in 3 months)")
            return

        # Separate into categories for better reporting.
        empty_stocks = stock_counts[stock_counts == 0]
        very_few_stocks = stock_counts[(stock_counts > 0) & (stock_counts < 5)]

        parts = [
            f"Found {len(insufficient_stocks)} stocks with insufficient data (< 5 rows in 3 months):\n"
        ]
        if not empty_stocks.empty:
            parts.append(f"\nEmpty stocks (0 rows): {len(empty_stocks)}\n")
            sample = sorted(empty_stocks.index[:10].tolist())
            parts.append(f"Sample: {sample}")
        if not very_few_stocks.empty:
            parts.append(
                f"\nVery few data points (1-4 rows): {len(very_few_stocks)}\n"
            )
            # Show counts for the stocks with the fewest rows.
            sample = very_few_stocks.sort_values().head(20)
            parts.append("Sample (ts_code: count):\n")
            parts.extend(
                f" {code}: {count} rows\n" for code, count in sample.items()
            )
        msg = "".join(parts)

        # For a 3-month window a few anomalies are tolerated, but they must
        # not exceed 5% of all stocks.
        if len(insufficient_stocks) / len(stock_counts) > 0.05:
            pytest.fail(msg)
        print(f"[WARNING] {msg}")

    def test_data_integrity_basic(self, storage, daily_df):
        """Basic data integrity checks for daily table."""
        if daily_df.empty:
            pytest.fail("daily table is empty for test period")

        # Check required columns exist.
        required_columns = ["ts_code", "trade_date"]
        missing_columns = [
            col for col in required_columns if col not in daily_df.columns
        ]
        if missing_columns:
            pytest.fail(f"Missing required columns: {missing_columns}")

        # Check for null values in key columns.
        null_ts_code = daily_df["ts_code"].isna().sum()
        null_trade_date = daily_df["trade_date"].isna().sum()

        if null_ts_code > 0:
            pytest.fail(f"Found {null_ts_code} rows with null ts_code")
        if null_trade_date > 0:
            pytest.fail(f"Found {null_trade_date} rows with null trade_date")

        print("[TEST] Data integrity check passed for 3-month period")

    def test_polars_export(self, storage):
        """Test Polars export functionality."""
        if not storage.exists("daily"):
            pytest.skip("daily table not found")

        import polars as pl

        # Exercise the load_polars method.
        df = storage.load_polars(
            "daily",
            start_date=self.TEST_START_DATE,
            end_date=self.TEST_END_DATE,
        )
        assert isinstance(df, pl.DataFrame), "Should return Polars DataFrame"
        print(f"[TEST] Polars export successful: {len(df)} rows")

    def test_stock_data_coverage_report(self, storage, daily_df):
        """Generate a summary report of stock data coverage.

        This test provides visibility into data distribution without failing.
        """
        if daily_df.empty:
            pytest.skip("daily table is empty - cannot generate report")

        stock_counts = daily_df.groupby("ts_code").size()

        # Calculate statistics.
        total_stocks = len(stock_counts)
        min_count = stock_counts.min()
        max_count = stock_counts.max()
        median_count = stock_counts.median()
        mean_count = stock_counts.mean()

        # Distribution buckets (adjusted for 3-month period, ~60 trading days).
        very_low = (stock_counts < 5).sum()
        low = ((stock_counts >= 5) & (stock_counts < 20)).sum()
        medium = ((stock_counts >= 20) & (stock_counts < 40)).sum()
        high = (stock_counts >= 40).sum()

        report_lines = [
            "",
            f"=== Stock Data Coverage Report (3 months: {self.TEST_START_DATE} to {self.TEST_END_DATE}) ===",
            f"Total stocks: {total_stocks}",
            "Data points per stock:",
            f"  Min: {min_count}",
            f"  Max: {max_count}",
            f"  Median: {median_count:.0f}",
            f"  Mean: {mean_count:.1f}",
            "Distribution:",
            f"  < 5 rows: {very_low} stocks ({very_low / total_stocks * 100:.1f}%)",
            f"  5-19: {low} stocks ({low / total_stocks * 100:.1f}%)",
            f"  20-39: {medium} stocks ({medium / total_stocks * 100:.1f}%)",
            f"  >= 40: {high} stocks ({high / total_stocks * 100:.1f}%)",
            "",
        ]
        print("\n".join(report_lines))

        # This is an informational test - it should not fail.
        # The assertion only marks it as passed.
        assert total_stocks > 0


if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s"])