Files
ProStock/tests/test_daily_storage.py
liaozhaorun e58b39970c feat: HDF5迁移至DuckDB存储
- 新增DuckDB Storage与ThreadSafeStorage实现
- 新增db_manager模块支持增量同步策略
- DataLoader与Sync模块适配DuckDB
- 补充迁移相关文档与测试
- 修复README文档链接
2026-02-23 00:07:21 +08:00

243 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for DuckDB storage validation.
Validates two key points:
1. All stocks from stock_basic.csv are saved in daily table
2. No abnormal data with very few data points (< 10 rows per stock)
使用 3 个月的真实数据进行测试 (2024年1月-3月)
"""
import pytest
import pandas as pd
from datetime import datetime, timedelta
from src.data.storage import Storage
from src.data.api_wrappers.api_stock_basic import _get_csv_path
class TestDailyStorageValidation:
"""Test daily table storage integrity and completeness."""
# 测试数据时间范围3个月
TEST_START_DATE = "20240101"
TEST_END_DATE = "20240331"
@pytest.fixture
def storage(self):
"""Create storage instance."""
return Storage()
@pytest.fixture
def stock_basic_df(self):
"""Load stock basic data from CSV."""
csv_path = _get_csv_path()
if not csv_path.exists():
pytest.skip(f"stock_basic.csv not found at {csv_path}")
return pd.read_csv(csv_path)
@pytest.fixture
def daily_df(self, storage):
"""Load daily data from DuckDB (3 months)."""
if not storage.exists("daily"):
pytest.skip("daily table not found in DuckDB")
# 从 DuckDB 加载 3 个月数据
df = storage.load(
"daily", start_date=self.TEST_START_DATE, end_date=self.TEST_END_DATE
)
if df.empty:
pytest.skip(
f"No data found for period {self.TEST_START_DATE} to {self.TEST_END_DATE}"
)
return df
def test_duckdb_connection(self, storage):
"""Test DuckDB connection and basic operations."""
assert storage.exists("daily") or True # 至少连接成功
print(f"[TEST] DuckDB connection successful")
def test_load_3months_data(self, storage):
"""Test loading 3 months of data from DuckDB."""
df = storage.load(
"daily", start_date=self.TEST_START_DATE, end_date=self.TEST_END_DATE
)
if df.empty:
pytest.skip("No data available for testing period")
# 验证数据覆盖范围
dates = df["trade_date"].astype(str)
min_date = dates.min()
max_date = dates.max()
print(f"[TEST] Loaded {len(df)} rows from {min_date} to {max_date}")
assert len(df) > 0, "Should have data in the 3-month period"
def test_all_stocks_saved(self, storage, stock_basic_df, daily_df):
"""Verify all stocks from stock_basic are saved in daily table.
This test ensures data completeness - every stock in stock_basic
should have corresponding data in daily table.
"""
if daily_df.empty:
pytest.fail("daily table is empty for test period")
# Get unique stock codes from both sources
expected_codes = set(stock_basic_df["ts_code"].dropna().unique())
actual_codes = set(daily_df["ts_code"].dropna().unique())
# Check for missing stocks
missing_codes = expected_codes - actual_codes
if missing_codes:
missing_list = sorted(missing_codes)
# Show first 20 missing stocks as sample
sample = missing_list[:20]
msg = f"Found {len(missing_codes)} stocks missing from daily table:\n"
msg += f"Sample missing: {sample}\n"
if len(missing_list) > 20:
msg += f"... and {len(missing_list) - 20} more"
# 对于3个月数据允许部分股票缺失可能是新股或未上市
print(f"[WARNING] {msg}")
# 只验证至少有80%的股票存在
coverage = len(actual_codes) / len(expected_codes) * 100
assert coverage >= 80, (
f"Stock coverage {coverage:.1f}% is below 80% threshold"
)
else:
print(
f"[TEST] All {len(expected_codes)} stocks from stock_basic are present in daily table"
)
def test_no_stock_with_insufficient_data(self, storage, daily_df):
"""Verify no stock has abnormally few data points (< 5 rows in 3 months).
Stocks with very few data points may indicate sync failures,
delisted stocks not properly handled, or data corruption.
"""
if daily_df.empty:
pytest.fail("daily table is empty for test period")
# Count rows per stock
stock_counts = daily_df.groupby("ts_code").size()
# Find stocks with less than 5 data points in 3 months
insufficient_stocks = stock_counts[stock_counts < 5]
if not insufficient_stocks.empty:
# Separate into categories for better reporting
empty_stocks = stock_counts[stock_counts == 0]
very_few_stocks = stock_counts[(stock_counts > 0) & (stock_counts < 5)]
msg = f"Found {len(insufficient_stocks)} stocks with insufficient data (< 5 rows in 3 months):\n"
if not empty_stocks.empty:
msg += f"\nEmpty stocks (0 rows): {len(empty_stocks)}\n"
sample = sorted(empty_stocks.index[:10].tolist())
msg += f"Sample: {sample}"
if not very_few_stocks.empty:
msg += f"\nVery few data points (1-4 rows): {len(very_few_stocks)}\n"
# Show counts for these stocks
sample = very_few_stocks.sort_values().head(20)
msg += "Sample (ts_code: count):\n"
for code, count in sample.items():
msg += f" {code}: {count} rows\n"
# 对于3个月数据允许少量异常但比例不能超过5%
if len(insufficient_stocks) / len(stock_counts) > 0.05:
pytest.fail(msg)
else:
print(f"[WARNING] {msg}")
print(f"[TEST] All stocks have sufficient data (>= 5 rows in 3 months)")
def test_data_integrity_basic(self, storage, daily_df):
"""Basic data integrity checks for daily table."""
if daily_df.empty:
pytest.fail("daily table is empty for test period")
# Check required columns exist
required_columns = ["ts_code", "trade_date"]
missing_columns = [
col for col in required_columns if col not in daily_df.columns
]
if missing_columns:
pytest.fail(f"Missing required columns: {missing_columns}")
# Check for null values in key columns
null_ts_code = daily_df["ts_code"].isna().sum()
null_trade_date = daily_df["trade_date"].isna().sum()
if null_ts_code > 0:
pytest.fail(f"Found {null_ts_code} rows with null ts_code")
if null_trade_date > 0:
pytest.fail(f"Found {null_trade_date} rows with null trade_date")
print(f"[TEST] Data integrity check passed for 3-month period")
def test_polars_export(self, storage):
"""Test Polars export functionality."""
if not storage.exists("daily"):
pytest.skip("daily table not found")
import polars as pl
# 测试 load_polars 方法
df = storage.load_polars(
"daily", start_date=self.TEST_START_DATE, end_date=self.TEST_END_DATE
)
assert isinstance(df, pl.DataFrame), "Should return Polars DataFrame"
print(f"[TEST] Polars export successful: {len(df)} rows")
def test_stock_data_coverage_report(self, storage, daily_df):
"""Generate a summary report of stock data coverage.
This test provides visibility into data distribution without failing.
"""
if daily_df.empty:
pytest.skip("daily table is empty - cannot generate report")
stock_counts = daily_df.groupby("ts_code").size()
# Calculate statistics
total_stocks = len(stock_counts)
min_count = stock_counts.min()
max_count = stock_counts.max()
median_count = stock_counts.median()
mean_count = stock_counts.mean()
# Distribution buckets (adjusted for 3-month period, ~60 trading days)
very_low = (stock_counts < 5).sum()
low = ((stock_counts >= 5) & (stock_counts < 20)).sum()
medium = ((stock_counts >= 20) & (stock_counts < 40)).sum()
high = (stock_counts >= 40).sum()
report = f"""
=== Stock Data Coverage Report (3 months: {self.TEST_START_DATE} to {self.TEST_END_DATE}) ===
Total stocks: {total_stocks}
Data points per stock:
Min: {min_count}
Max: {max_count}
Median: {median_count:.0f}
Mean: {mean_count:.1f}
Distribution:
< 5 rows: {very_low} stocks ({very_low / total_stocks * 100:.1f}%)
5-19: {low} stocks ({low / total_stocks * 100:.1f}%)
20-39: {medium} stocks ({medium / total_stocks * 100:.1f}%)
>= 40: {high} stocks ({high / total_stocks * 100:.1f}%)
"""
print(report)
# This is an informational test - it should not fail
# But we assert to mark it as passed
assert total_stocks > 0
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])