2026-02-23 00:07:21 +08:00
|
|
|
|
"""Tests for DuckDB storage validation.
|
2026-02-01 04:44:01 +08:00
|
|
|
|
|
|
|
|
|
|
Validates two key points:
|
2026-02-23 00:07:21 +08:00
|
|
|
|
1. All stocks from stock_basic.csv are saved in daily table
|
2026-02-01 04:44:01 +08:00
|
|
|
|
2. No abnormal data with very few data points (< 5 rows per stock)
|
2026-02-23 00:07:21 +08:00
|
|
|
|
|
|
|
|
|
|
Uses 3 months of real data for testing (January-March 2024)
|
2026-02-01 04:44:01 +08:00
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
import pandas as pd
|
2026-02-23 00:07:21 +08:00
|
|
|
|
from datetime import datetime, timedelta
|
2026-02-01 04:44:01 +08:00
|
|
|
|
from src.data.storage import Storage
|
2026-02-21 03:43:30 +08:00
|
|
|
|
from src.data.api_wrappers.api_stock_basic import _get_csv_path
|
2026-02-01 04:44:01 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestDailyStorageValidation:
    """Test daily table storage integrity and completeness.

    Every test reads the fixed window ``TEST_START_DATE``..``TEST_END_DATE``.
    Tests that need data skip (rather than fail) when the ``daily`` table,
    ``stock_basic.csv`` or rows for the window are unavailable.
    """

    # Test data window: 3 months.
    TEST_START_DATE = "20240101"
    TEST_END_DATE = "20240331"

    # A stock with fewer rows than this inside the window is "insufficient".
    MIN_ROWS_PER_STOCK = 5
    # Minimum share (percent) of stock_basic codes that must appear in daily.
    MIN_COVERAGE_PCT = 80
    # Maximum tolerated share of stocks with insufficient data.
    MAX_INSUFFICIENT_RATIO = 0.05

    @pytest.fixture
    def storage(self):
        """Create storage instance."""
        return Storage()

    @pytest.fixture
    def stock_basic_df(self):
        """Load stock basic data from CSV; skip when the file is missing."""
        csv_path = _get_csv_path()
        if not csv_path.exists():
            pytest.skip(f"stock_basic.csv not found at {csv_path}")
        return pd.read_csv(csv_path)

    @pytest.fixture
    def daily_df(self, storage):
        """Load daily data from DuckDB (3 months).

        Skips the requesting test when the ``daily`` table does not exist or
        holds no rows for the test window, so tests using this fixture always
        receive a non-empty frame.
        """
        if not storage.exists("daily"):
            pytest.skip("daily table not found in DuckDB")

        # Load the 3-month window from DuckDB.
        df = storage.load(
            "daily", start_date=self.TEST_START_DATE, end_date=self.TEST_END_DATE
        )

        if df.empty:
            pytest.skip(
                f"No data found for period {self.TEST_START_DATE} to {self.TEST_END_DATE}"
            )

        return df

    def test_duckdb_connection(self, storage):
        """Test DuckDB connection and basic operations.

        The table is allowed to be absent; the test only requires that the
        query can be issued. Any exception raised by the call fails the test.
        """
        # The previous `assert storage.exists("daily") or True` was always
        # true; the return value is irrelevant here, the call must not raise.
        storage.exists("daily")
        print("[TEST] DuckDB connection successful")

    def test_load_3months_data(self, storage):
        """Test loading 3 months of data from DuckDB."""
        # Same precondition as the daily_df fixture and test_polars_export.
        if not storage.exists("daily"):
            pytest.skip("daily table not found in DuckDB")

        df = storage.load(
            "daily", start_date=self.TEST_START_DATE, end_date=self.TEST_END_DATE
        )

        if df.empty:
            pytest.skip("No data available for testing period")

        # Report the date range actually covered by the loaded rows.
        dates = df["trade_date"].astype(str)
        min_date = dates.min()
        max_date = dates.max()

        print(f"[TEST] Loaded {len(df)} rows from {min_date} to {max_date}")
        assert len(df) > 0, "Should have data in the 3-month period"

    def test_all_stocks_saved(self, storage, stock_basic_df, daily_df):
        """Verify stocks from stock_basic are saved in daily table.

        Missing stocks are tolerated for a 3-month window (e.g. new or not yet
        listed stocks) and only reported, but at least ``MIN_COVERAGE_PCT``
        percent of the stock_basic codes must be present in the daily data.
        """
        # Get unique stock codes from both sources.
        expected_codes = set(stock_basic_df["ts_code"].dropna().unique())
        actual_codes = set(daily_df["ts_code"].dropna().unique())

        if not expected_codes:
            # Nothing to compare against; passing here would be vacuous.
            pytest.skip("stock_basic.csv contains no ts_code values")

        # Check for missing stocks.
        missing_codes = expected_codes - actual_codes
        if not missing_codes:
            print(
                f"[TEST] All {len(expected_codes)} stocks from stock_basic are present in daily table"
            )
            return

        missing_list = sorted(missing_codes)
        # Show first 20 missing stocks as sample.
        sample = missing_list[:20]
        msg = f"Found {len(missing_codes)} stocks missing from daily table:\n"
        msg += f"Sample missing: {sample}\n"
        if len(missing_list) > 20:
            msg += f"... and {len(missing_list) - 20} more"
        print(f"[WARNING] {msg}")

        # Count only expected codes that are present. Codes that exist in the
        # daily table but not in stock_basic must not inflate the coverage.
        covered = len(expected_codes) - len(missing_codes)
        coverage = covered / len(expected_codes) * 100
        assert coverage >= self.MIN_COVERAGE_PCT, (
            f"Stock coverage {coverage:.1f}% is below {self.MIN_COVERAGE_PCT}% threshold"
        )

    def test_no_stock_with_insufficient_data(self, storage, daily_df):
        """Verify no stock has abnormally few data points (< 5 rows in 3 months).

        Stocks with very few data points may indicate sync failures,
        delisted stocks not properly handled, or data corruption.
        A small share of such stocks (up to ``MAX_INSUFFICIENT_RATIO``) is
        tolerated and reported as a warning; above that the test fails.
        """
        min_rows = self.MIN_ROWS_PER_STOCK

        # Count rows per stock.
        stock_counts = daily_df.groupby("ts_code").size()

        # Find stocks with too few data points in the window.
        insufficient_stocks = stock_counts[stock_counts < min_rows]

        if insufficient_stocks.empty:
            # Only claim success when there really is nothing to report.
            print(
                f"[TEST] All stocks have sufficient data (>= {min_rows} rows in 3 months)"
            )
            return

        # Separate into categories for better reporting.
        empty_stocks = insufficient_stocks[insufficient_stocks == 0]
        very_few_stocks = insufficient_stocks[insufficient_stocks > 0]

        parts = [
            f"Found {len(insufficient_stocks)} stocks with insufficient data (< {min_rows} rows in 3 months):\n"
        ]

        if not empty_stocks.empty:
            parts.append(f"\nEmpty stocks (0 rows): {len(empty_stocks)}\n")
            sample = sorted(empty_stocks.index[:10].tolist())
            parts.append(f"Sample: {sample}")

        if not very_few_stocks.empty:
            parts.append(
                f"\nVery few data points (1-{min_rows - 1} rows): {len(very_few_stocks)}\n"
            )
            # Show counts for these stocks, smallest first.
            parts.append("Sample (ts_code: count):\n")
            parts.extend(
                f" {code}: {count} rows\n"
                for code, count in very_few_stocks.sort_values().head(20).items()
            )

        msg = "".join(parts)

        # For a 3-month window a few anomalies are allowed, but their share
        # must not exceed the configured ratio.
        if len(insufficient_stocks) / len(stock_counts) > self.MAX_INSUFFICIENT_RATIO:
            pytest.fail(msg)
        print(f"[WARNING] {msg}")

    def test_data_integrity_basic(self, storage, daily_df):
        """Basic data integrity checks for daily table.

        Requires the ``ts_code`` and ``trade_date`` columns to exist and to
        contain no null values.
        """
        # Check required columns exist.
        required_columns = ["ts_code", "trade_date"]
        missing_columns = [
            col for col in required_columns if col not in daily_df.columns
        ]
        if missing_columns:
            pytest.fail(f"Missing required columns: {missing_columns}")

        # Check for null values in key columns.
        null_ts_code = daily_df["ts_code"].isna().sum()
        null_trade_date = daily_df["trade_date"].isna().sum()

        if null_ts_code > 0:
            pytest.fail(f"Found {null_ts_code} rows with null ts_code")
        if null_trade_date > 0:
            pytest.fail(f"Found {null_trade_date} rows with null trade_date")

        print("[TEST] Data integrity check passed for 3-month period")

    def test_polars_export(self, storage):
        """Test Polars export functionality."""
        if not storage.exists("daily"):
            pytest.skip("daily table not found")

        import polars as pl

        # Exercise the load_polars method.
        df = storage.load_polars(
            "daily", start_date=self.TEST_START_DATE, end_date=self.TEST_END_DATE
        )

        assert isinstance(df, pl.DataFrame), "Should return Polars DataFrame"
        print(f"[TEST] Polars export successful: {len(df)} rows")

    def test_stock_data_coverage_report(self, storage, daily_df):
        """Generate a summary report of stock data coverage.

        This test provides visibility into data distribution without failing
        on the distribution itself; it only requires at least one stock.
        """
        min_rows = self.MIN_ROWS_PER_STOCK
        stock_counts = daily_df.groupby("ts_code").size()

        # Assert before computing percentages so an empty grouping cannot
        # cause a division by zero below.
        total_stocks = len(stock_counts)
        assert total_stocks > 0

        # Calculate statistics.
        min_count = stock_counts.min()
        max_count = stock_counts.max()
        median_count = stock_counts.median()
        mean_count = stock_counts.mean()

        # Distribution buckets (adjusted for 3-month period, ~60 trading days).
        very_low = (stock_counts < min_rows).sum()
        low = ((stock_counts >= min_rows) & (stock_counts < 20)).sum()
        medium = ((stock_counts >= 20) & (stock_counts < 40)).sum()
        high = (stock_counts >= 40).sum()

        # Leading and trailing empty items reproduce the blank first and last
        # lines of the report.
        report = "\n".join(
            [
                "",
                f"=== Stock Data Coverage Report (3 months: {self.TEST_START_DATE} to {self.TEST_END_DATE}) ===",
                f"Total stocks: {total_stocks}",
                "Data points per stock:",
                f"Min: {min_count}",
                f"Max: {max_count}",
                f"Median: {median_count:.0f}",
                f"Mean: {mean_count:.1f}",
                "",
                "Distribution:",
                f"< {min_rows} rows: {very_low} stocks ({very_low / total_stocks * 100:.1f}%)",
                f"{min_rows}-19: {low} stocks ({low / total_stocks * 100:.1f}%)",
                f"20-39: {medium} stocks ({medium / total_stocks * 100:.1f}%)",
                f">= 40: {high} stocks ({high / total_stocks * 100:.1f}%)",
                "",
            ]
        )
        print(report)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Allow running this module directly as a script: hands this file to pytest
# with the "-v" and "-s" command-line flags.
if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s"])
|