refactor: 调整项目结构,新增数据同步和交易日历模块
- 移除 pyproject.toml,改用 uv 管理项目 - 新增 data/* 忽略规则 - 新增数据同步模块 sync.py - 新增交易日历模块 trade_cal.py - 新增相关测试用例 - 更新 API 文档
This commit is contained in:
190
tests/test_daily_storage.py
Normal file
190
tests/test_daily_storage.py
Normal file
@@ -0,0 +1,190 @@
|
||||
"""Tests for data/daily.h5 storage validation.
|
||||
|
||||
Validates two key points:
|
||||
1. All stocks from stock_basic.csv are saved in daily.h5
|
||||
2. No abnormal data with very few data points (< 10 rows per stock)
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from src.data.storage import Storage
|
||||
from src.data.stock_basic import _get_csv_path
|
||||
|
||||
|
||||
class TestDailyStorageValidation:
    """Test daily.h5 storage integrity and completeness."""

    @pytest.fixture
    def storage(self):
        """Create the Storage instance used to locate and read daily.h5."""
        return Storage()

    @pytest.fixture
    def stock_basic_df(self):
        """Load the stock universe from stock_basic.csv, skipping if absent."""
        csv_path = _get_csv_path()
        if not csv_path.exists():
            pytest.skip(f"stock_basic.csv not found at {csv_path}")
        return pd.read_csv(csv_path)

    @pytest.fixture
    def daily_df(self, storage):
        """Load daily bar data from HDF5, skipping if missing or unreadable."""
        if not storage.exists("daily"):
            pytest.skip("daily.h5 not found")
        # HDF5 stores keys with a leading slash, so handle both '/daily' and 'daily'.
        file_path = storage._get_file_path("daily")
        try:
            with pd.HDFStore(file_path, mode="r") as store:
                keys = store.keys()  # fetch once instead of re-scanning per check
                if "/daily" in keys:
                    return store["/daily"]
                if "daily" in keys:
                    return store["daily"]
                return pd.DataFrame()
        except Exception as e:
            pytest.skip(f"Error loading daily.h5: {e}")

    def test_all_stocks_saved(self, stock_basic_df, daily_df):
        """Verify all stocks from stock_basic are saved in daily.h5.

        This test ensures data completeness - every stock in stock_basic
        should have corresponding data in daily.h5.
        """
        if daily_df.empty:
            pytest.fail("daily.h5 is empty")

        # Get unique stock codes from both sources
        expected_codes = set(stock_basic_df["ts_code"].dropna().unique())
        actual_codes = set(daily_df["ts_code"].dropna().unique())

        # Check for missing stocks
        missing_codes = expected_codes - actual_codes

        if missing_codes:
            missing_list = sorted(missing_codes)
            # Show first 20 missing stocks as a sample
            sample = missing_list[:20]
            msg = f"Found {len(missing_codes)} stocks missing from daily.h5:\n"
            msg += f"Sample missing: {sample}\n"
            if len(missing_list) > 20:
                msg += f"... and {len(missing_list) - 20} more"
            pytest.fail(msg)

        # All stocks present
        assert len(actual_codes) > 0, "No stocks found in daily.h5"
        print(
            f"[TEST] All {len(expected_codes)} stocks from stock_basic are present in daily.h5"
        )

    def test_no_stock_with_insufficient_data(self, daily_df):
        """Verify no stock has abnormally few data points (< 10 rows).

        Stocks with very few data points may indicate sync failures,
        delisted stocks not properly handled, or data corruption.
        """
        if daily_df.empty:
            pytest.fail("daily.h5 is empty")

        # Count rows per stock. Note: groupby().size() only yields groups that
        # actually occur in the data, so every count is >= 1 - a separate
        # "0 rows" bucket is impossible here and is intentionally not reported.
        stock_counts = daily_df.groupby("ts_code").size()

        # Find stocks with fewer than 10 data points
        insufficient_stocks = stock_counts[stock_counts < 10]

        if not insufficient_stocks.empty:
            msg = f"Found {len(insufficient_stocks)} stocks with insufficient data (< 10 rows):\n"
            # Show the worst offenders with their row counts
            sample = insufficient_stocks.sort_values().head(20)
            msg += "Sample (ts_code: count):\n"
            for code, count in sample.items():
                msg += f"  {code}: {count} rows\n"
            pytest.fail(msg)

        print("[TEST] All stocks have sufficient data (>= 10 rows)")

    def test_data_integrity_basic(self, daily_df):
        """Basic data integrity checks for daily.h5."""
        if daily_df.empty:
            pytest.fail("daily.h5 is empty")

        # Check required columns exist
        required_columns = ["ts_code", "trade_date"]
        missing_columns = [
            col for col in required_columns if col not in daily_df.columns
        ]

        if missing_columns:
            pytest.fail(f"Missing required columns: {missing_columns}")

        # Check for null values in key columns
        null_ts_code = daily_df["ts_code"].isna().sum()
        null_trade_date = daily_df["trade_date"].isna().sum()

        if null_ts_code > 0:
            pytest.fail(f"Found {null_ts_code} rows with null ts_code")
        if null_trade_date > 0:
            pytest.fail(f"Found {null_trade_date} rows with null trade_date")

        print("[TEST] Data integrity check passed")

    def test_stock_data_coverage_report(self, daily_df):
        """Generate a summary report of stock data coverage.

        This test provides visibility into data distribution without failing.
        """
        if daily_df.empty:
            pytest.skip("daily.h5 is empty - cannot generate report")

        stock_counts = daily_df.groupby("ts_code").size()

        # Calculate statistics
        total_stocks = len(stock_counts)
        min_count = stock_counts.min()
        max_count = stock_counts.max()
        median_count = stock_counts.median()
        mean_count = stock_counts.mean()

        # Distribution buckets
        very_low = (stock_counts < 10).sum()
        low = ((stock_counts >= 10) & (stock_counts < 100)).sum()
        medium = ((stock_counts >= 100) & (stock_counts < 500)).sum()
        high = (stock_counts >= 500).sum()

        report = f"""
=== Stock Data Coverage Report ===
Total stocks: {total_stocks}
Data points per stock:
  Min: {min_count}
  Max: {max_count}
  Median: {median_count:.0f}
  Mean: {mean_count:.1f}

Distribution:
  < 10 rows: {very_low} stocks ({very_low / total_stocks * 100:.1f}%)
  10-99: {low} stocks ({low / total_stocks * 100:.1f}%)
  100-499: {medium} stocks ({medium / total_stocks * 100:.1f}%)
  >= 500: {high} stocks ({high / total_stocks * 100:.1f}%)
"""
        print(report)

        # This is an informational test - it should not fail.
        # We assert only that data exists so the test is marked as passed.
        assert total_stocks > 0
if __name__ == "__main__":
    # Allow running this test module directly: -v for verbose test names,
    # -s so the coverage-report print() output is not captured by pytest.
    pytest.main([__file__, "-v", "-s"])
20
tests/test_tushare_api.py
Normal file
20
tests/test_tushare_api.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Tushare API 验证脚本 - 快速生成 pro 对象用于调试。"""
|
||||
|
||||
import os
|
||||
|
||||
os.environ.setdefault("DATA_PATH", "data")
|
||||
|
||||
from src.data.config import get_config
|
||||
import tushare as ts
|
||||
|
||||
config = get_config()
|
||||
token = config.tushare_token
|
||||
|
||||
if not token:
|
||||
raise ValueError("请在 config/.env.local 中配置 TUSHARE_TOKEN")
|
||||
|
||||
pro = ts.pro_api(token)
|
||||
print(f"pro_api 对象已创建,token: {token[:10]}...")
|
||||
|
||||
df = pro.query('daily', ts_code='000001.SZ', start_date='20180702', end_date='20180718')
|
||||
print(df)
|
||||
Reference in New Issue
Block a user