"""Tests for data sync with REAL data (read-only). Tests verify: 1. get_global_last_date() correctly reads local data's max date 2. Incremental sync date calculation (local_last_date + 1) 3. Full sync date calculation (20180101) 4. Multi-stock scenario with real data ⚠️ IMPORTANT: These tests ONLY read data, no write operations. - NO sync_all() calls (writes daily.h5) - NO check_sync_needed() calls (writes trade_cal.h5) """ import pytest import pandas as pd from pathlib import Path from src.data.sync import ( DataSync, get_next_date, DEFAULT_START_DATE, ) from src.data.storage import Storage class TestDataSyncReadOnly: """Read-only tests for data sync - verify date calculation logic.""" @pytest.fixture def storage(self): """Create storage instance.""" return Storage() @pytest.fixture def data_sync(self): """Create DataSync instance.""" return DataSync() @pytest.fixture def daily_exists(self, storage): """Check if daily.h5 exists.""" return storage.exists("daily") def test_daily_h5_exists(self, storage): """Verify daily.h5 data file exists before running tests.""" assert storage.exists("daily"), ( "daily.h5 not found. Please run full sync first: " "uv run python -c 'from src.data.sync import sync_all; sync_all(force_full=True)'" ) def test_get_global_last_date(self, data_sync, daily_exists): """Test get_global_last_date returns correct max date from local data.""" if not daily_exists: pytest.skip("daily.h5 not found") last_date = data_sync.get_global_last_date() # Verify it's a valid date string assert last_date is not None, "get_global_last_date returned None" assert isinstance(last_date, str), f"Expected str, got {type(last_date)}" assert len(last_date) == 8, f"Expected 8-digit date, got {last_date}" assert last_date.isdigit(), f"Expected numeric date, got {last_date}" # Verify by reading storage directly daily_data = data_sync.storage.load("daily") expected_max = str(daily_data["trade_date"].max()) assert last_date == expected_max, ( f"get_global_last_date returned {last_date}, " f"but actual max date is {expected_max}" ) print(f"[TEST] Local data last date: {last_date}") def test_incremental_sync_date_calculation(self, data_sync, daily_exists): """Test incremental sync: start_date = local_last_date + 1. This verifies that when local data exists, incremental sync should fetch data from (local_last_date + 1), not from 20180101. """ if not daily_exists: pytest.skip("daily.h5 not found") # Get local last date local_last_date = data_sync.get_global_last_date() assert local_last_date is not None, "No local data found" # Calculate expected incremental start date expected_start_date = get_next_date(local_last_date) # Verify the calculation is correct local_last_int = int(local_last_date) expected_int = local_last_int + 1 actual_int = int(expected_start_date) assert actual_int == expected_int, ( f"Incremental start date calculation error: " f"expected {expected_int}, got {actual_int}" ) print( f"[TEST] Incremental sync: local_last={local_last_date}, " f"start_date should be {expected_start_date}" ) # Verify this is NOT 20180101 (would be full sync) assert expected_start_date != DEFAULT_START_DATE, ( f"Incremental sync should NOT start from {DEFAULT_START_DATE}" ) def test_full_sync_date_calculation(self): """Test full sync: start_date = 20180101 when force_full=True. This verifies that force_full=True always starts from 20180101. """ # Full sync should always use DEFAULT_START_DATE full_sync_start = DEFAULT_START_DATE assert full_sync_start == "20180101", ( f"Full sync should start from 20180101, got {full_sync_start}" ) print(f"[TEST] Full sync start date: {full_sync_start}") def test_date_comparison_logic(self, data_sync, daily_exists): """Test date comparison: incremental vs full sync selection logic. Verify that: - If local_last_date < today: incremental sync needed - If local_last_date >= today: no sync needed """ if not daily_exists: pytest.skip("daily.h5 not found") from datetime import datetime local_last_date = data_sync.get_global_last_date() today = datetime.now().strftime("%Y%m%d") local_last_int = int(local_last_date) today_int = int(today) # Log the comparison print( f"[TEST] Date comparison: local_last={local_last_date} ({local_last_int}), " f"today={today} ({today_int})" ) # This test just verifies the comparison logic works if local_last_int < today_int: print("[TEST] Local data is older than today - sync needed") # Incremental sync should fetch from local_last_date + 1 sync_start = get_next_date(local_last_date) assert int(sync_start) > local_last_int, ( "Sync start should be after local last" ) else: print("[TEST] Local data is up-to-date - no sync needed") def test_get_all_stock_codes_real_data(self, data_sync, daily_exists): """Test get_all_stock_codes returns multiple real stock codes.""" if not daily_exists: pytest.skip("daily.h5 not found") codes = data_sync.get_all_stock_codes() # Verify it's a list assert isinstance(codes, list), f"Expected list, got {type(codes)}" assert len(codes) > 0, "No stock codes found" # Verify multiple stocks assert len(codes) >= 10, ( f"Expected at least 10 stocks for multi-stock test, got {len(codes)}" ) # Verify format (should be like 000001.SZ, 600000.SH) sample_codes = codes[:5] for code in sample_codes: assert "." in code, f"Invalid stock code format: {code}" suffix = code.split(".")[-1] assert suffix in ["SZ", "SH"], f"Invalid exchange suffix: {suffix}" print(f"[TEST] Found {len(codes)} stock codes (sample: {sample_codes})") def test_multi_stock_date_range(self, data_sync, daily_exists): """Test that multiple stocks share the same date range in local data. This verifies that local data has consistent date coverage across stocks. """ if not daily_exists: pytest.skip("daily.h5 not found") daily_data = data_sync.storage.load("daily") # Get date range for each stock stock_dates = daily_data.groupby("ts_code")["trade_date"].agg(["min", "max"]) # Get global min and max global_min = str(daily_data["trade_date"].min()) global_max = str(daily_data["trade_date"].max()) print(f"[TEST] Global date range: {global_min} to {global_max}") print(f"[TEST] Total stocks: {len(stock_dates)}") # Verify we have data for multiple stocks assert len(stock_dates) >= 10, ( f"Expected at least 10 stocks, got {len(stock_dates)}" ) # Verify date range is reasonable (at least 1 year of data) global_min_int = int(global_min) global_max_int = int(global_max) days_span = global_max_int - global_min_int assert days_span > 100, ( f"Date range too small: {days_span} days. " f"Expected at least 100 days of data." ) print(f"[TEST] Date span: {days_span} days") class TestDateUtilities: """Test date utility functions.""" def test_get_next_date(self): """Test get_next_date correctly calculates next day.""" # Test normal cases assert get_next_date("20240101") == "20240102" assert get_next_date("20240131") == "20240201" # Month boundary assert get_next_date("20241231") == "20250101" # Year boundary def test_incremental_vs_full_sync_logic(self): """Test the logic difference between incremental and full sync. Incremental: start_date = local_last_date + 1 Full: start_date = 20180101 """ # Scenario 1: Local data exists local_last_date = "20240115" incremental_start = get_next_date(local_last_date) assert incremental_start == "20240116" assert incremental_start != DEFAULT_START_DATE # Scenario 2: Force full sync full_sync_start = DEFAULT_START_DATE # "20180101" assert full_sync_start == "20180101" assert incremental_start != full_sync_start print("[TEST] Incremental vs Full sync logic verified") if __name__ == "__main__": pytest.main([__file__, "-v", "-s"])