"""Demo: feed an in-memory DataFrame to qlib through StaticDataLoader.

Builds a small random (datetime, instrument)-indexed frame with two injected
NaNs, wraps it in a DataHandlerLP configured with a Fillna learn-processor,
exposes it as a DatasetH, and prints the processed "train" segment.
"""
import numpy as np
import pandas as pd

from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandlerLP
from qlib.utils import init_instance_by_config

# 1. Build the toy data: 10 calendar days x 2 instruments.
# pd.date_range already returns a DatetimeIndex, so no pd.to_datetime is needed.
dates = pd.date_range("2020-01-01", "2020-01-10")
instruments = ["SH600000", "SH600001"]
index = pd.MultiIndex.from_product(
    [dates, instruments], names=["datetime", "instrument"]
)

data = {
    "feature_1": np.random.randn(len(index)),
    "feature_2": np.random.randn(len(index)),
    "label": np.random.randn(len(index)) * 0.01,
}
my_df = pd.DataFrame(data, index=index)
my_df.iloc[1, 0] = np.nan  # inject a missing feature value
my_df.iloc[5, 2] = np.nan  # inject a missing label value

print("----------- 原始 DataFrame -----------")
print(my_df.head())

# 2. Full handler config: a StaticDataLoader plus the processor lists.
data_handler_config = {
    "class": "DataHandlerLP",
    "module_path": "qlib.data.dataset.handler",
    "kwargs": {
        # Core part: the data loader receives the DataFrame directly.
        "data_loader": {
            "class": "StaticDataLoader",
            "module_path": "qlib.data.dataset.loader",
            "kwargs": {
                "config": my_df,
            },
        },
        "shared_processors": [],
        "infer_processors": [],
        "learn_processors": [
            # NOTE(review): no fields_group is given, so this presumably fills
            # missing labels with 0 as well as features - confirm that is
            # intended (dropping rows with a missing label is the usual
            # alternative).
            {
                "class": "Fillna",
                "module_path": "qlib.data.dataset.processor",
                "kwargs": {"fill_value": 0},
            },
        ],
    },
}

# 3. Instantiate the handler from the config.
dh = init_instance_by_config(data_handler_config)

# Segments must not overlap: the original "valid" range started inside
# "train" (2022), which would put the same rows in both splits.
# The toy data (2020-01-01..2020-01-10) falls entirely inside "train".
ds = DatasetH(
    dh,
    segments={
        "train": ("20190101", "20221231"),
        "valid": ("20230101", "20231231"),
        "test": ("20240101", "20250101"),
    },
)

# 4. Inspect the result.
# The learn (DK_L) data has only gone through the configured Fillna processor.
# `segments` is the first positional parameter of prepare(); passing it both
# positionally and by keyword (as the original did) raises TypeError.
learn_data = ds.prepare(
    "train",
    col_set=DataHandlerLP.CS_ALL,
    data_key=DataHandlerLP.DK_L,
)
print("----------- train DataFrame -----------")
print(learn_data)