# Listing metadata from the source viewer: Python, 66 lines, 2.0 KiB.
import numpy as np
import pandas as pd
from qlib.data.dataset import DatasetH
from qlib.utils import init_instance_by_config

# 1. Build a small synthetic (datetime, instrument) panel to feed the handler.
# A seeded generator keeps the printed frames identical from run to run.
rng = np.random.default_rng(seed=0)

# date_range already yields a DatetimeIndex, so no extra to_datetime call
# is needed.
dates = pd.date_range("2020-01-01", "2020-01-10")
instruments = ["SH600000", "SH600001"]
index = pd.MultiIndex.from_product(
    [dates, instruments], names=["datetime", "instrument"]
)

data = {
    "feature_1": rng.standard_normal(len(index)),
    "feature_2": rng.standard_normal(len(index)),
    "label": rng.standard_normal(len(index)) * 0.01,
}
my_df = pd.DataFrame(data, index=index)
my_df.iloc[1, 0] = np.nan  # deliberately inject a missing feature value
my_df.iloc[5, 2] = np.nan  # deliberately inject a missing label value

print("----------- 原始 DataFrame -----------")
print(my_df.head())

# 2. Describe the handler: a StaticDataLoader wrapping the in-memory frame,
#    plus the processor lists that go with it.
_PROCESSOR_MODULE = "qlib.data.dataset.processor"

# Key part: the loader receives the DataFrame itself through "config".
_static_loader = {
    "class": "StaticDataLoader",
    "module_path": "qlib.data.dataset.loader",
    "kwargs": {"config": my_df},
}

# Fillna step with fill_value=0; it is listed only under learn_processors.
_fill_missing = {
    "class": "Fillna",
    "module_path": _PROCESSOR_MODULE,
    "kwargs": {"fill_value": 0},
}

data_handler_config = {
    "class": "DataHandlerLP",
    "module_path": "qlib.data.dataset.handler",
    "kwargs": {
        "data_loader": _static_loader,
        "shared_processors": [],
        # Optional step, currently disabled:
        #   {"class": "DropnaLabel", "module_path": "qlib.data.dataset.processor"}
        "infer_processors": [],
        "learn_processors": [_fill_missing],
    },
}

# 3. Instantiate the DataHandler from the configuration.
# Building the handler is expected to load the StaticDataLoader data and run
# the configured processors (NOTE(review): confirm against the qlib version
# in use).
dh = init_instance_by_config(data_handler_config)

# Each segment is a (start, end) date pair. The train segment ends right
# before the valid segment begins so the two splits never share dates
# (previously train ran through 2022-12-31 and overlapped valid).
ds = DatasetH(
    dh,
    segments={
        "train": ("20190101", "20211231"),
        "valid": ("20220101", "20231231"),
        "test": ("20240101", "20250101"),
    },
)

# 4. Check the result.
# With the handler configuration defined in this script, the learn view
# (data_key="learn") has only Fillna configured; no DropnaLabel or ZScoreNorm
# step is present.
# The segment name is the first positional parameter of DatasetH.prepare, so
# it is passed exactly once; supplying it both positionally and as
# segments=... raises TypeError. The column set is left at its default.
learn_data = ds.prepare("train", data_key="learn")

print("----------- train DataFrame -----------")
print(learn_data)