refactor(factors): 简化 add_factor API 并默认启用 metadata
- 合并 add_factor_by_name 到 add_factor,支持三种调用方式
- FactorManager 构造函数改为可选参数,使用默认路径
- FactorEngine 默认启用 metadata,无需手动配置路径
This commit is contained in:
0
src/experiment/data/factors.jsonl
Normal file
0
src/experiment/data/factors.jsonl
Normal file
@@ -22,10 +22,13 @@
|
|||||||
"source": "## 1. 导入依赖"
|
"source": "## 1. 导入依赖"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2026-03-11T16:02:49.975545Z",
|
||||||
|
"start_time": "2026-03-11T16:02:48.487347Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"outputs": [],
|
|
||||||
"execution_count": null,
|
|
||||||
"source": [
|
"source": [
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"from datetime import datetime\n",
|
"from datetime import datetime\n",
|
||||||
@@ -50,7 +53,9 @@
|
|||||||
"from src.training.components.models import LightGBMLambdaRankModel\n",
|
"from src.training.components.models import LightGBMLambdaRankModel\n",
|
||||||
"from src.training.config import TrainingConfig\n",
|
"from src.training.config import TrainingConfig\n",
|
||||||
"\n"
|
"\n"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 1
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@@ -58,35 +63,49 @@
|
|||||||
"source": "## 2. 辅助函数"
|
"source": "## 2. 辅助函数"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2026-03-11T16:02:49.989220Z",
|
||||||
|
"start_time": "2026-03-11T16:02:49.981542Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"outputs": [],
|
|
||||||
"execution_count": null,
|
|
||||||
"source": [
|
"source": [
|
||||||
"def create_factors_with_metadata(\n",
|
"def create_factors_with_metadata(\n",
|
||||||
" engine: FactorEngine, factor_definitions: dict, label_factor: dict\n",
|
" engine: FactorEngine,\n",
|
||||||
|
" selected_factors: List[str],\n",
|
||||||
|
" factor_definitions: dict,\n",
|
||||||
|
" label_factor: dict,\n",
|
||||||
") -> List[str]:\n",
|
") -> List[str]:\n",
|
||||||
" \"\"\"使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)\"\"\"\n",
|
" \"\"\"注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n",
|
||||||
" print(\"=\" * 80)\n",
|
" print(\"=\" * 80)\n",
|
||||||
" print(\"使用 metadata 注册因子\")\n",
|
" print(\"注册因子\")\n",
|
||||||
" print(\"=\" * 80)\n",
|
" print(\"=\" * 80)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # 注册所有特征因子(通过 metadata 名称)\n",
|
" # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)\n",
|
||||||
" print(\"\\n注册特征因子(从 metadata):\")\n",
|
" print(\"\\n注册特征因子(从 metadata):\")\n",
|
||||||
" for name in factor_definitions.keys():\n",
|
" for name in selected_factors:\n",
|
||||||
" engine.add_factor_by_name(name)\n",
|
" engine.add_factor(name)\n",
|
||||||
" print(f\" - {name}\")\n",
|
" print(f\" - {name}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n",
|
" # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n",
|
||||||
|
" print(\"\\n注册特征因子(表达式):\")\n",
|
||||||
|
" for name, expr in factor_definitions.items():\n",
|
||||||
|
" engine.add_factor(name, expr)\n",
|
||||||
|
" print(f\" - {name}: {expr}\")\n",
|
||||||
|
"\n",
|
||||||
|
" # 注册 label 因子(通过表达式)\n",
|
||||||
" print(\"\\n注册 Label 因子(表达式):\")\n",
|
" print(\"\\n注册 Label 因子(表达式):\")\n",
|
||||||
" for name, expr in label_factor.items():\n",
|
" for name, expr in label_factor.items():\n",
|
||||||
" engine.add_factor(name, expr)\n",
|
" engine.add_factor(name, expr)\n",
|
||||||
" print(f\" - {name}: {expr}\")\n",
|
" print(f\" - {name}: {expr}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # 从字典自动获取特征列\n",
|
" # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n",
|
||||||
" feature_cols = list(factor_definitions.keys())\n",
|
" feature_cols = selected_factors + list(factor_definitions.keys())\n",
|
||||||
"\n",
|
"\n",
|
||||||
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
|
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
|
||||||
|
" print(f\" - 来自 metadata: {len(selected_factors)}\")\n",
|
||||||
|
" print(f\" - 来自表达式: {len(factor_definitions)}\")\n",
|
||||||
" print(f\"Label: {list(label_factor.keys())[0]}\")\n",
|
" print(f\"Label: {list(label_factor.keys())[0]}\")\n",
|
||||||
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
|
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -251,7 +270,9 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" return results\n",
|
" return results\n",
|
||||||
"\n"
|
"\n"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 2
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@@ -263,77 +284,86 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2026-03-11T16:02:50.000875Z",
|
||||||
|
"start_time": "2026-03-11T16:02:49.994082Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"outputs": [],
|
|
||||||
"execution_count": null,
|
|
||||||
"source": [
|
"source": [
|
||||||
"# 特征因子定义字典(复用 regression.ipynb 的因子定义)\n",
|
"# 特征因子定义字典(复用 regression.ipynb 的因子定义)\n",
|
||||||
"LABEL_NAME = \"future_return_5_rank\"\n",
|
"LABEL_NAME = \"future_return_5_rank\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"FACTOR_DEFINITIONS = {\n",
|
"# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n",
|
||||||
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n",
|
"SELECTED_FACTORS = [\n",
|
||||||
" \"ma_5\": \"ts_mean(close, 5)\",\n",
|
" # ================= 1. 价格、趋势与路径依赖 =================\n",
|
||||||
" \"ma_20\": \"ts_mean(close, 20)\",\n",
|
" \"ma_5\",\n",
|
||||||
" \"ma_ratio_5_20\": \"ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1\",\n",
|
" \"ma_20\",\n",
|
||||||
" \"bias_10\": \"close / (ts_mean(close, 10) + 1e-8) - 1\",\n",
|
" \"ma_ratio_5_20\",\n",
|
||||||
" \"high_low_ratio\": \"(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)\",\n",
|
" \"bias_10\",\n",
|
||||||
" \"bbi_ratio\": \"(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)\",\n",
|
" \"high_low_ratio\",\n",
|
||||||
" \"return_5\": \"(close / (ts_delay(close, 5) + 1e-8)) - 1\",\n",
|
" \"bbi_ratio\",\n",
|
||||||
" \"return_20\": \"(close / (ts_delay(close, 20) + 1e-8)) - 1\",\n",
|
" \"return_5\",\n",
|
||||||
" \"kaufman_ER_20\": \"abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)\",\n",
|
" \"return_20\",\n",
|
||||||
" \"mom_acceleration_10_20\": \"(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)\",\n",
|
" \"kaufman_ER_20\",\n",
|
||||||
" \"drawdown_from_high_60\": \"close / (ts_max(high, 60) + 1e-8) - 1\",\n",
|
" \"mom_acceleration_10_20\",\n",
|
||||||
" \"up_days_ratio_20\": \"ts_sum(close > ts_delay(close, 1), 20) / 20\",\n",
|
" \"drawdown_from_high_60\",\n",
|
||||||
|
" \"up_days_ratio_20\",\n",
|
||||||
" # ================= 2. 波动率、风险调整与高阶矩 =================\n",
|
" # ================= 2. 波动率、风险调整与高阶矩 =================\n",
|
||||||
" \"volatility_5\": \"ts_std(close, 5)\",\n",
|
" \"volatility_5\",\n",
|
||||||
" \"volatility_20\": \"ts_std(close, 20)\",\n",
|
" \"volatility_20\",\n",
|
||||||
" \"volatility_ratio\": \"ts_std(close, 5) / (ts_std(close, 20) + 1e-8)\",\n",
|
" \"volatility_ratio\",\n",
|
||||||
" \"std_return_20\": \"ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)\",\n",
|
" \"std_return_20\",\n",
|
||||||
" \"sharpe_ratio_20\": \"ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)\",\n",
|
" \"sharpe_ratio_20\",\n",
|
||||||
" \"min_ret_20\": \"ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n",
|
" \"min_ret_20\",\n",
|
||||||
" \"volatility_squeeze_5_60\": \"ts_std(close, 5) / (ts_std(close, 60) + 1e-8)\",\n",
|
" \"volatility_squeeze_5_60\",\n",
|
||||||
" # ================= 3. 日内微观结构与异象 =================\n",
|
" # ================= 3. 日内微观结构与异象 =================\n",
|
||||||
" \"overnight_intraday_diff\": \"(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)\",\n",
|
" \"overnight_intraday_diff\",\n",
|
||||||
" \"upper_shadow_ratio\": \"(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)\",\n",
|
" \"upper_shadow_ratio\",\n",
|
||||||
" \"capital_retention_20\": \"ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)\",\n",
|
" \"capital_retention_20\",\n",
|
||||||
" \"max_ret_20\": \"ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n",
|
" \"max_ret_20\",\n",
|
||||||
" # ================= 4. 量能、流动性与量价背离 =================\n",
|
" # ================= 4. 量能、流动性与量价背离 =================\n",
|
||||||
" \"volume_ratio_5_20\": \"ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)\",\n",
|
" \"volume_ratio_5_20\",\n",
|
||||||
" \"turnover_rate_mean_5\": \"ts_mean(turnover_rate, 5)\",\n",
|
" \"turnover_rate_mean_5\",\n",
|
||||||
" \"turnover_deviation\": \"(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)\",\n",
|
" \"turnover_deviation\",\n",
|
||||||
" \"amihud_illiq_20\": \"ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)\",\n",
|
" \"amihud_illiq_20\",\n",
|
||||||
" \"turnover_cv_20\": \"ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)\",\n",
|
" \"turnover_cv_20\",\n",
|
||||||
" \"pv_corr_20\": \"ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)\",\n",
|
" \"pv_corr_20\",\n",
|
||||||
" \"close_vwap_deviation\": \"close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1\",\n",
|
" \"close_vwap_deviation\",\n",
|
||||||
" # ================= 5. 基本面财务特征 =================\n",
|
" # ================= 5. 基本面财务特征 =================\n",
|
||||||
" \"roe\": \"n_income / (total_hldr_eqy_exc_min_int + 1e-8)\",\n",
|
" \"roe\",\n",
|
||||||
" \"roa\": \"n_income / (total_assets + 1e-8)\",\n",
|
" \"roa\",\n",
|
||||||
" \"profit_margin\": \"n_income / (revenue + 1e-8)\",\n",
|
" \"profit_margin\",\n",
|
||||||
" \"debt_to_equity\": \"total_liab / (total_hldr_eqy_exc_min_int + 1e-8)\",\n",
|
" \"debt_to_equity\",\n",
|
||||||
" \"current_ratio\": \"total_cur_assets / (total_cur_liab + 1e-8)\",\n",
|
" \"current_ratio\",\n",
|
||||||
" \"net_profit_yoy\": \"(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1\",\n",
|
" \"net_profit_yoy\",\n",
|
||||||
" \"revenue_yoy\": \"(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1\",\n",
|
" \"revenue_yoy\",\n",
|
||||||
" \"healthy_expansion_velocity\": \"(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)\",\n",
|
" \"healthy_expansion_velocity\",\n",
|
||||||
" # ================= 6. 基本面估值与截面动量共振 =================\n",
|
" # ================= 6. 基本面估值与截面动量共振 =================\n",
|
||||||
" \"EP\": \"n_income / (total_mv * 10000 + 1e-8)\",\n",
|
" \"EP\",\n",
|
||||||
" \"BP\": \"total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)\",\n",
|
" \"BP\",\n",
|
||||||
" \"CP\": \"n_cashflow_act / (total_mv * 10000 + 1e-8)\",\n",
|
" \"CP\",\n",
|
||||||
" \"market_cap_rank\": \"cs_rank(total_mv)\",\n",
|
" \"market_cap_rank\",\n",
|
||||||
" \"turnover_rank\": \"cs_rank(turnover_rate)\",\n",
|
" \"turnover_rank\",\n",
|
||||||
" \"return_5_rank\": \"cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)\",\n",
|
" \"return_5_rank\",\n",
|
||||||
" \"EP_rank\": \"cs_rank(n_income / (total_mv + 1e-8))\",\n",
|
" \"EP_rank\",\n",
|
||||||
" \"pe_expansion_trend\": \"(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1\",\n",
|
" \"pe_expansion_trend\",\n",
|
||||||
" \"value_price_divergence\": \"cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))\",\n",
|
" \"value_price_divergence\",\n",
|
||||||
" \"active_market_cap\": \"total_mv * ts_mean(turnover_rate, 20)\",\n",
|
" \"active_market_cap\",\n",
|
||||||
" \"ebit_rank\": \"cs_rank(ebit)\",\n",
|
" \"ebit_rank\",\n",
|
||||||
"}\n",
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"# 因子定义字典(完整因子库)\n",
|
||||||
|
"FACTOR_DEFINITIONS = {\"turnover_volatility_ratio\": \"log(ts_std(turnover_rate, 20))\"}\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Label 因子定义(不参与训练,用于计算目标)\n",
|
"# Label 因子定义(不参与训练,用于计算目标)\n",
|
||||||
"LABEL_FACTOR = {\n",
|
"LABEL_FACTOR = {\n",
|
||||||
" LABEL_NAME: \"(ts_delay(close, -5) / ts_delay(open, -1)) - 1\",\n",
|
" LABEL_NAME: \"(ts_delay(close, -5) / ts_delay(open, -1)) - 1\",\n",
|
||||||
"}"
|
"}"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 3
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@@ -341,10 +371,13 @@
|
|||||||
"source": "### 3.2 训练参数配置"
|
"source": "### 3.2 训练参数配置"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2026-03-11T16:02:50.009081Z",
|
||||||
|
"start_time": "2026-03-11T16:02:50.005330Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"outputs": [],
|
|
||||||
"execution_count": null,
|
|
||||||
"source": [
|
"source": [
|
||||||
"# 日期范围配置(正确的 train/val/test 三分法)\n",
|
"# 日期范围配置(正确的 train/val/test 三分法)\n",
|
||||||
"TRAIN_START = \"20200101\"\n",
|
"TRAIN_START = \"20200101\"\n",
|
||||||
@@ -377,7 +410,7 @@
|
|||||||
"N_QUANTILES = 20 # 将 label 分为 20 组\n",
|
"N_QUANTILES = 20 # 将 label 分为 20 组\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 特征列(用于数据处理器)\n",
|
"# 特征列(用于数据处理器)\n",
|
||||||
"FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())\n",
|
"FEATURE_COLS = SELECTED_FACTORS\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 数据处理器配置\n",
|
"# 数据处理器配置\n",
|
||||||
"PROCESSORS = [\n",
|
"PROCESSORS = [\n",
|
||||||
@@ -421,7 +454,9 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"# Top N 配置:每日推荐股票数量\n",
|
"# Top N 配置:每日推荐股票数量\n",
|
||||||
"TOP_N = 5 # 可调整为 10, 20 等"
|
"TOP_N = 5 # 可调整为 10, 20 等"
|
||||||
]
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 4
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@@ -429,10 +464,13 @@
|
|||||||
"source": "## 4. 训练流程"
|
"source": "## 4. 训练流程"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2026-03-11T16:02:50.330018Z",
|
||||||
|
"start_time": "2026-03-11T16:02:50.012964Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"outputs": [],
|
|
||||||
"execution_count": null,
|
|
||||||
"source": [
|
"source": [
|
||||||
"print(\"\\n\" + \"=\" * 80)\n",
|
"print(\"\\n\" + \"=\" * 80)\n",
|
||||||
"print(\"LightGBM LambdaRank 排序学习训练\")\n",
|
"print(\"LightGBM LambdaRank 排序学习训练\")\n",
|
||||||
@@ -444,7 +482,9 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"# 2. 使用 metadata 定义因子\n",
|
"# 2. 使用 metadata 定义因子\n",
|
||||||
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
|
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
|
||||||
"feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n",
|
"feature_cols = create_factors_with_metadata(\n",
|
||||||
|
" engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 3. 准备数据\n",
|
"# 3. 准备数据\n",
|
||||||
"print(\"\\n[3] 准备数据\")\n",
|
"print(\"\\n[3] 准备数据\")\n",
|
||||||
@@ -507,7 +547,49 @@
|
|||||||
" feature_cols=feature_cols,\n",
|
" feature_cols=feature_cols,\n",
|
||||||
" persist_model=PERSIST_MODEL,\n",
|
" persist_model=PERSIST_MODEL,\n",
|
||||||
")"
|
")"
|
||||||
]
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
"================================================================================\n",
|
||||||
|
"LightGBM LambdaRank 排序学习训练\n",
|
||||||
|
"================================================================================\n",
|
||||||
|
"\n",
|
||||||
|
"[1] 创建 FactorEngine\n",
|
||||||
|
"\n",
|
||||||
|
"[2] 定义因子(从 metadata 注册)\n",
|
||||||
|
"================================================================================\n",
|
||||||
|
"注册因子\n",
|
||||||
|
"================================================================================\n",
|
||||||
|
"\n",
|
||||||
|
"注册特征因子(从 metadata):\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ename": "QueryError",
|
||||||
|
"evalue": "查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n ",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001B[31m---------------------------------------------------------------------------\u001B[39m",
|
||||||
|
"\u001B[31mBinderException\u001B[39m Traceback (most recent call last)",
|
||||||
|
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:296\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 295\u001B[39m conn = \u001B[38;5;28mself\u001B[39m._get_connection()\n\u001B[32m--> \u001B[39m\u001B[32m296\u001B[39m result = \u001B[43mconn\u001B[49m\u001B[43m.\u001B[49m\u001B[43mexecute\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m.pl()\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n",
|
||||||
|
"\u001B[31mBinderException\u001B[39m: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^",
|
||||||
|
"\nDuring handling of the above exception, another exception occurred:\n",
|
||||||
|
"\u001B[31mQueryError\u001B[39m Traceback (most recent call last)",
|
||||||
|
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[5]\u001B[39m\u001B[32m, line 11\u001B[39m\n\u001B[32m 9\u001B[39m \u001B[38;5;66;03m# 2. 使用 metadata 定义因子\u001B[39;00m\n\u001B[32m 10\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[2] 定义因子(从 metadata 注册)\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m---> \u001B[39m\u001B[32m11\u001B[39m feature_cols = \u001B[43mcreate_factors_with_metadata\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 12\u001B[39m \u001B[43m \u001B[49m\u001B[43mengine\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mSELECTED_FACTORS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mFACTOR_DEFINITIONS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mLABEL_FACTOR\u001B[49m\n\u001B[32m 13\u001B[39m \u001B[43m)\u001B[49m\n\u001B[32m 15\u001B[39m \u001B[38;5;66;03m# 3. 准备数据\u001B[39;00m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[3] 准备数据\u001B[39m\u001B[33m\"\u001B[39m)\n",
|
||||||
|
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 15\u001B[39m, in \u001B[36mcreate_factors_with_metadata\u001B[39m\u001B[34m(engine, selected_factors, factor_definitions, label_factor)\u001B[39m\n\u001B[32m 13\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m注册特征因子(从 metadata):\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 14\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m name \u001B[38;5;129;01min\u001B[39;00m selected_factors:\n\u001B[32m---> \u001B[39m\u001B[32m15\u001B[39m \u001B[43mengine\u001B[49m\u001B[43m.\u001B[49m\u001B[43madd_factor\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m - \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 18\u001B[39m \u001B[38;5;66;03m# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\u001B[39;00m\n",
|
||||||
|
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:225\u001B[39m, in \u001B[36mFactorEngine.add_factor\u001B[39m\u001B[34m(self, name, expression, data_specs)\u001B[39m\n\u001B[32m 182\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"注册因子(支持多种调用方式)。\u001B[39;00m\n\u001B[32m 183\u001B[39m \n\u001B[32m 184\u001B[39m \u001B[33;03m这是 register 方法的增强版,支持以下调用方式:\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 221\u001B[39m \u001B[33;03m ... .add_factor(\"golden_cross\", \"ma5 > ma10\"))\u001B[39;00m\n\u001B[32m 222\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 223\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m expression \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 224\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询表达式\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m225\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_add_factor_from_metadata\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdata_specs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 227\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(expression, \u001B[38;5;28mstr\u001B[39m):\n\u001B[32m 228\u001B[39m \u001B[38;5;66;03m# Fail-Fast:立即解析,失败立即报错\u001B[39;00m\n\u001B[32m 229\u001B[39m node = \u001B[38;5;28mself\u001B[39m._parser.parse(expression)\n",
|
||||||
|
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:159\u001B[39m, in \u001B[36mFactorEngine._add_factor_from_metadata\u001B[39m\u001B[34m(self, name, factor_name_in_metadata, data_specs)\u001B[39m\n\u001B[32m 153\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mRuntimeError\u001B[39;00m(\n\u001B[32m 154\u001B[39m \u001B[33m\"\u001B[39m\u001B[33m引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 155\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m例如:FactorEngine(metadata_path=\u001B[39m\u001B[33m'\u001B[39m\u001B[33mdata/factors.jsonl\u001B[39m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 156\u001B[39m )\n\u001B[32m 158\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询因子\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m159\u001B[39m df = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_metadata\u001B[49m\u001B[43m.\u001B[49m\u001B[43mget_factors_by_name\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfactor_name_in_metadata\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 161\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(df) == \u001B[32m0\u001B[39m:\n\u001B[32m 162\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[32m 163\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m在 metadata 中未找到因子 \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfactor_name_in_metadata\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 164\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m请确认因子名称正确,或先使用 FactorManager 添加该因子。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 165\u001B[39m )\n",
|
||||||
|
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:177\u001B[39m, in \u001B[36mFactorManager.get_factors_by_name\u001B[39m\u001B[34m(self, name)\u001B[39m\n\u001B[32m 154\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"根据名称查询因子。\u001B[39;00m\n\u001B[32m 155\u001B[39m \n\u001B[32m 156\u001B[39m \u001B[33;03m使用DuckDB执行SQL查询,返回Polars DataFrame。\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 170\u001B[39m \u001B[33;03m ... print(df[\"dsl\"][0])\u001B[39;00m\n\u001B[32m 171\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 172\u001B[39m sql = \u001B[33mf\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m 173\u001B[39m \u001B[33m SELECT *\u001B[39m\n\u001B[32m 174\u001B[39m \u001B[33m FROM read_json_auto(\u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mself\u001B[39m.filepath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\n\u001B[32m 175\u001B[39m \u001B[33m WHERE name = \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\n\u001B[32m 176\u001B[39m \u001B[33m\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m--> \u001B[39m\u001B[32m177\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_execute_query\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m\n",
|
||||||
|
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:299\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n\u001B[32m 298\u001B[39m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[32m--> \u001B[39m\u001B[32m299\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m QueryError(sql, e)\n",
|
||||||
|
"\u001B[31mQueryError\u001B[39m: 查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n "
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"execution_count": 5
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
|||||||
@@ -40,29 +40,40 @@ from src.training.config import TrainingConfig
|
|||||||
# ## 2. 辅助函数
|
# ## 2. 辅助函数
|
||||||
# %%
|
# %%
|
||||||
def create_factors_with_metadata(
|
def create_factors_with_metadata(
|
||||||
engine: FactorEngine, factor_definitions: dict, label_factor: dict
|
engine: FactorEngine,
|
||||||
|
selected_factors: List[str],
|
||||||
|
factor_definitions: dict,
|
||||||
|
label_factor: dict,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)"""
|
"""注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)"""
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
print("使用 metadata 注册因子")
|
print("注册因子")
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
||||||
# 注册所有特征因子(通过 metadata 名称)
|
# 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)
|
||||||
print("\n注册特征因子(从 metadata):")
|
print("\n注册特征因子(从 metadata):")
|
||||||
for name in factor_definitions.keys():
|
for name in selected_factors:
|
||||||
engine.add_factor_by_name(name)
|
engine.add_factor(name)
|
||||||
print(f" - {name}")
|
print(f" - {name}")
|
||||||
|
|
||||||
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中)
|
# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
|
||||||
|
print("\n注册特征因子(表达式):")
|
||||||
|
for name, expr in factor_definitions.items():
|
||||||
|
engine.add_factor(name, expr)
|
||||||
|
print(f" - {name}: {expr}")
|
||||||
|
|
||||||
|
# 注册 label 因子(通过表达式)
|
||||||
print("\n注册 Label 因子(表达式):")
|
print("\n注册 Label 因子(表达式):")
|
||||||
for name, expr in label_factor.items():
|
for name, expr in label_factor.items():
|
||||||
engine.add_factor(name, expr)
|
engine.add_factor(name, expr)
|
||||||
print(f" - {name}: {expr}")
|
print(f" - {name}: {expr}")
|
||||||
|
|
||||||
# 从字典自动获取特征列
|
# 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
|
||||||
feature_cols = list(factor_definitions.keys())
|
feature_cols = selected_factors + list(factor_definitions.keys())
|
||||||
|
|
||||||
print(f"\n特征因子数: {len(feature_cols)}")
|
print(f"\n特征因子数: {len(feature_cols)}")
|
||||||
|
print(f" - 来自 metadata: {len(selected_factors)}")
|
||||||
|
print(f" - 来自表达式: {len(factor_definitions)}")
|
||||||
print(f"Label: {list(label_factor.keys())[0]}")
|
print(f"Label: {list(label_factor.keys())[0]}")
|
||||||
print(f"已注册因子总数: {len(engine.list_registered())}")
|
print(f"已注册因子总数: {len(engine.list_registered())}")
|
||||||
|
|
||||||
@@ -236,62 +247,68 @@ def evaluate_ndcg_at_k(
|
|||||||
# 特征因子定义字典(复用 regression.ipynb 的因子定义)
|
# 特征因子定义字典(复用 regression.ipynb 的因子定义)
|
||||||
LABEL_NAME = "future_return_5_rank"
|
LABEL_NAME = "future_return_5_rank"
|
||||||
|
|
||||||
FACTOR_DEFINITIONS = {
|
# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)
|
||||||
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
|
SELECTED_FACTORS = [
|
||||||
"ma_5": "ts_mean(close, 5)",
|
# ================= 1. 价格、趋势与路径依赖 =================
|
||||||
"ma_20": "ts_mean(close, 20)",
|
"ma_5",
|
||||||
"ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1",
|
"ma_20",
|
||||||
"bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1",
|
"ma_ratio_5_20",
|
||||||
"high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)",
|
"bias_10",
|
||||||
"bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)",
|
"high_low_ratio",
|
||||||
"return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1",
|
"bbi_ratio",
|
||||||
"return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1",
|
"return_5",
|
||||||
"kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)",
|
"return_20",
|
||||||
"mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)",
|
"kaufman_ER_20",
|
||||||
"drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1",
|
"mom_acceleration_10_20",
|
||||||
"up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20",
|
"drawdown_from_high_60",
|
||||||
|
"up_days_ratio_20",
|
||||||
# ================= 2. 波动率、风险调整与高阶矩 =================
|
# ================= 2. 波动率、风险调整与高阶矩 =================
|
||||||
"volatility_5": "ts_std(close, 5)",
|
"volatility_5",
|
||||||
"volatility_20": "ts_std(close, 20)",
|
"volatility_20",
|
||||||
"volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)",
|
"volatility_ratio",
|
||||||
"std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)",
|
"std_return_20",
|
||||||
"sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)",
|
"sharpe_ratio_20",
|
||||||
"min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
|
"min_ret_20",
|
||||||
"volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)",
|
"volatility_squeeze_5_60",
|
||||||
# ================= 3. 日内微观结构与异象 =================
|
# ================= 3. 日内微观结构与异象 =================
|
||||||
"overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)",
|
"overnight_intraday_diff",
|
||||||
"upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)",
|
"upper_shadow_ratio",
|
||||||
"capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)",
|
"capital_retention_20",
|
||||||
"max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
|
"max_ret_20",
|
||||||
# ================= 4. 量能、流动性与量价背离 =================
|
# ================= 4. 量能、流动性与量价背离 =================
|
||||||
"volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)",
|
"volume_ratio_5_20",
|
||||||
"turnover_rate_mean_5": "ts_mean(turnover_rate, 5)",
|
"turnover_rate_mean_5",
|
||||||
"turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)",
|
"turnover_deviation",
|
||||||
"amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)",
|
"amihud_illiq_20",
|
||||||
"turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)",
|
"turnover_cv_20",
|
||||||
"pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)",
|
"pv_corr_20",
|
||||||
"close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1",
|
"close_vwap_deviation",
|
||||||
# ================= 5. 基本面财务特征 =================
|
# ================= 5. 基本面财务特征 =================
|
||||||
"roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)",
|
"roe",
|
||||||
"roa": "n_income / (total_assets + 1e-8)",
|
"roa",
|
||||||
"profit_margin": "n_income / (revenue + 1e-8)",
|
"profit_margin",
|
||||||
"debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)",
|
"debt_to_equity",
|
||||||
"current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)",
|
"current_ratio",
|
||||||
"net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1",
|
"net_profit_yoy",
|
||||||
"revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1",
|
"revenue_yoy",
|
||||||
"healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)",
|
"healthy_expansion_velocity",
|
||||||
# ================= 6. 基本面估值与截面动量共振 =================
|
# ================= 6. 基本面估值与截面动量共振 =================
|
||||||
"EP": "n_income / (total_mv * 10000 + 1e-8)",
|
"EP",
|
||||||
"BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)",
|
"BP",
|
||||||
"CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)",
|
"CP",
|
||||||
"market_cap_rank": "cs_rank(total_mv)",
|
"market_cap_rank",
|
||||||
"turnover_rank": "cs_rank(turnover_rate)",
|
"turnover_rank",
|
||||||
"return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)",
|
"return_5_rank",
|
||||||
"EP_rank": "cs_rank(n_income / (total_mv + 1e-8))",
|
"EP_rank",
|
||||||
"pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1",
|
"pe_expansion_trend",
|
||||||
"value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))",
|
"value_price_divergence",
|
||||||
"active_market_cap": "total_mv * ts_mean(turnover_rate, 20)",
|
"active_market_cap",
|
||||||
"ebit_rank": "cs_rank(ebit)",
|
"ebit_rank",
|
||||||
|
]
|
||||||
|
|
||||||
|
# 因子定义字典(完整因子库)
|
||||||
|
FACTOR_DEFINITIONS = {
|
||||||
|
# "turnover_volatility_ratio": "log(ts_std(turnover_rate, 20))"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Label 因子定义(不参与训练,用于计算目标)
|
# Label 因子定义(不参与训练,用于计算目标)
|
||||||
@@ -332,7 +349,7 @@ MODEL_PARAMS = {
|
|||||||
N_QUANTILES = 20 # 将 label 分为 20 组
|
N_QUANTILES = 20 # 将 label 分为 20 组
|
||||||
|
|
||||||
# 特征列(用于数据处理器)
|
# 特征列(用于数据处理器)
|
||||||
FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())
|
FEATURE_COLS = SELECTED_FACTORS
|
||||||
|
|
||||||
# 数据处理器配置
|
# 数据处理器配置
|
||||||
PROCESSORS = [
|
PROCESSORS = [
|
||||||
@@ -385,11 +402,13 @@ print("=" * 80)
|
|||||||
|
|
||||||
# 1. 创建 FactorEngine(启用 metadata 功能)
|
# 1. 创建 FactorEngine(启用 metadata 功能)
|
||||||
print("\n[1] 创建 FactorEngine")
|
print("\n[1] 创建 FactorEngine")
|
||||||
engine = FactorEngine(metadata_path="data/factors.jsonl")
|
engine = FactorEngine()
|
||||||
|
|
||||||
# 2. 使用 metadata 定义因子
|
# 2. 使用 metadata 定义因子
|
||||||
print("\n[2] 定义因子(从 metadata 注册)")
|
print("\n[2] 定义因子(从 metadata 注册)")
|
||||||
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
|
feature_cols = create_factors_with_metadata(
|
||||||
|
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
|
||||||
|
)
|
||||||
|
|
||||||
# 3. 准备数据
|
# 3. 准备数据
|
||||||
print("\n[3] 准备数据")
|
print("\n[3] 准备数据")
|
||||||
|
|||||||
@@ -47,29 +47,40 @@
|
|||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"source": [
|
"source": [
|
||||||
"def create_factors_with_metadata(\n",
|
"def create_factors_with_metadata(\n",
|
||||||
" engine: FactorEngine, factor_definitions: dict, label_factor: dict\n",
|
" engine: FactorEngine,\n",
|
||||||
|
" selected_factors: List[str],\n",
|
||||||
|
" factor_definitions: dict,\n",
|
||||||
|
" label_factor: dict,\n",
|
||||||
") -> List[str]:\n",
|
") -> List[str]:\n",
|
||||||
" \"\"\"使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)\"\"\"\n",
|
" \"\"\"注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n",
|
||||||
" print(\"=\" * 80)\n",
|
" print(\"=\" * 80)\n",
|
||||||
" print(\"使用 metadata 注册因子\")\n",
|
" print(\"注册因子\")\n",
|
||||||
" print(\"=\" * 80)\n",
|
" print(\"=\" * 80)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # 注册所有特征因子(通过 metadata 名称)\n",
|
" # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)\n",
|
||||||
" print(\"\\n注册特征因子(从 metadata):\")\n",
|
" print(\"\\n注册特征因子(从 metadata):\")\n",
|
||||||
" for name in factor_definitions.keys():\n",
|
" for name in selected_factors:\n",
|
||||||
" engine.add_factor_by_name(name)\n",
|
" engine.add_factor(name)\n",
|
||||||
" print(f\" - {name}\")\n",
|
" print(f\" - {name}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n",
|
" # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n",
|
||||||
|
" print(\"\\n注册特征因子(表达式):\")\n",
|
||||||
|
" for name, expr in factor_definitions.items():\n",
|
||||||
|
" engine.add_factor(name, expr)\n",
|
||||||
|
" print(f\" - {name}: {expr}\")\n",
|
||||||
|
"\n",
|
||||||
|
" # 注册 label 因子(通过表达式)\n",
|
||||||
" print(\"\\n注册 Label 因子(表达式):\")\n",
|
" print(\"\\n注册 Label 因子(表达式):\")\n",
|
||||||
" for name, expr in label_factor.items():\n",
|
" for name, expr in label_factor.items():\n",
|
||||||
" engine.add_factor(name, expr)\n",
|
" engine.add_factor(name, expr)\n",
|
||||||
" print(f\" - {name}: {expr}\")\n",
|
" print(f\" - {name}: {expr}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # 从字典自动获取特征列\n",
|
" # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n",
|
||||||
" feature_cols = list(factor_definitions.keys())\n",
|
" feature_cols = selected_factors + list(factor_definitions.keys())\n",
|
||||||
"\n",
|
"\n",
|
||||||
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
|
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
|
||||||
|
" print(f\" - 来自 metadata: {len(selected_factors)}\")\n",
|
||||||
|
" print(f\" - 来自表达式: {len(factor_definitions)}\")\n",
|
||||||
" print(f\"Label: {list(label_factor.keys())[0]}\")\n",
|
" print(f\"Label: {list(label_factor.keys())[0]}\")\n",
|
||||||
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
|
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -123,7 +134,67 @@
|
|||||||
"# 特征因子定义字典:新增因子只需在此处添加一行\n",
|
"# 特征因子定义字典:新增因子只需在此处添加一行\n",
|
||||||
"LABEL_NAME = \"future_return_5\"\n",
|
"LABEL_NAME = \"future_return_5\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"FACTOR_DEFINITIONS = FACTOR_DICT = {\n",
|
"# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n",
|
||||||
|
"SELECTED_FACTORS = [\n",
|
||||||
|
" # ================= 1. 价格、趋势与路径依赖 =================\n",
|
||||||
|
" \"ma_5\",\n",
|
||||||
|
" \"ma_20\",\n",
|
||||||
|
" \"ma_ratio_5_20\",\n",
|
||||||
|
" \"bias_10\",\n",
|
||||||
|
" \"high_low_ratio\",\n",
|
||||||
|
" \"bbi_ratio\",\n",
|
||||||
|
" \"return_5\",\n",
|
||||||
|
" \"return_20\",\n",
|
||||||
|
" \"kaufman_ER_20\",\n",
|
||||||
|
" \"mom_acceleration_10_20\",\n",
|
||||||
|
" \"drawdown_from_high_60\",\n",
|
||||||
|
" \"up_days_ratio_20\",\n",
|
||||||
|
" # ================= 2. 波动率、风险调整与高阶矩 =================\n",
|
||||||
|
" \"volatility_5\",\n",
|
||||||
|
" \"volatility_20\",\n",
|
||||||
|
" \"volatility_ratio\",\n",
|
||||||
|
" \"std_return_20\",\n",
|
||||||
|
" \"sharpe_ratio_20\",\n",
|
||||||
|
" \"min_ret_20\",\n",
|
||||||
|
" \"volatility_squeeze_5_60\",\n",
|
||||||
|
" # ================= 3. 日内微观结构与异象 =================\n",
|
||||||
|
" \"overnight_intraday_diff\",\n",
|
||||||
|
" \"upper_shadow_ratio\",\n",
|
||||||
|
" \"capital_retention_20\",\n",
|
||||||
|
" \"max_ret_20\",\n",
|
||||||
|
" # ================= 4. 量能、流动性与量价背离 =================\n",
|
||||||
|
" \"volume_ratio_5_20\",\n",
|
||||||
|
" \"turnover_rate_mean_5\",\n",
|
||||||
|
" \"turnover_deviation\",\n",
|
||||||
|
" \"amihud_illiq_20\",\n",
|
||||||
|
" \"turnover_cv_20\",\n",
|
||||||
|
" \"pv_corr_20\",\n",
|
||||||
|
" \"close_vwap_deviation\",\n",
|
||||||
|
" # ================= 5. 基本面财务特征 =================\n",
|
||||||
|
" \"roe\",\n",
|
||||||
|
" \"roa\",\n",
|
||||||
|
" \"profit_margin\",\n",
|
||||||
|
" \"debt_to_equity\",\n",
|
||||||
|
" \"current_ratio\",\n",
|
||||||
|
" \"net_profit_yoy\",\n",
|
||||||
|
" \"revenue_yoy\",\n",
|
||||||
|
" \"healthy_expansion_velocity\",\n",
|
||||||
|
" # ================= 6. 基本面估值与截面动量共振 =================\n",
|
||||||
|
" \"EP\",\n",
|
||||||
|
" \"BP\",\n",
|
||||||
|
" \"CP\",\n",
|
||||||
|
" \"market_cap_rank\",\n",
|
||||||
|
" \"turnover_rank\",\n",
|
||||||
|
" \"return_5_rank\",\n",
|
||||||
|
" \"EP_rank\",\n",
|
||||||
|
" \"pe_expansion_trend\",\n",
|
||||||
|
" \"value_price_divergence\",\n",
|
||||||
|
" \"active_market_cap\",\n",
|
||||||
|
" \"ebit_rank\",\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"# 因子定义字典(完整因子库)\n",
|
||||||
|
"FACTOR_DEFINITIONS = {\n",
|
||||||
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n",
|
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n",
|
||||||
" \"ma_5\": \"ts_mean(close, 5)\",\n",
|
" \"ma_5\": \"ts_mean(close, 5)\",\n",
|
||||||
" \"ma_20\": \"ts_mean(close, 20)\",\n",
|
" \"ma_20\": \"ts_mean(close, 20)\",\n",
|
||||||
@@ -338,7 +409,9 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"# 2. 使用 metadata 定义因子\n",
|
"# 2. 使用 metadata 定义因子\n",
|
||||||
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
|
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
|
||||||
"feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n",
|
"feature_cols = create_factors_with_metadata(\n",
|
||||||
|
" engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n",
|
||||||
|
")\n",
|
||||||
"target_col = LABEL_NAME\n",
|
"target_col = LABEL_NAME\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# 3. 准备数据(使用模块级别的日期配置)\n",
|
"# 3. 准备数据(使用模块级别的日期配置)\n",
|
||||||
|
|||||||
@@ -26,29 +26,40 @@ from src.training.config import TrainingConfig
|
|||||||
# ## 2. 定义辅助函数
|
# ## 2. 定义辅助函数
|
||||||
# %%
|
# %%
|
||||||
def create_factors_with_metadata(
|
def create_factors_with_metadata(
|
||||||
engine: FactorEngine, factor_definitions: dict, label_factor: dict
|
engine: FactorEngine,
|
||||||
|
selected_factors: List[str],
|
||||||
|
factor_definitions: dict,
|
||||||
|
label_factor: dict,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)"""
|
"""注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)"""
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
print("使用 metadata 注册因子")
|
print("注册因子")
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
||||||
# 注册所有特征因子(通过 metadata 名称)
|
# 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)
|
||||||
print("\n注册特征因子(从 metadata):")
|
print("\n注册特征因子(从 metadata):")
|
||||||
for name in factor_definitions.keys():
|
for name in selected_factors:
|
||||||
engine.add_factor_by_name(name)
|
engine.add_factor(name)
|
||||||
print(f" - {name}")
|
print(f" - {name}")
|
||||||
|
|
||||||
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中)
|
# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
|
||||||
|
print("\n注册特征因子(表达式):")
|
||||||
|
for name, expr in factor_definitions.items():
|
||||||
|
engine.add_factor(name, expr)
|
||||||
|
print(f" - {name}: {expr}")
|
||||||
|
|
||||||
|
# 注册 label 因子(通过表达式)
|
||||||
print("\n注册 Label 因子(表达式):")
|
print("\n注册 Label 因子(表达式):")
|
||||||
for name, expr in label_factor.items():
|
for name, expr in label_factor.items():
|
||||||
engine.add_factor(name, expr)
|
engine.add_factor(name, expr)
|
||||||
print(f" - {name}: {expr}")
|
print(f" - {name}: {expr}")
|
||||||
|
|
||||||
# 从字典自动获取特征列
|
# 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
|
||||||
feature_cols = list(factor_definitions.keys())
|
feature_cols = selected_factors + list(factor_definitions.keys())
|
||||||
|
|
||||||
print(f"\n特征因子数: {len(feature_cols)}")
|
print(f"\n特征因子数: {len(feature_cols)}")
|
||||||
|
print(f" - 来自 metadata: {len(selected_factors)}")
|
||||||
|
print(f" - 来自表达式: {len(factor_definitions)}")
|
||||||
print(f"Label: {list(label_factor.keys())[0]}")
|
print(f"Label: {list(label_factor.keys())[0]}")
|
||||||
print(f"已注册因子总数: {len(engine.list_registered())}")
|
print(f"已注册因子总数: {len(engine.list_registered())}")
|
||||||
|
|
||||||
@@ -91,7 +102,67 @@ def prepare_data(
|
|||||||
# 特征因子定义字典:新增因子只需在此处添加一行
|
# 特征因子定义字典:新增因子只需在此处添加一行
|
||||||
LABEL_NAME = "future_return_5"
|
LABEL_NAME = "future_return_5"
|
||||||
|
|
||||||
FACTOR_DEFINITIONS = FACTOR_DICT = {
|
# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)
|
||||||
|
SELECTED_FACTORS = [
|
||||||
|
# ================= 1. 价格、趋势与路径依赖 =================
|
||||||
|
"ma_5",
|
||||||
|
"ma_20",
|
||||||
|
"ma_ratio_5_20",
|
||||||
|
"bias_10",
|
||||||
|
"high_low_ratio",
|
||||||
|
"bbi_ratio",
|
||||||
|
"return_5",
|
||||||
|
"return_20",
|
||||||
|
"kaufman_ER_20",
|
||||||
|
"mom_acceleration_10_20",
|
||||||
|
"drawdown_from_high_60",
|
||||||
|
"up_days_ratio_20",
|
||||||
|
# ================= 2. 波动率、风险调整与高阶矩 =================
|
||||||
|
"volatility_5",
|
||||||
|
"volatility_20",
|
||||||
|
"volatility_ratio",
|
||||||
|
"std_return_20",
|
||||||
|
"sharpe_ratio_20",
|
||||||
|
"min_ret_20",
|
||||||
|
"volatility_squeeze_5_60",
|
||||||
|
# ================= 3. 日内微观结构与异象 =================
|
||||||
|
"overnight_intraday_diff",
|
||||||
|
"upper_shadow_ratio",
|
||||||
|
"capital_retention_20",
|
||||||
|
"max_ret_20",
|
||||||
|
# ================= 4. 量能、流动性与量价背离 =================
|
||||||
|
"volume_ratio_5_20",
|
||||||
|
"turnover_rate_mean_5",
|
||||||
|
"turnover_deviation",
|
||||||
|
"amihud_illiq_20",
|
||||||
|
"turnover_cv_20",
|
||||||
|
"pv_corr_20",
|
||||||
|
"close_vwap_deviation",
|
||||||
|
# ================= 5. 基本面财务特征 =================
|
||||||
|
"roe",
|
||||||
|
"roa",
|
||||||
|
"profit_margin",
|
||||||
|
"debt_to_equity",
|
||||||
|
"current_ratio",
|
||||||
|
"net_profit_yoy",
|
||||||
|
"revenue_yoy",
|
||||||
|
"healthy_expansion_velocity",
|
||||||
|
# ================= 6. 基本面估值与截面动量共振 =================
|
||||||
|
"EP",
|
||||||
|
"BP",
|
||||||
|
"CP",
|
||||||
|
"market_cap_rank",
|
||||||
|
"turnover_rank",
|
||||||
|
"return_5_rank",
|
||||||
|
"EP_rank",
|
||||||
|
"pe_expansion_trend",
|
||||||
|
"value_price_divergence",
|
||||||
|
"active_market_cap",
|
||||||
|
"ebit_rank",
|
||||||
|
]
|
||||||
|
|
||||||
|
# 因子定义字典(完整因子库)
|
||||||
|
FACTOR_DEFINITIONS = {
|
||||||
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
|
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
|
||||||
"ma_5": "ts_mean(close, 5)",
|
"ma_5": "ts_mean(close, 5)",
|
||||||
"ma_20": "ts_mean(close, 20)",
|
"ma_20": "ts_mean(close, 20)",
|
||||||
@@ -284,7 +355,9 @@ engine = FactorEngine(metadata_path="data/factors.jsonl")
|
|||||||
|
|
||||||
# 2. 使用 metadata 定义因子
|
# 2. 使用 metadata 定义因子
|
||||||
print("\n[2] 定义因子(从 metadata 注册)")
|
print("\n[2] 定义因子(从 metadata 注册)")
|
||||||
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
|
feature_cols = create_factors_with_metadata(
|
||||||
|
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
|
||||||
|
)
|
||||||
target_col = LABEL_NAME
|
target_col = LABEL_NAME
|
||||||
|
|
||||||
# 3. 准备数据(使用模块级别的日期配置)
|
# 3. 准备数据(使用模块级别的日期配置)
|
||||||
|
|||||||
@@ -81,12 +81,16 @@ class FactorEngine:
|
|||||||
self._registry = registry if registry is not None else FunctionRegistry()
|
self._registry = registry if registry is not None else FunctionRegistry()
|
||||||
self._parser = FormulaParser(self._registry)
|
self._parser = FormulaParser(self._registry)
|
||||||
|
|
||||||
# 初始化 metadata 管理器(可选)
|
# 初始化 metadata 管理器(可选,默认启用)
|
||||||
self._metadata: Optional["FactorManager"] = None
|
|
||||||
if metadata_path is not None:
|
if metadata_path is not None:
|
||||||
from src.factors.metadata import FactorManager
|
from src.factors.metadata import FactorManager
|
||||||
|
|
||||||
self._metadata = FactorManager(metadata_path)
|
self._metadata = FactorManager(metadata_path)
|
||||||
|
else:
|
||||||
|
# 使用 FactorManager 的默认路径
|
||||||
|
from src.factors.metadata import FactorManager
|
||||||
|
|
||||||
|
self._metadata = FactorManager()
|
||||||
|
|
||||||
def register(
|
def register(
|
||||||
self,
|
self,
|
||||||
@@ -128,22 +132,68 @@ class FactorEngine:
|
|||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def _add_factor_from_metadata(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
factor_name_in_metadata: str,
|
||||||
|
data_specs: Optional[List[DataSpec]] = None,
|
||||||
|
) -> "FactorEngine":
|
||||||
|
"""从 metadata 中查询并注册因子(内部方法)。
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: 要注册的因子名称(引擎中使用的名称)
|
||||||
|
factor_name_in_metadata: metadata 中的因子名称
|
||||||
|
data_specs: 可选的数据规格
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
self,支持链式调用
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: 当引擎未配置 metadata 路径时
|
||||||
|
ValueError: 当在 metadata 中未找到因子时
|
||||||
|
FormulaParseError: 当 DSL 表达式解析失败时
|
||||||
|
"""
|
||||||
|
if self._metadata is None:
|
||||||
|
raise RuntimeError(
|
||||||
|
"引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,"
|
||||||
|
+ "例如:FactorEngine(metadata_path='data/factors.jsonl')"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 从 metadata 查询因子
|
||||||
|
df = self._metadata.get_factors_by_name(factor_name_in_metadata)
|
||||||
|
|
||||||
|
if len(df) == 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"在 metadata 中未找到因子 '{factor_name_in_metadata}'。"
|
||||||
|
+ "请确认因子名称正确,或先使用 FactorManager 添加该因子。"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 获取 DSL 表达式
|
||||||
|
dsl_expr = df["dsl"][0]
|
||||||
|
|
||||||
|
# 解析表达式为 Node
|
||||||
|
node = self._parser.parse(dsl_expr)
|
||||||
|
|
||||||
|
# 委托给 register 方法
|
||||||
|
return self.register(name, node, data_specs)
|
||||||
|
|
||||||
def add_factor(
|
def add_factor(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
expression: Union[str, Node],
|
expression: Optional[Union[str, Node]] = None,
|
||||||
data_specs: Optional[List[DataSpec]] = None,
|
data_specs: Optional[List[DataSpec]] = None,
|
||||||
) -> "FactorEngine":
|
) -> "FactorEngine":
|
||||||
"""注册因子(支持字符串或 Node 表达式)。
|
"""注册因子(支持多种调用方式)。
|
||||||
|
|
||||||
这是 register 方法的增强版,支持字符串表达式解析。
|
这是 register 方法的增强版,支持以下调用方式:
|
||||||
向后兼容:register 方法保持不变,继续只接受 Node 类型。
|
1. 传入 name 和 expression:直接注册表达式(字符串或 Node)
|
||||||
|
2. 只传入 name:从 metadata 中查询表达式并注册
|
||||||
|
|
||||||
遵循 Fail-Fast 原则:字符串表达式会立即解析,失败时立即抛出异常。
|
遵循 Fail-Fast 原则:字符串表达式会立即解析,失败时立即抛出异常。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
name: 因子名称
|
name: 因子名称(引擎中使用的名称)
|
||||||
expression: 字符串表达式或 Node 对象
|
expression: 字符串表达式或 Node 对象,为 None 时从 metadata 查询
|
||||||
data_specs: 可选的数据规格
|
data_specs: 可选的数据规格
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -152,19 +202,21 @@ class FactorEngine:
|
|||||||
Raises:
|
Raises:
|
||||||
TypeError: 当 expression 类型不支持时
|
TypeError: 当 expression 类型不支持时
|
||||||
FormulaParseError: 当字符串解析失败时(立即报错)
|
FormulaParseError: 当字符串解析失败时(立即报错)
|
||||||
|
RuntimeError: 当 expression 为 None 但未配置 metadata 时
|
||||||
|
ValueError: 当在 metadata 中未找到因子时
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
>>> engine = FactorEngine()
|
>>> engine = FactorEngine()
|
||||||
>>>
|
>>>
|
||||||
>>> # 字符串方式(新功能)
|
>>> # 方式1:字符串表达式
|
||||||
>>> engine.add_factor("ma20", "ts_mean(close, 20)")
|
>>> engine.add_factor("ma20", "ts_mean(close, 20)")
|
||||||
>>>
|
>>>
|
||||||
>>> # Node 方式(与 register 相同)
|
>>> # 方式2:Node 表达式
|
||||||
>>> from src.factors.api import close, ts_mean
|
>>> from src.factors.api import close, ts_mean
|
||||||
>>> engine.add_factor("ma20", ts_mean(close, 20))
|
>>> engine.add_factor("ma20", ts_mean(close, 20))
|
||||||
>>>
|
>>>
|
||||||
>>> # 复杂表达式
|
>>> # 方式3:从 metadata 查询(需要初始化时配置 metadata_path)
|
||||||
>>> engine.add_factor("alpha1", "cs_rank(close / open)")
|
>>> engine.add_factor("return_5") # 从 metadata 查询名为 return_5 的因子
|
||||||
>>>
|
>>>
|
||||||
>>> # 链式调用
|
>>> # 链式调用
|
||||||
>>> (engine
|
>>> (engine
|
||||||
@@ -172,6 +224,10 @@ class FactorEngine:
|
|||||||
... .add_factor("ma10", "ts_mean(close, 10)")
|
... .add_factor("ma10", "ts_mean(close, 10)")
|
||||||
... .add_factor("golden_cross", "ma5 > ma10"))
|
... .add_factor("golden_cross", "ma5 > ma10"))
|
||||||
"""
|
"""
|
||||||
|
if expression is None:
|
||||||
|
# 从 metadata 查询表达式
|
||||||
|
return self._add_factor_from_metadata(name, name, data_specs)
|
||||||
|
|
||||||
if isinstance(expression, str):
|
if isinstance(expression, str):
|
||||||
# Fail-Fast:立即解析,失败立即报错
|
# Fail-Fast:立即解析,失败立即报错
|
||||||
node = self._parser.parse(expression)
|
node = self._parser.parse(expression)
|
||||||
@@ -185,76 +241,6 @@ class FactorEngine:
|
|||||||
# 委托给现有的 register 方法
|
# 委托给现有的 register 方法
|
||||||
return self.register(name, node, data_specs)
|
return self.register(name, node, data_specs)
|
||||||
|
|
||||||
def add_factor_by_name(
|
|
||||||
self,
|
|
||||||
name: str,
|
|
||||||
factor_name_in_metadata: Optional[str] = None,
|
|
||||||
data_specs: Optional[List[DataSpec]] = None,
|
|
||||||
) -> "FactorEngine":
|
|
||||||
"""根据 metadata 中的因子名称注册因子。
|
|
||||||
|
|
||||||
从 metadata 管理器中根据因子名称查询 DSL 表达式,
|
|
||||||
然后解析并注册到引擎中。
|
|
||||||
|
|
||||||
Args:
|
|
||||||
name: 要注册的因子名称(引擎中使用的名称)
|
|
||||||
factor_name_in_metadata: metadata 中的因子名称,
|
|
||||||
为 None 时默认使用 name 参数
|
|
||||||
data_specs: 可选的数据规格
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
self,支持链式调用
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
RuntimeError: 当引擎未配置 metadata 路径时
|
|
||||||
ValueError: 当在 metadata 中未找到因子时
|
|
||||||
FormulaParseError: 当 DSL 表达式解析失败时
|
|
||||||
|
|
||||||
Example:
|
|
||||||
>>> # 初始化时启用 metadata
|
|
||||||
>>> engine = FactorEngine(metadata_path="data/factors.jsonl")
|
|
||||||
>>>
|
|
||||||
>>> # 注册 metadata 中的因子(使用相同名称)
|
|
||||||
>>> engine.add_factor_by_name("return_5")
|
|
||||||
>>>
|
|
||||||
>>> # 使用不同名称注册
|
|
||||||
>>> engine.add_factor_by_name("my_mom", "momentum_5d")
|
|
||||||
>>>
|
|
||||||
>>> # 链式调用
|
|
||||||
>>> (engine
|
|
||||||
... .add_factor_by_name("ma20")
|
|
||||||
... .add_factor_by_name("rsi14")
|
|
||||||
... .compute(["ma20", "rsi14"], "20240101", "20240131"))
|
|
||||||
"""
|
|
||||||
if self._metadata is None:
|
|
||||||
raise RuntimeError(
|
|
||||||
"引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,"
|
|
||||||
+ "例如:FactorEngine(metadata_path='data/factors.jsonl')"
|
|
||||||
)
|
|
||||||
|
|
||||||
# 使用传入的名称或默认使用 name
|
|
||||||
query_name = (
|
|
||||||
factor_name_in_metadata if factor_name_in_metadata is not None else name
|
|
||||||
)
|
|
||||||
|
|
||||||
# 从 metadata 查询因子
|
|
||||||
df = self._metadata.get_factors_by_name(query_name)
|
|
||||||
|
|
||||||
if len(df) == 0:
|
|
||||||
raise ValueError(
|
|
||||||
f"在 metadata 中未找到因子 '{query_name}'。"
|
|
||||||
+ "请确认因子名称正确,或先使用 FactorManager 添加该因子。"
|
|
||||||
)
|
|
||||||
|
|
||||||
# 获取 DSL 表达式
|
|
||||||
dsl_expr = df["dsl"][0]
|
|
||||||
|
|
||||||
# 解析表达式为 Node
|
|
||||||
node = self._parser.parse(dsl_expr)
|
|
||||||
|
|
||||||
# 委托给 register 方法
|
|
||||||
return self.register(name, node, data_specs)
|
|
||||||
|
|
||||||
def compute(
|
def compute(
|
||||||
self,
|
self,
|
||||||
factor_names: Union[str, List[str]],
|
factor_names: Union[str, List[str]],
|
||||||
|
|||||||
@@ -53,23 +53,32 @@ class FactorManager:
|
|||||||
_conn: DuckDB连接对象(懒加载)
|
_conn: DuckDB连接对象(懒加载)
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
>>> manager = FactorManager("data/factors.jsonl")
|
>>> manager = FactorManager() # 使用默认路径
|
||||||
>>> df = manager.get_factors_by_name("mom_5d")
|
>>> df = manager.get_factors_by_name("mom_5d")
|
||||||
>>> print(df["dsl"][0])
|
>>> print(df["dsl"][0])
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, filepath: str) -> None:
|
_DEFAULT_FILENAME = "factors.jsonl"
|
||||||
|
|
||||||
|
def __init__(self, filepath: Optional[str] = None) -> None:
|
||||||
"""初始化因子管理器。
|
"""初始化因子管理器。
|
||||||
|
|
||||||
如果文件不存在,会自动创建空的JSONL文件。
|
如果文件不存在,会自动创建空的JSONL文件。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
filepath: JSONL文件路径(相对或绝对路径)
|
filepath: JSONL文件路径(相对或绝对路径),为None时使用默认路径
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
FileOperationError: 当文件创建失败时
|
FileOperationError: 当文件创建失败时
|
||||||
"""
|
"""
|
||||||
self.filepath = Path(filepath).resolve()
|
if filepath is None:
|
||||||
|
# 使用默认路径:从配置读取数据目录
|
||||||
|
from src.config.settings import settings
|
||||||
|
|
||||||
|
self.filepath = settings.data_path_resolved / self._DEFAULT_FILENAME
|
||||||
|
else:
|
||||||
|
self.filepath = Path(filepath).resolve()
|
||||||
|
|
||||||
self._conn: Optional[duckdb.DuckDBPyConnection] = None
|
self._conn: Optional[duckdb.DuckDBPyConnection] = None
|
||||||
|
|
||||||
# 确保文件存在
|
# 确保文件存在
|
||||||
|
|||||||
Reference in New Issue
Block a user