refactor(factors): 简化 add_factor API 并默认启用 metadata

- 合并 add_factor_by_name 到 add_factor,支持三种调用方式
- FactorManager 构造函数改为可选参数,使用默认路径
- FactorEngine 默认启用 metadata,无需手动配置路径
This commit is contained in:
2026-03-12 22:34:25 +08:00
parent 2bb7718dd1
commit ced7a929c3
7 changed files with 496 additions and 254 deletions

View File

View File

@@ -22,10 +22,13 @@
"source": "## 1. 导入依赖"
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:49.975545Z",
"start_time": "2026-03-11T16:02:48.487347Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"import os\n",
"from datetime import datetime\n",
@@ -50,7 +53,9 @@
"from src.training.components.models import LightGBMLambdaRankModel\n",
"from src.training.config import TrainingConfig\n",
"\n"
]
],
"outputs": [],
"execution_count": 1
},
{
"metadata": {},
@@ -58,35 +63,49 @@
"source": "## 2. 辅助函数"
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:49.989220Z",
"start_time": "2026-03-11T16:02:49.981542Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"def create_factors_with_metadata(\n",
" engine: FactorEngine, factor_definitions: dict, label_factor: dict\n",
" engine: FactorEngine,\n",
" selected_factors: List[str],\n",
" factor_definitions: dict,\n",
" label_factor: dict,\n",
") -> List[str]:\n",
" \"\"\"使用 metadata 注册因子特征因子通过名称注册label 因子通过表达式注册)\"\"\"\n",
" \"\"\"注册因子SELECTED_FACTORS 从 metadata 查询FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n",
" print(\"=\" * 80)\n",
" print(\"使用 metadata 注册因子\")\n",
" print(\"注册因子\")\n",
" print(\"=\" * 80)\n",
"\n",
" # 注册所有特征因子(通过 metadata 名称\n",
" # 注册 SELECTED_FACTORS 中的因子(已在 metadata \n",
" print(\"\\n注册特征因子从 metadata:\")\n",
" for name in factor_definitions.keys():\n",
" engine.add_factor_by_name(name)\n",
" for name in selected_factors:\n",
" engine.add_factor(name)\n",
" print(f\" - {name}\")\n",
"\n",
" # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n",
" # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n",
" print(\"\\n注册特征因子表达式:\")\n",
" for name, expr in factor_definitions.items():\n",
" engine.add_factor(name, expr)\n",
" print(f\" - {name}: {expr}\")\n",
"\n",
" # 注册 label 因子(通过表达式)\n",
" print(\"\\n注册 Label 因子(表达式):\")\n",
" for name, expr in label_factor.items():\n",
" engine.add_factor(name, expr)\n",
" print(f\" - {name}: {expr}\")\n",
"\n",
" # 从字典自动获取特征列\n",
" feature_cols = list(factor_definitions.keys())\n",
" # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n",
" feature_cols = selected_factors + list(factor_definitions.keys())\n",
"\n",
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
" print(f\" - 来自 metadata: {len(selected_factors)}\")\n",
" print(f\" - 来自表达式: {len(factor_definitions)}\")\n",
" print(f\"Label: {list(label_factor.keys())[0]}\")\n",
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
"\n",
@@ -251,7 +270,9 @@
"\n",
" return results\n",
"\n"
]
],
"outputs": [],
"execution_count": 2
},
{
"metadata": {},
@@ -263,77 +284,86 @@
]
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:50.000875Z",
"start_time": "2026-03-11T16:02:49.994082Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# 特征因子定义字典(复用 regression.ipynb 的因子定义)\n",
"LABEL_NAME = \"future_return_5_rank\"\n",
"\n",
"FACTOR_DEFINITIONS = {\n",
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n",
" \"ma_5\": \"ts_mean(close, 5)\",\n",
" \"ma_20\": \"ts_mean(close, 20)\",\n",
" \"ma_ratio_5_20\": \"ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1\",\n",
" \"bias_10\": \"close / (ts_mean(close, 10) + 1e-8) - 1\",\n",
" \"high_low_ratio\": \"(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)\",\n",
" \"bbi_ratio\": \"(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)\",\n",
" \"return_5\": \"(close / (ts_delay(close, 5) + 1e-8)) - 1\",\n",
" \"return_20\": \"(close / (ts_delay(close, 20) + 1e-8)) - 1\",\n",
" \"kaufman_ER_20\": \"abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)\",\n",
" \"mom_acceleration_10_20\": \"(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)\",\n",
" \"drawdown_from_high_60\": \"close / (ts_max(high, 60) + 1e-8) - 1\",\n",
" \"up_days_ratio_20\": \"ts_sum(close > ts_delay(close, 1), 20) / 20\",\n",
"# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n",
"SELECTED_FACTORS = [\n",
" # ================= 1. 价格、趋势与路径依赖 =================\n",
" \"ma_5\",\n",
" \"ma_20\",\n",
" \"ma_ratio_5_20\",\n",
" \"bias_10\",\n",
" \"high_low_ratio\",\n",
" \"bbi_ratio\",\n",
" \"return_5\",\n",
" \"return_20\",\n",
" \"kaufman_ER_20\",\n",
" \"mom_acceleration_10_20\",\n",
" \"drawdown_from_high_60\",\n",
" \"up_days_ratio_20\",\n",
" # ================= 2. 波动率、风险调整与高阶矩 =================\n",
" \"volatility_5\": \"ts_std(close, 5)\",\n",
" \"volatility_20\": \"ts_std(close, 20)\",\n",
" \"volatility_ratio\": \"ts_std(close, 5) / (ts_std(close, 20) + 1e-8)\",\n",
" \"std_return_20\": \"ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)\",\n",
" \"sharpe_ratio_20\": \"ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)\",\n",
" \"min_ret_20\": \"ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n",
" \"volatility_squeeze_5_60\": \"ts_std(close, 5) / (ts_std(close, 60) + 1e-8)\",\n",
" \"volatility_5\",\n",
" \"volatility_20\",\n",
" \"volatility_ratio\",\n",
" \"std_return_20\",\n",
" \"sharpe_ratio_20\",\n",
" \"min_ret_20\",\n",
" \"volatility_squeeze_5_60\",\n",
" # ================= 3. 日内微观结构与异象 =================\n",
" \"overnight_intraday_diff\": \"(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)\",\n",
" \"upper_shadow_ratio\": \"(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)\",\n",
" \"capital_retention_20\": \"ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)\",\n",
" \"max_ret_20\": \"ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n",
" \"overnight_intraday_diff\",\n",
" \"upper_shadow_ratio\",\n",
" \"capital_retention_20\",\n",
" \"max_ret_20\",\n",
" # ================= 4. 量能、流动性与量价背离 =================\n",
" \"volume_ratio_5_20\": \"ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)\",\n",
" \"turnover_rate_mean_5\": \"ts_mean(turnover_rate, 5)\",\n",
" \"turnover_deviation\": \"(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)\",\n",
" \"amihud_illiq_20\": \"ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)\",\n",
" \"turnover_cv_20\": \"ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)\",\n",
" \"pv_corr_20\": \"ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)\",\n",
" \"close_vwap_deviation\": \"close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1\",\n",
" \"volume_ratio_5_20\",\n",
" \"turnover_rate_mean_5\",\n",
" \"turnover_deviation\",\n",
" \"amihud_illiq_20\",\n",
" \"turnover_cv_20\",\n",
" \"pv_corr_20\",\n",
" \"close_vwap_deviation\",\n",
" # ================= 5. 基本面财务特征 =================\n",
" \"roe\": \"n_income / (total_hldr_eqy_exc_min_int + 1e-8)\",\n",
" \"roa\": \"n_income / (total_assets + 1e-8)\",\n",
" \"profit_margin\": \"n_income / (revenue + 1e-8)\",\n",
" \"debt_to_equity\": \"total_liab / (total_hldr_eqy_exc_min_int + 1e-8)\",\n",
" \"current_ratio\": \"total_cur_assets / (total_cur_liab + 1e-8)\",\n",
" \"net_profit_yoy\": \"(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1\",\n",
" \"revenue_yoy\": \"(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1\",\n",
" \"healthy_expansion_velocity\": \"(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)\",\n",
" \"roe\",\n",
" \"roa\",\n",
" \"profit_margin\",\n",
" \"debt_to_equity\",\n",
" \"current_ratio\",\n",
" \"net_profit_yoy\",\n",
" \"revenue_yoy\",\n",
" \"healthy_expansion_velocity\",\n",
" # ================= 6. 基本面估值与截面动量共振 =================\n",
" \"EP\": \"n_income / (total_mv * 10000 + 1e-8)\",\n",
" \"BP\": \"total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)\",\n",
" \"CP\": \"n_cashflow_act / (total_mv * 10000 + 1e-8)\",\n",
" \"market_cap_rank\": \"cs_rank(total_mv)\",\n",
" \"turnover_rank\": \"cs_rank(turnover_rate)\",\n",
" \"return_5_rank\": \"cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)\",\n",
" \"EP_rank\": \"cs_rank(n_income / (total_mv + 1e-8))\",\n",
" \"pe_expansion_trend\": \"(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1\",\n",
" \"value_price_divergence\": \"cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))\",\n",
" \"active_market_cap\": \"total_mv * ts_mean(turnover_rate, 20)\",\n",
" \"ebit_rank\": \"cs_rank(ebit)\",\n",
"}\n",
" \"EP\",\n",
" \"BP\",\n",
" \"CP\",\n",
" \"market_cap_rank\",\n",
" \"turnover_rank\",\n",
" \"return_5_rank\",\n",
" \"EP_rank\",\n",
" \"pe_expansion_trend\",\n",
" \"value_price_divergence\",\n",
" \"active_market_cap\",\n",
" \"ebit_rank\",\n",
"]\n",
"\n",
"# 因子定义字典(完整因子库)\n",
"FACTOR_DEFINITIONS = {\"turnover_volatility_ratio\": \"log(ts_std(turnover_rate, 20))\"}\n",
"\n",
"# Label 因子定义(不参与训练,用于计算目标)\n",
"LABEL_FACTOR = {\n",
" LABEL_NAME: \"(ts_delay(close, -5) / ts_delay(open, -1)) - 1\",\n",
"}"
]
],
"outputs": [],
"execution_count": 3
},
{
"metadata": {},
@@ -341,10 +371,13 @@
"source": "### 3.2 训练参数配置"
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:50.009081Z",
"start_time": "2026-03-11T16:02:50.005330Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# 日期范围配置(正确的 train/val/test 三分法)\n",
"TRAIN_START = \"20200101\"\n",
@@ -377,7 +410,7 @@
"N_QUANTILES = 20 # 将 label 分为 20 组\n",
"\n",
"# 特征列(用于数据处理器)\n",
"FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())\n",
"FEATURE_COLS = SELECTED_FACTORS\n",
"\n",
"# 数据处理器配置\n",
"PROCESSORS = [\n",
@@ -421,7 +454,9 @@
"\n",
"# Top N 配置:每日推荐股票数量\n",
"TOP_N = 5 # 可调整为 10, 20 等"
]
],
"outputs": [],
"execution_count": 4
},
{
"metadata": {},
@@ -429,10 +464,13 @@
"source": "## 4. 训练流程"
},
{
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:50.330018Z",
"start_time": "2026-03-11T16:02:50.012964Z"
}
},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"print(\"\\n\" + \"=\" * 80)\n",
"print(\"LightGBM LambdaRank 排序学习训练\")\n",
@@ -444,7 +482,9 @@
"\n",
"# 2. 使用 metadata 定义因子\n",
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
"feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n",
"feature_cols = create_factors_with_metadata(\n",
" engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n",
")\n",
"\n",
"# 3. 准备数据\n",
"print(\"\\n[3] 准备数据\")\n",
@@ -507,7 +547,49 @@
" feature_cols=feature_cols,\n",
" persist_model=PERSIST_MODEL,\n",
")"
]
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"================================================================================\n",
"LightGBM LambdaRank 排序学习训练\n",
"================================================================================\n",
"\n",
"[1] 创建 FactorEngine\n",
"\n",
"[2] 定义因子(从 metadata 注册)\n",
"================================================================================\n",
"注册因子\n",
"================================================================================\n",
"\n",
"注册特征因子(从 metadata:\n"
]
},
{
"ename": "QueryError",
"evalue": "查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n ",
"output_type": "error",
"traceback": [
"\u001B[31m---------------------------------------------------------------------------\u001B[39m",
"\u001B[31mBinderException\u001B[39m Traceback (most recent call last)",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:296\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 295\u001B[39m conn = \u001B[38;5;28mself\u001B[39m._get_connection()\n\u001B[32m--> \u001B[39m\u001B[32m296\u001B[39m result = \u001B[43mconn\u001B[49m\u001B[43m.\u001B[49m\u001B[43mexecute\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m.pl()\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n",
"\u001B[31mBinderException\u001B[39m: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001B[31mQueryError\u001B[39m Traceback (most recent call last)",
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[5]\u001B[39m\u001B[32m, line 11\u001B[39m\n\u001B[32m 9\u001B[39m \u001B[38;5;66;03m# 2. 使用 metadata 定义因子\u001B[39;00m\n\u001B[32m 10\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[2] 定义因子(从 metadata 注册)\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m---> \u001B[39m\u001B[32m11\u001B[39m feature_cols = \u001B[43mcreate_factors_with_metadata\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 12\u001B[39m \u001B[43m \u001B[49m\u001B[43mengine\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mSELECTED_FACTORS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mFACTOR_DEFINITIONS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mLABEL_FACTOR\u001B[49m\n\u001B[32m 13\u001B[39m \u001B[43m)\u001B[49m\n\u001B[32m 15\u001B[39m \u001B[38;5;66;03m# 3. 准备数据\u001B[39;00m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[3] 准备数据\u001B[39m\u001B[33m\"\u001B[39m)\n",
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 15\u001B[39m, in \u001B[36mcreate_factors_with_metadata\u001B[39m\u001B[34m(engine, selected_factors, factor_definitions, label_factor)\u001B[39m\n\u001B[32m 13\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m注册特征因子从 metadata:\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 14\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m name \u001B[38;5;129;01min\u001B[39;00m selected_factors:\n\u001B[32m---> \u001B[39m\u001B[32m15\u001B[39m \u001B[43mengine\u001B[49m\u001B[43m.\u001B[49m\u001B[43madd_factor\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m - \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 18\u001B[39m \u001B[38;5;66;03m# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\u001B[39;00m\n",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:225\u001B[39m, in \u001B[36mFactorEngine.add_factor\u001B[39m\u001B[34m(self, name, expression, data_specs)\u001B[39m\n\u001B[32m 182\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"注册因子(支持多种调用方式)。\u001B[39;00m\n\u001B[32m 183\u001B[39m \n\u001B[32m 184\u001B[39m \u001B[33;03m这是 register 方法的增强版,支持以下调用方式:\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 221\u001B[39m \u001B[33;03m ... .add_factor(\"golden_cross\", \"ma5 > ma10\"))\u001B[39;00m\n\u001B[32m 222\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 223\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m expression \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 224\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询表达式\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m225\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_add_factor_from_metadata\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdata_specs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 227\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(expression, \u001B[38;5;28mstr\u001B[39m):\n\u001B[32m 228\u001B[39m \u001B[38;5;66;03m# Fail-Fast立即解析失败立即报错\u001B[39;00m\n\u001B[32m 229\u001B[39m node = \u001B[38;5;28mself\u001B[39m._parser.parse(expression)\n",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:159\u001B[39m, in \u001B[36mFactorEngine._add_factor_from_metadata\u001B[39m\u001B[34m(self, name, factor_name_in_metadata, data_specs)\u001B[39m\n\u001B[32m 153\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mRuntimeError\u001B[39;00m(\n\u001B[32m 154\u001B[39m \u001B[33m\"\u001B[39m\u001B[33m引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 155\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m例如FactorEngine(metadata_path=\u001B[39m\u001B[33m'\u001B[39m\u001B[33mdata/factors.jsonl\u001B[39m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 156\u001B[39m )\n\u001B[32m 158\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询因子\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m159\u001B[39m df = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_metadata\u001B[49m\u001B[43m.\u001B[49m\u001B[43mget_factors_by_name\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfactor_name_in_metadata\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 161\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(df) == \u001B[32m0\u001B[39m:\n\u001B[32m 162\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[32m 163\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m在 metadata 中未找到因子 \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfactor_name_in_metadata\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 164\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m请确认因子名称正确或先使用 FactorManager 添加该因子。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 165\u001B[39m )\n",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:177\u001B[39m, in \u001B[36mFactorManager.get_factors_by_name\u001B[39m\u001B[34m(self, name)\u001B[39m\n\u001B[32m 154\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"根据名称查询因子。\u001B[39;00m\n\u001B[32m 155\u001B[39m \n\u001B[32m 156\u001B[39m \u001B[33;03m使用DuckDB执行SQL查询返回Polars DataFrame。\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 170\u001B[39m \u001B[33;03m ... print(df[\"dsl\"][0])\u001B[39;00m\n\u001B[32m 171\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 172\u001B[39m sql = \u001B[33mf\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m 173\u001B[39m \u001B[33m SELECT *\u001B[39m\n\u001B[32m 174\u001B[39m \u001B[33m FROM read_json_auto(\u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mself\u001B[39m.filepath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\n\u001B[32m 175\u001B[39m \u001B[33m WHERE name = \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\n\u001B[32m 176\u001B[39m \u001B[33m\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m--> \u001B[39m\u001B[32m177\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_execute_query\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m\n",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:299\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n\u001B[32m 298\u001B[39m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[32m--> \u001B[39m\u001B[32m299\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m QueryError(sql, e)\n",
"\u001B[31mQueryError\u001B[39m: 查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n "
]
}
],
"execution_count": 5
},
{
"metadata": {},

View File

@@ -40,29 +40,40 @@ from src.training.config import TrainingConfig
# ## 2. 辅助函数
# %%
def create_factors_with_metadata(
engine: FactorEngine, factor_definitions: dict, label_factor: dict
engine: FactorEngine,
selected_factors: List[str],
factor_definitions: dict,
label_factor: dict,
) -> List[str]:
"""使用 metadata 注册因子特征因子通过名称注册label 因子通过表达式注册)"""
"""注册因子SELECTED_FACTORS 从 metadata 查询FACTOR_DEFINITIONS 用表达式注册)"""
print("=" * 80)
print("使用 metadata 注册因子")
print("注册因子")
print("=" * 80)
# 注册所有特征因子(通过 metadata 名称
# 注册 SELECTED_FACTORS 中的因子(已在 metadata 中,按名称查询)
print("\n注册特征因子(从 metadata:")
for name in factor_definitions.keys():
engine.add_factor_by_name(name)
for name in selected_factors:
engine.add_factor(name)
print(f" - {name}")
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中)
# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
print("\n注册特征因子(表达式):")
for name, expr in factor_definitions.items():
engine.add_factor(name, expr)
print(f" - {name}: {expr}")
# 注册 label 因子(通过表达式)
print("\n注册 Label 因子(表达式):")
for name, expr in label_factor.items():
engine.add_factor(name, expr)
print(f" - {name}: {expr}")
# 从字典自动获取特征列
feature_cols = list(factor_definitions.keys())
# 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
feature_cols = selected_factors + list(factor_definitions.keys())
print(f"\n特征因子数: {len(feature_cols)}")
print(f" - 来自 metadata: {len(selected_factors)}")
print(f" - 来自表达式: {len(factor_definitions)}")
print(f"Label: {list(label_factor.keys())[0]}")
print(f"已注册因子总数: {len(engine.list_registered())}")
@@ -236,62 +247,68 @@ def evaluate_ndcg_at_k(
# 特征因子定义字典(复用 regression.ipynb 的因子定义)
LABEL_NAME = "future_return_5_rank"
FACTOR_DEFINITIONS = {
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
"ma_5": "ts_mean(close, 5)",
"ma_20": "ts_mean(close, 20)",
"ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1",
"bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1",
"high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)",
"bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)",
"return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1",
"return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1",
"kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)",
"mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)",
"drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1",
"up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20",
# 当前选择的因子列表(按名称从 metadata 中查询并注册)
SELECTED_FACTORS = [
# ================= 1. 价格、趋势与路径依赖 =================
"ma_5",
"ma_20",
"ma_ratio_5_20",
"bias_10",
"high_low_ratio",
"bbi_ratio",
"return_5",
"return_20",
"kaufman_ER_20",
"mom_acceleration_10_20",
"drawdown_from_high_60",
"up_days_ratio_20",
# ================= 2. 波动率、风险调整与高阶矩 =================
"volatility_5": "ts_std(close, 5)",
"volatility_20": "ts_std(close, 20)",
"volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)",
"std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)",
"sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)",
"min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
"volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)",
"volatility_5",
"volatility_20",
"volatility_ratio",
"std_return_20",
"sharpe_ratio_20",
"min_ret_20",
"volatility_squeeze_5_60",
# ================= 3. 日内微观结构与异象 =================
"overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)",
"upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)",
"capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)",
"max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
"overnight_intraday_diff",
"upper_shadow_ratio",
"capital_retention_20",
"max_ret_20",
# ================= 4. 量能、流动性与量价背离 =================
"volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)",
"turnover_rate_mean_5": "ts_mean(turnover_rate, 5)",
"turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)",
"amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)",
"turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)",
"pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)",
"close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1",
"volume_ratio_5_20",
"turnover_rate_mean_5",
"turnover_deviation",
"amihud_illiq_20",
"turnover_cv_20",
"pv_corr_20",
"close_vwap_deviation",
# ================= 5. 基本面财务特征 =================
"roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)",
"roa": "n_income / (total_assets + 1e-8)",
"profit_margin": "n_income / (revenue + 1e-8)",
"debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)",
"current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)",
"net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1",
"revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1",
"healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)",
"roe",
"roa",
"profit_margin",
"debt_to_equity",
"current_ratio",
"net_profit_yoy",
"revenue_yoy",
"healthy_expansion_velocity",
# ================= 6. 基本面估值与截面动量共振 =================
"EP": "n_income / (total_mv * 10000 + 1e-8)",
"BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)",
"CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)",
"market_cap_rank": "cs_rank(total_mv)",
"turnover_rank": "cs_rank(turnover_rate)",
"return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)",
"EP_rank": "cs_rank(n_income / (total_mv + 1e-8))",
"pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1",
"value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))",
"active_market_cap": "total_mv * ts_mean(turnover_rate, 20)",
"ebit_rank": "cs_rank(ebit)",
"EP",
"BP",
"CP",
"market_cap_rank",
"turnover_rank",
"return_5_rank",
"EP_rank",
"pe_expansion_trend",
"value_price_divergence",
"active_market_cap",
"ebit_rank",
]
# 因子定义字典(尚未写入 metadata、需通过表达式注册的因子)
FACTOR_DEFINITIONS = {
# "turnover_volatility_ratio": "log(ts_std(turnover_rate, 20))"
}
# Label 因子定义(不参与训练,用于计算目标)
@@ -332,7 +349,7 @@ MODEL_PARAMS = {
N_QUANTILES = 20 # 将 label 分为 20 组
# 特征列(用于数据处理器)
FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())
# Feature columns must include both the metadata-backed factors and the
# expression-defined ones, mirroring the list that
# create_factors_with_metadata() builds. Concatenating also creates a new
# list, so FEATURE_COLS no longer aliases SELECTED_FACTORS.
FEATURE_COLS = SELECTED_FACTORS + list(FACTOR_DEFINITIONS.keys())
# 数据处理器配置
PROCESSORS = [
@@ -385,11 +402,13 @@ print("=" * 80)
# 1. 创建 FactorEngine启用 metadata 功能)
print("\n[1] 创建 FactorEngine")
engine = FactorEngine(metadata_path="data/factors.jsonl")
engine = FactorEngine()
# 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
feature_cols = create_factors_with_metadata(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
)
# 3. 准备数据
print("\n[3] 准备数据")

View File

@@ -47,29 +47,40 @@
"execution_count": null,
"source": [
"def create_factors_with_metadata(\n",
" engine: FactorEngine, factor_definitions: dict, label_factor: dict\n",
" engine: FactorEngine,\n",
" selected_factors: List[str],\n",
" factor_definitions: dict,\n",
" label_factor: dict,\n",
") -> List[str]:\n",
" \"\"\"使用 metadata 注册因子特征因子通过名称注册label 因子通过表达式注册)\"\"\"\n",
" \"\"\"注册因子SELECTED_FACTORS 从 metadata 查询FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n",
" print(\"=\" * 80)\n",
" print(\"使用 metadata 注册因子\")\n",
" print(\"注册因子\")\n",
" print(\"=\" * 80)\n",
"\n",
" # 注册所有特征因子(通过 metadata 名称\n",
" # 注册 SELECTED_FACTORS 中的因子(已在 metadata \n",
" print(\"\\n注册特征因子从 metadata:\")\n",
" for name in factor_definitions.keys():\n",
" engine.add_factor_by_name(name)\n",
" for name in selected_factors:\n",
" engine.add_factor(name)\n",
" print(f\" - {name}\")\n",
"\n",
" # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n",
" # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n",
" print(\"\\n注册特征因子表达式:\")\n",
" for name, expr in factor_definitions.items():\n",
" engine.add_factor(name, expr)\n",
" print(f\" - {name}: {expr}\")\n",
"\n",
" # 注册 label 因子(通过表达式)\n",
" print(\"\\n注册 Label 因子(表达式):\")\n",
" for name, expr in label_factor.items():\n",
" engine.add_factor(name, expr)\n",
" print(f\" - {name}: {expr}\")\n",
"\n",
" # 从字典自动获取特征列\n",
" feature_cols = list(factor_definitions.keys())\n",
" # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n",
" feature_cols = selected_factors + list(factor_definitions.keys())\n",
"\n",
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
" print(f\" - 来自 metadata: {len(selected_factors)}\")\n",
" print(f\" - 来自表达式: {len(factor_definitions)}\")\n",
" print(f\"Label: {list(label_factor.keys())[0]}\")\n",
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
"\n",
@@ -123,7 +134,67 @@
"# 特征因子定义字典:新增因子只需在此处添加一行\n",
"LABEL_NAME = \"future_return_5\"\n",
"\n",
"FACTOR_DEFINITIONS = FACTOR_DICT = {\n",
"# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n",
"SELECTED_FACTORS = [\n",
" # ================= 1. 价格、趋势与路径依赖 =================\n",
" \"ma_5\",\n",
" \"ma_20\",\n",
" \"ma_ratio_5_20\",\n",
" \"bias_10\",\n",
" \"high_low_ratio\",\n",
" \"bbi_ratio\",\n",
" \"return_5\",\n",
" \"return_20\",\n",
" \"kaufman_ER_20\",\n",
" \"mom_acceleration_10_20\",\n",
" \"drawdown_from_high_60\",\n",
" \"up_days_ratio_20\",\n",
" # ================= 2. 波动率、风险调整与高阶矩 =================\n",
" \"volatility_5\",\n",
" \"volatility_20\",\n",
" \"volatility_ratio\",\n",
" \"std_return_20\",\n",
" \"sharpe_ratio_20\",\n",
" \"min_ret_20\",\n",
" \"volatility_squeeze_5_60\",\n",
" # ================= 3. 日内微观结构与异象 =================\n",
" \"overnight_intraday_diff\",\n",
" \"upper_shadow_ratio\",\n",
" \"capital_retention_20\",\n",
" \"max_ret_20\",\n",
" # ================= 4. 量能、流动性与量价背离 =================\n",
" \"volume_ratio_5_20\",\n",
" \"turnover_rate_mean_5\",\n",
" \"turnover_deviation\",\n",
" \"amihud_illiq_20\",\n",
" \"turnover_cv_20\",\n",
" \"pv_corr_20\",\n",
" \"close_vwap_deviation\",\n",
" # ================= 5. 基本面财务特征 =================\n",
" \"roe\",\n",
" \"roa\",\n",
" \"profit_margin\",\n",
" \"debt_to_equity\",\n",
" \"current_ratio\",\n",
" \"net_profit_yoy\",\n",
" \"revenue_yoy\",\n",
" \"healthy_expansion_velocity\",\n",
" # ================= 6. 基本面估值与截面动量共振 =================\n",
" \"EP\",\n",
" \"BP\",\n",
" \"CP\",\n",
" \"market_cap_rank\",\n",
" \"turnover_rank\",\n",
" \"return_5_rank\",\n",
" \"EP_rank\",\n",
" \"pe_expansion_trend\",\n",
" \"value_price_divergence\",\n",
" \"active_market_cap\",\n",
" \"ebit_rank\",\n",
"]\n",
"\n",
"# 因子定义字典(完整因子库)\n",
"FACTOR_DEFINITIONS = {\n",
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n",
" \"ma_5\": \"ts_mean(close, 5)\",\n",
" \"ma_20\": \"ts_mean(close, 20)\",\n",
@@ -338,7 +409,9 @@
"\n",
"# 2. 使用 metadata 定义因子\n",
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
"feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n",
"feature_cols = create_factors_with_metadata(\n",
" engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n",
")\n",
"target_col = LABEL_NAME\n",
"\n",
"# 3. 准备数据(使用模块级别的日期配置)\n",

View File

@@ -26,29 +26,40 @@ from src.training.config import TrainingConfig
# ## 2. 定义辅助函数
# %%
def create_factors_with_metadata(
engine: FactorEngine, factor_definitions: dict, label_factor: dict
engine: FactorEngine,
selected_factors: List[str],
factor_definitions: dict,
label_factor: dict,
) -> List[str]:
"""使用 metadata 注册因子特征因子通过名称注册label 因子通过表达式注册)"""
"""注册因子SELECTED_FACTORS 从 metadata 查询FACTOR_DEFINITIONS 用表达式注册)"""
print("=" * 80)
print("使用 metadata 注册因子")
print("注册因子")
print("=" * 80)
# 注册所有特征因子(通过 metadata 名称
# 注册 SELECTED_FACTORS 中的因子(已在 metadata 中,按名称查询)
print("\n注册特征因子(从 metadata:")
for name in factor_definitions.keys():
engine.add_factor_by_name(name)
for name in selected_factors:
engine.add_factor(name)
print(f" - {name}")
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中)
# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
print("\n注册特征因子(表达式):")
for name, expr in factor_definitions.items():
engine.add_factor(name, expr)
print(f" - {name}: {expr}")
# 注册 label 因子(通过表达式)
print("\n注册 Label 因子(表达式):")
for name, expr in label_factor.items():
engine.add_factor(name, expr)
print(f" - {name}: {expr}")
# 从字典自动获取特征列
feature_cols = list(factor_definitions.keys())
# 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
feature_cols = selected_factors + list(factor_definitions.keys())
print(f"\n特征因子数: {len(feature_cols)}")
print(f" - 来自 metadata: {len(selected_factors)}")
print(f" - 来自表达式: {len(factor_definitions)}")
print(f"Label: {list(label_factor.keys())[0]}")
print(f"已注册因子总数: {len(engine.list_registered())}")
@@ -91,7 +102,67 @@ def prepare_data(
# 特征因子定义字典:新增因子只需在此处添加一行
LABEL_NAME = "future_return_5"
FACTOR_DEFINITIONS = FACTOR_DICT = {
# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)
SELECTED_FACTORS = [
# ================= 1. 价格、趋势与路径依赖 =================
"ma_5",
"ma_20",
"ma_ratio_5_20",
"bias_10",
"high_low_ratio",
"bbi_ratio",
"return_5",
"return_20",
"kaufman_ER_20",
"mom_acceleration_10_20",
"drawdown_from_high_60",
"up_days_ratio_20",
# ================= 2. 波动率、风险调整与高阶矩 =================
"volatility_5",
"volatility_20",
"volatility_ratio",
"std_return_20",
"sharpe_ratio_20",
"min_ret_20",
"volatility_squeeze_5_60",
# ================= 3. 日内微观结构与异象 =================
"overnight_intraday_diff",
"upper_shadow_ratio",
"capital_retention_20",
"max_ret_20",
# ================= 4. 量能、流动性与量价背离 =================
"volume_ratio_5_20",
"turnover_rate_mean_5",
"turnover_deviation",
"amihud_illiq_20",
"turnover_cv_20",
"pv_corr_20",
"close_vwap_deviation",
# ================= 5. 基本面财务特征 =================
"roe",
"roa",
"profit_margin",
"debt_to_equity",
"current_ratio",
"net_profit_yoy",
"revenue_yoy",
"healthy_expansion_velocity",
# ================= 6. 基本面估值与截面动量共振 =================
"EP",
"BP",
"CP",
"market_cap_rank",
"turnover_rank",
"return_5_rank",
"EP_rank",
"pe_expansion_trend",
"value_price_divergence",
"active_market_cap",
"ebit_rank",
]
# 因子定义字典(完整因子库)
FACTOR_DEFINITIONS = {
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
"ma_5": "ts_mean(close, 5)",
"ma_20": "ts_mean(close, 20)",
@@ -284,7 +355,9 @@ engine = FactorEngine(metadata_path="data/factors.jsonl")
# 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
feature_cols = create_factors_with_metadata(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
)
target_col = LABEL_NAME
# 3. 准备数据(使用模块级别的日期配置)