refactor(factors): 简化 add_factor API 并默认启用 metadata
- 合并 add_factor_by_name 到 add_factor,支持三种调用方式
- FactorManager 构造函数改为可选参数,使用默认路径
- FactorEngine 默认启用 metadata,无需手动配置路径
This commit is contained in:
0
src/experiment/data/factors.jsonl
Normal file
0
src/experiment/data/factors.jsonl
Normal file
@@ -22,10 +22,13 @@
|
||||
"source": "## 1. 导入依赖"
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2026-03-11T16:02:49.975545Z",
|
||||
"start_time": "2026-03-11T16:02:48.487347Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from datetime import datetime\n",
|
||||
@@ -50,7 +53,9 @@
|
||||
"from src.training.components.models import LightGBMLambdaRankModel\n",
|
||||
"from src.training.config import TrainingConfig\n",
|
||||
"\n"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 1
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
@@ -58,35 +63,49 @@
|
||||
"source": "## 2. 辅助函数"
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2026-03-11T16:02:49.989220Z",
|
||||
"start_time": "2026-03-11T16:02:49.981542Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"def create_factors_with_metadata(\n",
|
||||
" engine: FactorEngine, factor_definitions: dict, label_factor: dict\n",
|
||||
" engine: FactorEngine,\n",
|
||||
" selected_factors: List[str],\n",
|
||||
" factor_definitions: dict,\n",
|
||||
" label_factor: dict,\n",
|
||||
") -> List[str]:\n",
|
||||
" \"\"\"使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)\"\"\"\n",
|
||||
" \"\"\"注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n",
|
||||
" print(\"=\" * 80)\n",
|
||||
" print(\"使用 metadata 注册因子\")\n",
|
||||
" print(\"注册因子\")\n",
|
||||
" print(\"=\" * 80)\n",
|
||||
"\n",
|
||||
" # 注册所有特征因子(通过 metadata 名称)\n",
|
||||
" # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)\n",
|
||||
" print(\"\\n注册特征因子(从 metadata):\")\n",
|
||||
" for name in factor_definitions.keys():\n",
|
||||
" engine.add_factor_by_name(name)\n",
|
||||
" for name in selected_factors:\n",
|
||||
" engine.add_factor(name)\n",
|
||||
" print(f\" - {name}\")\n",
|
||||
"\n",
|
||||
" # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n",
|
||||
" # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n",
|
||||
" print(\"\\n注册特征因子(表达式):\")\n",
|
||||
" for name, expr in factor_definitions.items():\n",
|
||||
" engine.add_factor(name, expr)\n",
|
||||
" print(f\" - {name}: {expr}\")\n",
|
||||
"\n",
|
||||
" # 注册 label 因子(通过表达式)\n",
|
||||
" print(\"\\n注册 Label 因子(表达式):\")\n",
|
||||
" for name, expr in label_factor.items():\n",
|
||||
" engine.add_factor(name, expr)\n",
|
||||
" print(f\" - {name}: {expr}\")\n",
|
||||
"\n",
|
||||
" # 从字典自动获取特征列\n",
|
||||
" feature_cols = list(factor_definitions.keys())\n",
|
||||
" # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n",
|
||||
" feature_cols = selected_factors + list(factor_definitions.keys())\n",
|
||||
"\n",
|
||||
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
|
||||
" print(f\" - 来自 metadata: {len(selected_factors)}\")\n",
|
||||
" print(f\" - 来自表达式: {len(factor_definitions)}\")\n",
|
||||
" print(f\"Label: {list(label_factor.keys())[0]}\")\n",
|
||||
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
|
||||
"\n",
|
||||
@@ -251,7 +270,9 @@
|
||||
"\n",
|
||||
" return results\n",
|
||||
"\n"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 2
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
@@ -263,77 +284,86 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2026-03-11T16:02:50.000875Z",
|
||||
"start_time": "2026-03-11T16:02:49.994082Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"# 特征因子定义字典(复用 regression.ipynb 的因子定义)\n",
|
||||
"LABEL_NAME = \"future_return_5_rank\"\n",
|
||||
"\n",
|
||||
"FACTOR_DEFINITIONS = {\n",
|
||||
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n",
|
||||
" \"ma_5\": \"ts_mean(close, 5)\",\n",
|
||||
" \"ma_20\": \"ts_mean(close, 20)\",\n",
|
||||
" \"ma_ratio_5_20\": \"ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1\",\n",
|
||||
" \"bias_10\": \"close / (ts_mean(close, 10) + 1e-8) - 1\",\n",
|
||||
" \"high_low_ratio\": \"(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)\",\n",
|
||||
" \"bbi_ratio\": \"(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)\",\n",
|
||||
" \"return_5\": \"(close / (ts_delay(close, 5) + 1e-8)) - 1\",\n",
|
||||
" \"return_20\": \"(close / (ts_delay(close, 20) + 1e-8)) - 1\",\n",
|
||||
" \"kaufman_ER_20\": \"abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)\",\n",
|
||||
" \"mom_acceleration_10_20\": \"(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)\",\n",
|
||||
" \"drawdown_from_high_60\": \"close / (ts_max(high, 60) + 1e-8) - 1\",\n",
|
||||
" \"up_days_ratio_20\": \"ts_sum(close > ts_delay(close, 1), 20) / 20\",\n",
|
||||
"# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n",
|
||||
"SELECTED_FACTORS = [\n",
|
||||
" # ================= 1. 价格、趋势与路径依赖 =================\n",
|
||||
" \"ma_5\",\n",
|
||||
" \"ma_20\",\n",
|
||||
" \"ma_ratio_5_20\",\n",
|
||||
" \"bias_10\",\n",
|
||||
" \"high_low_ratio\",\n",
|
||||
" \"bbi_ratio\",\n",
|
||||
" \"return_5\",\n",
|
||||
" \"return_20\",\n",
|
||||
" \"kaufman_ER_20\",\n",
|
||||
" \"mom_acceleration_10_20\",\n",
|
||||
" \"drawdown_from_high_60\",\n",
|
||||
" \"up_days_ratio_20\",\n",
|
||||
" # ================= 2. 波动率、风险调整与高阶矩 =================\n",
|
||||
" \"volatility_5\": \"ts_std(close, 5)\",\n",
|
||||
" \"volatility_20\": \"ts_std(close, 20)\",\n",
|
||||
" \"volatility_ratio\": \"ts_std(close, 5) / (ts_std(close, 20) + 1e-8)\",\n",
|
||||
" \"std_return_20\": \"ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)\",\n",
|
||||
" \"sharpe_ratio_20\": \"ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)\",\n",
|
||||
" \"min_ret_20\": \"ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n",
|
||||
" \"volatility_squeeze_5_60\": \"ts_std(close, 5) / (ts_std(close, 60) + 1e-8)\",\n",
|
||||
" \"volatility_5\",\n",
|
||||
" \"volatility_20\",\n",
|
||||
" \"volatility_ratio\",\n",
|
||||
" \"std_return_20\",\n",
|
||||
" \"sharpe_ratio_20\",\n",
|
||||
" \"min_ret_20\",\n",
|
||||
" \"volatility_squeeze_5_60\",\n",
|
||||
" # ================= 3. 日内微观结构与异象 =================\n",
|
||||
" \"overnight_intraday_diff\": \"(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)\",\n",
|
||||
" \"upper_shadow_ratio\": \"(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)\",\n",
|
||||
" \"capital_retention_20\": \"ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)\",\n",
|
||||
" \"max_ret_20\": \"ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n",
|
||||
" \"overnight_intraday_diff\",\n",
|
||||
" \"upper_shadow_ratio\",\n",
|
||||
" \"capital_retention_20\",\n",
|
||||
" \"max_ret_20\",\n",
|
||||
" # ================= 4. 量能、流动性与量价背离 =================\n",
|
||||
" \"volume_ratio_5_20\": \"ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)\",\n",
|
||||
" \"turnover_rate_mean_5\": \"ts_mean(turnover_rate, 5)\",\n",
|
||||
" \"turnover_deviation\": \"(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)\",\n",
|
||||
" \"amihud_illiq_20\": \"ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)\",\n",
|
||||
" \"turnover_cv_20\": \"ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)\",\n",
|
||||
" \"pv_corr_20\": \"ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)\",\n",
|
||||
" \"close_vwap_deviation\": \"close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1\",\n",
|
||||
" \"volume_ratio_5_20\",\n",
|
||||
" \"turnover_rate_mean_5\",\n",
|
||||
" \"turnover_deviation\",\n",
|
||||
" \"amihud_illiq_20\",\n",
|
||||
" \"turnover_cv_20\",\n",
|
||||
" \"pv_corr_20\",\n",
|
||||
" \"close_vwap_deviation\",\n",
|
||||
" # ================= 5. 基本面财务特征 =================\n",
|
||||
" \"roe\": \"n_income / (total_hldr_eqy_exc_min_int + 1e-8)\",\n",
|
||||
" \"roa\": \"n_income / (total_assets + 1e-8)\",\n",
|
||||
" \"profit_margin\": \"n_income / (revenue + 1e-8)\",\n",
|
||||
" \"debt_to_equity\": \"total_liab / (total_hldr_eqy_exc_min_int + 1e-8)\",\n",
|
||||
" \"current_ratio\": \"total_cur_assets / (total_cur_liab + 1e-8)\",\n",
|
||||
" \"net_profit_yoy\": \"(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1\",\n",
|
||||
" \"revenue_yoy\": \"(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1\",\n",
|
||||
" \"healthy_expansion_velocity\": \"(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)\",\n",
|
||||
" \"roe\",\n",
|
||||
" \"roa\",\n",
|
||||
" \"profit_margin\",\n",
|
||||
" \"debt_to_equity\",\n",
|
||||
" \"current_ratio\",\n",
|
||||
" \"net_profit_yoy\",\n",
|
||||
" \"revenue_yoy\",\n",
|
||||
" \"healthy_expansion_velocity\",\n",
|
||||
" # ================= 6. 基本面估值与截面动量共振 =================\n",
|
||||
" \"EP\": \"n_income / (total_mv * 10000 + 1e-8)\",\n",
|
||||
" \"BP\": \"total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)\",\n",
|
||||
" \"CP\": \"n_cashflow_act / (total_mv * 10000 + 1e-8)\",\n",
|
||||
" \"market_cap_rank\": \"cs_rank(total_mv)\",\n",
|
||||
" \"turnover_rank\": \"cs_rank(turnover_rate)\",\n",
|
||||
" \"return_5_rank\": \"cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)\",\n",
|
||||
" \"EP_rank\": \"cs_rank(n_income / (total_mv + 1e-8))\",\n",
|
||||
" \"pe_expansion_trend\": \"(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1\",\n",
|
||||
" \"value_price_divergence\": \"cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))\",\n",
|
||||
" \"active_market_cap\": \"total_mv * ts_mean(turnover_rate, 20)\",\n",
|
||||
" \"ebit_rank\": \"cs_rank(ebit)\",\n",
|
||||
"}\n",
|
||||
" \"EP\",\n",
|
||||
" \"BP\",\n",
|
||||
" \"CP\",\n",
|
||||
" \"market_cap_rank\",\n",
|
||||
" \"turnover_rank\",\n",
|
||||
" \"return_5_rank\",\n",
|
||||
" \"EP_rank\",\n",
|
||||
" \"pe_expansion_trend\",\n",
|
||||
" \"value_price_divergence\",\n",
|
||||
" \"active_market_cap\",\n",
|
||||
" \"ebit_rank\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# 因子定义字典(完整因子库)\n",
|
||||
"FACTOR_DEFINITIONS = {\"turnover_volatility_ratio\": \"log(ts_std(turnover_rate, 20))\"}\n",
|
||||
"\n",
|
||||
"# Label 因子定义(不参与训练,用于计算目标)\n",
|
||||
"LABEL_FACTOR = {\n",
|
||||
" LABEL_NAME: \"(ts_delay(close, -5) / ts_delay(open, -1)) - 1\",\n",
|
||||
"}"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 3
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
@@ -341,10 +371,13 @@
|
||||
"source": "### 3.2 训练参数配置"
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2026-03-11T16:02:50.009081Z",
|
||||
"start_time": "2026-03-11T16:02:50.005330Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"# 日期范围配置(正确的 train/val/test 三分法)\n",
|
||||
"TRAIN_START = \"20200101\"\n",
|
||||
@@ -377,7 +410,7 @@
|
||||
"N_QUANTILES = 20 # 将 label 分为 20 组\n",
|
||||
"\n",
|
||||
"# 特征列(用于数据处理器)\n",
|
||||
"FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())\n",
|
||||
"FEATURE_COLS = SELECTED_FACTORS\n",
|
||||
"\n",
|
||||
"# 数据处理器配置\n",
|
||||
"PROCESSORS = [\n",
|
||||
@@ -421,7 +454,9 @@
|
||||
"\n",
|
||||
"# Top N 配置:每日推荐股票数量\n",
|
||||
"TOP_N = 5 # 可调整为 10, 20 等"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 4
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
@@ -429,10 +464,13 @@
|
||||
"source": "## 4. 训练流程"
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2026-03-11T16:02:50.330018Z",
|
||||
"start_time": "2026-03-11T16:02:50.012964Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"print(\"\\n\" + \"=\" * 80)\n",
|
||||
"print(\"LightGBM LambdaRank 排序学习训练\")\n",
|
||||
@@ -444,7 +482,9 @@
|
||||
"\n",
|
||||
"# 2. 使用 metadata 定义因子\n",
|
||||
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
|
||||
"feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n",
|
||||
"feature_cols = create_factors_with_metadata(\n",
|
||||
" engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# 3. 准备数据\n",
|
||||
"print(\"\\n[3] 准备数据\")\n",
|
||||
@@ -507,7 +547,49 @@
|
||||
" feature_cols=feature_cols,\n",
|
||||
" persist_model=PERSIST_MODEL,\n",
|
||||
")"
|
||||
]
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"================================================================================\n",
|
||||
"LightGBM LambdaRank 排序学习训练\n",
|
||||
"================================================================================\n",
|
||||
"\n",
|
||||
"[1] 创建 FactorEngine\n",
|
||||
"\n",
|
||||
"[2] 定义因子(从 metadata 注册)\n",
|
||||
"================================================================================\n",
|
||||
"注册因子\n",
|
||||
"================================================================================\n",
|
||||
"\n",
|
||||
"注册特征因子(从 metadata):\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "QueryError",
|
||||
"evalue": "查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n ",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001B[31m---------------------------------------------------------------------------\u001B[39m",
|
||||
"\u001B[31mBinderException\u001B[39m Traceback (most recent call last)",
|
||||
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:296\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 295\u001B[39m conn = \u001B[38;5;28mself\u001B[39m._get_connection()\n\u001B[32m--> \u001B[39m\u001B[32m296\u001B[39m result = \u001B[43mconn\u001B[49m\u001B[43m.\u001B[49m\u001B[43mexecute\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m.pl()\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n",
|
||||
"\u001B[31mBinderException\u001B[39m: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^",
|
||||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||||
"\u001B[31mQueryError\u001B[39m Traceback (most recent call last)",
|
||||
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[5]\u001B[39m\u001B[32m, line 11\u001B[39m\n\u001B[32m 9\u001B[39m \u001B[38;5;66;03m# 2. 使用 metadata 定义因子\u001B[39;00m\n\u001B[32m 10\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[2] 定义因子(从 metadata 注册)\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m---> \u001B[39m\u001B[32m11\u001B[39m feature_cols = \u001B[43mcreate_factors_with_metadata\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 12\u001B[39m \u001B[43m \u001B[49m\u001B[43mengine\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mSELECTED_FACTORS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mFACTOR_DEFINITIONS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mLABEL_FACTOR\u001B[49m\n\u001B[32m 13\u001B[39m \u001B[43m)\u001B[49m\n\u001B[32m 15\u001B[39m \u001B[38;5;66;03m# 3. 准备数据\u001B[39;00m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[3] 准备数据\u001B[39m\u001B[33m\"\u001B[39m)\n",
|
||||
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 15\u001B[39m, in \u001B[36mcreate_factors_with_metadata\u001B[39m\u001B[34m(engine, selected_factors, factor_definitions, label_factor)\u001B[39m\n\u001B[32m 13\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m注册特征因子(从 metadata):\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 14\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m name \u001B[38;5;129;01min\u001B[39;00m selected_factors:\n\u001B[32m---> \u001B[39m\u001B[32m15\u001B[39m \u001B[43mengine\u001B[49m\u001B[43m.\u001B[49m\u001B[43madd_factor\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m - \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 18\u001B[39m \u001B[38;5;66;03m# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\u001B[39;00m\n",
|
||||
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:225\u001B[39m, in \u001B[36mFactorEngine.add_factor\u001B[39m\u001B[34m(self, name, expression, data_specs)\u001B[39m\n\u001B[32m 182\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"注册因子(支持多种调用方式)。\u001B[39;00m\n\u001B[32m 183\u001B[39m \n\u001B[32m 184\u001B[39m \u001B[33;03m这是 register 方法的增强版,支持以下调用方式:\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 221\u001B[39m \u001B[33;03m ... .add_factor(\"golden_cross\", \"ma5 > ma10\"))\u001B[39;00m\n\u001B[32m 222\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 223\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m expression \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 224\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询表达式\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m225\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_add_factor_from_metadata\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdata_specs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 227\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(expression, \u001B[38;5;28mstr\u001B[39m):\n\u001B[32m 228\u001B[39m \u001B[38;5;66;03m# Fail-Fast:立即解析,失败立即报错\u001B[39;00m\n\u001B[32m 229\u001B[39m node = \u001B[38;5;28mself\u001B[39m._parser.parse(expression)\n",
|
||||
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:159\u001B[39m, in \u001B[36mFactorEngine._add_factor_from_metadata\u001B[39m\u001B[34m(self, name, factor_name_in_metadata, data_specs)\u001B[39m\n\u001B[32m 153\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mRuntimeError\u001B[39;00m(\n\u001B[32m 154\u001B[39m \u001B[33m\"\u001B[39m\u001B[33m引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 155\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m例如:FactorEngine(metadata_path=\u001B[39m\u001B[33m'\u001B[39m\u001B[33mdata/factors.jsonl\u001B[39m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 156\u001B[39m )\n\u001B[32m 158\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询因子\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m159\u001B[39m df = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_metadata\u001B[49m\u001B[43m.\u001B[49m\u001B[43mget_factors_by_name\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfactor_name_in_metadata\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 161\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(df) == \u001B[32m0\u001B[39m:\n\u001B[32m 162\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[32m 163\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m在 metadata 中未找到因子 \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfactor_name_in_metadata\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 164\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m请确认因子名称正确,或先使用 FactorManager 添加该因子。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 165\u001B[39m )\n",
|
||||
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:177\u001B[39m, in \u001B[36mFactorManager.get_factors_by_name\u001B[39m\u001B[34m(self, name)\u001B[39m\n\u001B[32m 154\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"根据名称查询因子。\u001B[39;00m\n\u001B[32m 155\u001B[39m \n\u001B[32m 156\u001B[39m \u001B[33;03m使用DuckDB执行SQL查询,返回Polars DataFrame。\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 170\u001B[39m \u001B[33;03m ... print(df[\"dsl\"][0])\u001B[39;00m\n\u001B[32m 171\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 172\u001B[39m sql = \u001B[33mf\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m 173\u001B[39m \u001B[33m SELECT *\u001B[39m\n\u001B[32m 174\u001B[39m \u001B[33m FROM read_json_auto(\u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mself\u001B[39m.filepath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\n\u001B[32m 175\u001B[39m \u001B[33m WHERE name = \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\n\u001B[32m 176\u001B[39m \u001B[33m\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m--> \u001B[39m\u001B[32m177\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_execute_query\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m\n",
|
||||
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:299\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n\u001B[32m 298\u001B[39m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[32m--> \u001B[39m\u001B[32m299\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m QueryError(sql, e)\n",
|
||||
"\u001B[31mQueryError\u001B[39m: 查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n "
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 5
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
|
||||
@@ -40,29 +40,40 @@ from src.training.config import TrainingConfig
|
||||
# ## 2. 辅助函数
|
||||
# %%
|
||||
def create_factors_with_metadata(
|
||||
engine: FactorEngine, factor_definitions: dict, label_factor: dict
|
||||
engine: FactorEngine,
|
||||
selected_factors: List[str],
|
||||
factor_definitions: dict,
|
||||
label_factor: dict,
|
||||
) -> List[str]:
|
||||
"""使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)"""
|
||||
"""注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)"""
|
||||
print("=" * 80)
|
||||
print("使用 metadata 注册因子")
|
||||
print("注册因子")
|
||||
print("=" * 80)
|
||||
|
||||
# 注册所有特征因子(通过 metadata 名称)
|
||||
# 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)
|
||||
print("\n注册特征因子(从 metadata):")
|
||||
for name in factor_definitions.keys():
|
||||
engine.add_factor_by_name(name)
|
||||
for name in selected_factors:
|
||||
engine.add_factor(name)
|
||||
print(f" - {name}")
|
||||
|
||||
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中)
|
||||
# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
|
||||
print("\n注册特征因子(表达式):")
|
||||
for name, expr in factor_definitions.items():
|
||||
engine.add_factor(name, expr)
|
||||
print(f" - {name}: {expr}")
|
||||
|
||||
# 注册 label 因子(通过表达式)
|
||||
print("\n注册 Label 因子(表达式):")
|
||||
for name, expr in label_factor.items():
|
||||
engine.add_factor(name, expr)
|
||||
print(f" - {name}: {expr}")
|
||||
|
||||
# 从字典自动获取特征列
|
||||
feature_cols = list(factor_definitions.keys())
|
||||
# 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
|
||||
feature_cols = selected_factors + list(factor_definitions.keys())
|
||||
|
||||
print(f"\n特征因子数: {len(feature_cols)}")
|
||||
print(f" - 来自 metadata: {len(selected_factors)}")
|
||||
print(f" - 来自表达式: {len(factor_definitions)}")
|
||||
print(f"Label: {list(label_factor.keys())[0]}")
|
||||
print(f"已注册因子总数: {len(engine.list_registered())}")
|
||||
|
||||
@@ -236,62 +247,68 @@ def evaluate_ndcg_at_k(
|
||||
# 特征因子定义字典(复用 regression.ipynb 的因子定义)
|
||||
LABEL_NAME = "future_return_5_rank"
|
||||
|
||||
FACTOR_DEFINITIONS = {
|
||||
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
|
||||
"ma_5": "ts_mean(close, 5)",
|
||||
"ma_20": "ts_mean(close, 20)",
|
||||
"ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1",
|
||||
"bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1",
|
||||
"high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)",
|
||||
"bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)",
|
||||
"return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1",
|
||||
"return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1",
|
||||
"kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)",
|
||||
"mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)",
|
||||
"drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1",
|
||||
"up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20",
|
||||
# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)
|
||||
SELECTED_FACTORS = [
|
||||
# ================= 1. 价格、趋势与路径依赖 =================
|
||||
"ma_5",
|
||||
"ma_20",
|
||||
"ma_ratio_5_20",
|
||||
"bias_10",
|
||||
"high_low_ratio",
|
||||
"bbi_ratio",
|
||||
"return_5",
|
||||
"return_20",
|
||||
"kaufman_ER_20",
|
||||
"mom_acceleration_10_20",
|
||||
"drawdown_from_high_60",
|
||||
"up_days_ratio_20",
|
||||
# ================= 2. 波动率、风险调整与高阶矩 =================
|
||||
"volatility_5": "ts_std(close, 5)",
|
||||
"volatility_20": "ts_std(close, 20)",
|
||||
"volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)",
|
||||
"std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)",
|
||||
"sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)",
|
||||
"min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
|
||||
"volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)",
|
||||
"volatility_5",
|
||||
"volatility_20",
|
||||
"volatility_ratio",
|
||||
"std_return_20",
|
||||
"sharpe_ratio_20",
|
||||
"min_ret_20",
|
||||
"volatility_squeeze_5_60",
|
||||
# ================= 3. 日内微观结构与异象 =================
|
||||
"overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)",
|
||||
"upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)",
|
||||
"capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)",
|
||||
"max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)",
|
||||
"overnight_intraday_diff",
|
||||
"upper_shadow_ratio",
|
||||
"capital_retention_20",
|
||||
"max_ret_20",
|
||||
# ================= 4. 量能、流动性与量价背离 =================
|
||||
"volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)",
|
||||
"turnover_rate_mean_5": "ts_mean(turnover_rate, 5)",
|
||||
"turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)",
|
||||
"amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)",
|
||||
"turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)",
|
||||
"pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)",
|
||||
"close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1",
|
||||
"volume_ratio_5_20",
|
||||
"turnover_rate_mean_5",
|
||||
"turnover_deviation",
|
||||
"amihud_illiq_20",
|
||||
"turnover_cv_20",
|
||||
"pv_corr_20",
|
||||
"close_vwap_deviation",
|
||||
# ================= 5. 基本面财务特征 =================
|
||||
"roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)",
|
||||
"roa": "n_income / (total_assets + 1e-8)",
|
||||
"profit_margin": "n_income / (revenue + 1e-8)",
|
||||
"debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)",
|
||||
"current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)",
|
||||
"net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1",
|
||||
"revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1",
|
||||
"healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)",
|
||||
"roe",
|
||||
"roa",
|
||||
"profit_margin",
|
||||
"debt_to_equity",
|
||||
"current_ratio",
|
||||
"net_profit_yoy",
|
||||
"revenue_yoy",
|
||||
"healthy_expansion_velocity",
|
||||
# ================= 6. 基本面估值与截面动量共振 =================
|
||||
"EP": "n_income / (total_mv * 10000 + 1e-8)",
|
||||
"BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)",
|
||||
"CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)",
|
||||
"market_cap_rank": "cs_rank(total_mv)",
|
||||
"turnover_rank": "cs_rank(turnover_rate)",
|
||||
"return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)",
|
||||
"EP_rank": "cs_rank(n_income / (total_mv + 1e-8))",
|
||||
"pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1",
|
||||
"value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))",
|
||||
"active_market_cap": "total_mv * ts_mean(turnover_rate, 20)",
|
||||
"ebit_rank": "cs_rank(ebit)",
|
||||
"EP",
|
||||
"BP",
|
||||
"CP",
|
||||
"market_cap_rank",
|
||||
"turnover_rank",
|
||||
"return_5_rank",
|
||||
"EP_rank",
|
||||
"pe_expansion_trend",
|
||||
"value_price_divergence",
|
||||
"active_market_cap",
|
||||
"ebit_rank",
|
||||
]
|
||||
|
||||
# 因子定义字典(完整因子库)
|
||||
FACTOR_DEFINITIONS = {
|
||||
# "turnover_volatility_ratio": "log(ts_std(turnover_rate, 20))"
|
||||
}
|
||||
|
||||
# Label 因子定义(不参与训练,用于计算目标)
|
||||
@@ -332,7 +349,7 @@ MODEL_PARAMS = {
|
||||
N_QUANTILES = 20 # 将 label 分为 20 组
|
||||
|
||||
# 特征列(用于数据处理器)
|
||||
FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())
|
||||
FEATURE_COLS = SELECTED_FACTORS
|
||||
|
||||
# 数据处理器配置
|
||||
PROCESSORS = [
|
||||
@@ -385,11 +402,13 @@ print("=" * 80)
|
||||
|
||||
# 1. 创建 FactorEngine(启用 metadata 功能)
|
||||
print("\n[1] 创建 FactorEngine")
|
||||
engine = FactorEngine(metadata_path="data/factors.jsonl")
|
||||
engine = FactorEngine()
|
||||
|
||||
# 2. 使用 metadata 定义因子
|
||||
print("\n[2] 定义因子(从 metadata 注册)")
|
||||
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
|
||||
feature_cols = create_factors_with_metadata(
|
||||
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
|
||||
)
|
||||
|
||||
# 3. 准备数据
|
||||
print("\n[3] 准备数据")
|
||||
|
||||
@@ -47,29 +47,40 @@
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"def create_factors_with_metadata(\n",
|
||||
" engine: FactorEngine, factor_definitions: dict, label_factor: dict\n",
|
||||
" engine: FactorEngine,\n",
|
||||
" selected_factors: List[str],\n",
|
||||
" factor_definitions: dict,\n",
|
||||
" label_factor: dict,\n",
|
||||
") -> List[str]:\n",
|
||||
" \"\"\"使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)\"\"\"\n",
|
||||
" \"\"\"注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n",
|
||||
" print(\"=\" * 80)\n",
|
||||
" print(\"使用 metadata 注册因子\")\n",
|
||||
" print(\"注册因子\")\n",
|
||||
" print(\"=\" * 80)\n",
|
||||
"\n",
|
||||
" # 注册所有特征因子(通过 metadata 名称)\n",
|
||||
" # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)\n",
|
||||
" print(\"\\n注册特征因子(从 metadata):\")\n",
|
||||
" for name in factor_definitions.keys():\n",
|
||||
" engine.add_factor_by_name(name)\n",
|
||||
" for name in selected_factors:\n",
|
||||
" engine.add_factor(name)\n",
|
||||
" print(f\" - {name}\")\n",
|
||||
"\n",
|
||||
" # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n",
|
||||
" # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n",
|
||||
" print(\"\\n注册特征因子(表达式):\")\n",
|
||||
" for name, expr in factor_definitions.items():\n",
|
||||
" engine.add_factor(name, expr)\n",
|
||||
" print(f\" - {name}: {expr}\")\n",
|
||||
"\n",
|
||||
" # 注册 label 因子(通过表达式)\n",
|
||||
" print(\"\\n注册 Label 因子(表达式):\")\n",
|
||||
" for name, expr in label_factor.items():\n",
|
||||
" engine.add_factor(name, expr)\n",
|
||||
" print(f\" - {name}: {expr}\")\n",
|
||||
"\n",
|
||||
" # 从字典自动获取特征列\n",
|
||||
" feature_cols = list(factor_definitions.keys())\n",
|
||||
" # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n",
|
||||
" feature_cols = selected_factors + list(factor_definitions.keys())\n",
|
||||
"\n",
|
||||
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
|
||||
" print(f\" - 来自 metadata: {len(selected_factors)}\")\n",
|
||||
" print(f\" - 来自表达式: {len(factor_definitions)}\")\n",
|
||||
" print(f\"Label: {list(label_factor.keys())[0]}\")\n",
|
||||
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
|
||||
"\n",
|
||||
@@ -123,7 +134,67 @@
|
||||
"# 特征因子定义字典:新增因子只需在此处添加一行\n",
|
||||
"LABEL_NAME = \"future_return_5\"\n",
|
||||
"\n",
|
||||
"FACTOR_DEFINITIONS = FACTOR_DICT = {\n",
|
||||
"# Currently selected factors: each name is registered via engine.add_factor(name), i.e. looked up in the factor metadata\n",
|
||||
"SELECTED_FACTORS = [\n",
|
||||
" # ================= 1. 价格、趋势与路径依赖 =================\n",
|
||||
" \"ma_5\",\n",
|
||||
" \"ma_20\",\n",
|
||||
" \"ma_ratio_5_20\",\n",
|
||||
" \"bias_10\",\n",
|
||||
" \"high_low_ratio\",\n",
|
||||
" \"bbi_ratio\",\n",
|
||||
" \"return_5\",\n",
|
||||
" \"return_20\",\n",
|
||||
" \"kaufman_ER_20\",\n",
|
||||
" \"mom_acceleration_10_20\",\n",
|
||||
" \"drawdown_from_high_60\",\n",
|
||||
" \"up_days_ratio_20\",\n",
|
||||
" # ================= 2. 波动率、风险调整与高阶矩 =================\n",
|
||||
" \"volatility_5\",\n",
|
||||
" \"volatility_20\",\n",
|
||||
" \"volatility_ratio\",\n",
|
||||
" \"std_return_20\",\n",
|
||||
" \"sharpe_ratio_20\",\n",
|
||||
" \"min_ret_20\",\n",
|
||||
" \"volatility_squeeze_5_60\",\n",
|
||||
" # ================= 3. 日内微观结构与异象 =================\n",
|
||||
" \"overnight_intraday_diff\",\n",
|
||||
" \"upper_shadow_ratio\",\n",
|
||||
" \"capital_retention_20\",\n",
|
||||
" \"max_ret_20\",\n",
|
||||
" # ================= 4. 量能、流动性与量价背离 =================\n",
|
||||
" \"volume_ratio_5_20\",\n",
|
||||
" \"turnover_rate_mean_5\",\n",
|
||||
" \"turnover_deviation\",\n",
|
||||
" \"amihud_illiq_20\",\n",
|
||||
" \"turnover_cv_20\",\n",
|
||||
" \"pv_corr_20\",\n",
|
||||
" \"close_vwap_deviation\",\n",
|
||||
" # ================= 5. 基本面财务特征 =================\n",
|
||||
" \"roe\",\n",
|
||||
" \"roa\",\n",
|
||||
" \"profit_margin\",\n",
|
||||
" \"debt_to_equity\",\n",
|
||||
" \"current_ratio\",\n",
|
||||
" \"net_profit_yoy\",\n",
|
||||
" \"revenue_yoy\",\n",
|
||||
" \"healthy_expansion_velocity\",\n",
|
||||
" # ================= 6. 基本面估值与截面动量共振 =================\n",
|
||||
" \"EP\",\n",
|
||||
" \"BP\",\n",
|
||||
" \"CP\",\n",
|
||||
" \"market_cap_rank\",\n",
|
||||
" \"turnover_rank\",\n",
|
||||
" \"return_5_rank\",\n",
|
||||
" \"EP_rank\",\n",
|
||||
" \"pe_expansion_trend\",\n",
|
||||
" \"value_price_divergence\",\n",
|
||||
" \"active_market_cap\",\n",
|
||||
" \"ebit_rank\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# 因子定义字典(完整因子库)\n",
|
||||
"FACTOR_DEFINITIONS = {\n",
|
||||
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n",
|
||||
" \"ma_5\": \"ts_mean(close, 5)\",\n",
|
||||
" \"ma_20\": \"ts_mean(close, 20)\",\n",
|
||||
@@ -338,7 +409,9 @@
|
||||
"\n",
|
||||
"# 2. 使用 metadata 定义因子\n",
|
||||
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
|
||||
"feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n",
|
||||
"feature_cols = create_factors_with_metadata(\n",
|
||||
" engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n",
|
||||
")\n",
|
||||
"target_col = LABEL_NAME\n",
|
||||
"\n",
|
||||
"# 3. 准备数据(使用模块级别的日期配置)\n",
|
||||
|
||||
@@ -26,29 +26,40 @@ from src.training.config import TrainingConfig
|
||||
# ## 2. 定义辅助函数
|
||||
# %%
|
||||
def create_factors_with_metadata(
|
||||
engine: FactorEngine, factor_definitions: dict, label_factor: dict
|
||||
engine: FactorEngine,
|
||||
selected_factors: List[str],
|
||||
factor_definitions: dict,
|
||||
label_factor: dict,
|
||||
) -> List[str]:
|
||||
"""使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)"""
|
||||
"""注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)"""
|
||||
print("=" * 80)
|
||||
print("使用 metadata 注册因子")
|
||||
print("注册因子")
|
||||
print("=" * 80)
|
||||
|
||||
# 注册所有特征因子(通过 metadata 名称)
|
||||
# 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)
|
||||
print("\n注册特征因子(从 metadata):")
|
||||
for name in factor_definitions.keys():
|
||||
engine.add_factor_by_name(name)
|
||||
for name in selected_factors:
|
||||
engine.add_factor(name)
|
||||
print(f" - {name}")
|
||||
|
||||
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中)
|
||||
# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
|
||||
print("\n注册特征因子(表达式):")
|
||||
for name, expr in factor_definitions.items():
|
||||
engine.add_factor(name, expr)
|
||||
print(f" - {name}: {expr}")
|
||||
|
||||
# 注册 label 因子(通过表达式)
|
||||
print("\n注册 Label 因子(表达式):")
|
||||
for name, expr in label_factor.items():
|
||||
engine.add_factor(name, expr)
|
||||
print(f" - {name}: {expr}")
|
||||
|
||||
# 从字典自动获取特征列
|
||||
feature_cols = list(factor_definitions.keys())
|
||||
# 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
|
||||
feature_cols = selected_factors + list(factor_definitions.keys())
|
||||
|
||||
print(f"\n特征因子数: {len(feature_cols)}")
|
||||
print(f" - 来自 metadata: {len(selected_factors)}")
|
||||
print(f" - 来自表达式: {len(factor_definitions)}")
|
||||
print(f"Label: {list(label_factor.keys())[0]}")
|
||||
print(f"已注册因子总数: {len(engine.list_registered())}")
|
||||
|
||||
@@ -91,7 +102,67 @@ def prepare_data(
|
||||
# 特征因子定义字典:新增因子只需在此处添加一行
|
||||
LABEL_NAME = "future_return_5"
|
||||
|
||||
FACTOR_DEFINITIONS = FACTOR_DICT = {
|
||||
# Currently selected factors: each name is registered via engine.add_factor(name), i.e. looked up in the factor metadata
|
||||
SELECTED_FACTORS = [
|
||||
# ================= 1. 价格、趋势与路径依赖 =================
|
||||
"ma_5",
|
||||
"ma_20",
|
||||
"ma_ratio_5_20",
|
||||
"bias_10",
|
||||
"high_low_ratio",
|
||||
"bbi_ratio",
|
||||
"return_5",
|
||||
"return_20",
|
||||
"kaufman_ER_20",
|
||||
"mom_acceleration_10_20",
|
||||
"drawdown_from_high_60",
|
||||
"up_days_ratio_20",
|
||||
# ================= 2. 波动率、风险调整与高阶矩 =================
|
||||
"volatility_5",
|
||||
"volatility_20",
|
||||
"volatility_ratio",
|
||||
"std_return_20",
|
||||
"sharpe_ratio_20",
|
||||
"min_ret_20",
|
||||
"volatility_squeeze_5_60",
|
||||
# ================= 3. 日内微观结构与异象 =================
|
||||
"overnight_intraday_diff",
|
||||
"upper_shadow_ratio",
|
||||
"capital_retention_20",
|
||||
"max_ret_20",
|
||||
# ================= 4. 量能、流动性与量价背离 =================
|
||||
"volume_ratio_5_20",
|
||||
"turnover_rate_mean_5",
|
||||
"turnover_deviation",
|
||||
"amihud_illiq_20",
|
||||
"turnover_cv_20",
|
||||
"pv_corr_20",
|
||||
"close_vwap_deviation",
|
||||
# ================= 5. 基本面财务特征 =================
|
||||
"roe",
|
||||
"roa",
|
||||
"profit_margin",
|
||||
"debt_to_equity",
|
||||
"current_ratio",
|
||||
"net_profit_yoy",
|
||||
"revenue_yoy",
|
||||
"healthy_expansion_velocity",
|
||||
# ================= 6. 基本面估值与截面动量共振 =================
|
||||
"EP",
|
||||
"BP",
|
||||
"CP",
|
||||
"market_cap_rank",
|
||||
"turnover_rank",
|
||||
"return_5_rank",
|
||||
"EP_rank",
|
||||
"pe_expansion_trend",
|
||||
"value_price_divergence",
|
||||
"active_market_cap",
|
||||
"ebit_rank",
|
||||
]
|
||||
|
||||
# 因子定义字典(完整因子库)
|
||||
FACTOR_DEFINITIONS = {
|
||||
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
|
||||
"ma_5": "ts_mean(close, 5)",
|
||||
"ma_20": "ts_mean(close, 20)",
|
||||
@@ -284,7 +355,9 @@ engine = FactorEngine(metadata_path="data/factors.jsonl")
|
||||
|
||||
# 2. 使用 metadata 定义因子
|
||||
print("\n[2] 定义因子(从 metadata 注册)")
|
||||
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)
|
||||
feature_cols = create_factors_with_metadata(
|
||||
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
|
||||
)
|
||||
target_col = LABEL_NAME
|
||||
|
||||
# 3. 准备数据(使用模块级别的日期配置)
|
||||
|
||||
Reference in New Issue
Block a user