refactor(factors): 简化 add_factor API 并默认启用 metadata

- 合并 add_factor_by_name 到 add_factor,支持三种调用方式
- FactorManager 构造函数改为可选参数,使用默认路径
- FactorEngine 默认启用 metadata,无需手动配置路径
This commit is contained in:
2026-03-12 22:34:25 +08:00
parent 2bb7718dd1
commit ced7a929c3
7 changed files with 496 additions and 254 deletions

View File

View File

@@ -22,10 +22,13 @@
"source": "## 1. 导入依赖" "source": "## 1. 导入依赖"
}, },
{ {
"metadata": {}, "metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:49.975545Z",
"start_time": "2026-03-11T16:02:48.487347Z"
}
},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [ "source": [
"import os\n", "import os\n",
"from datetime import datetime\n", "from datetime import datetime\n",
@@ -50,7 +53,9 @@
"from src.training.components.models import LightGBMLambdaRankModel\n", "from src.training.components.models import LightGBMLambdaRankModel\n",
"from src.training.config import TrainingConfig\n", "from src.training.config import TrainingConfig\n",
"\n" "\n"
] ],
"outputs": [],
"execution_count": 1
}, },
{ {
"metadata": {}, "metadata": {},
@@ -58,35 +63,49 @@
"source": "## 2. 辅助函数" "source": "## 2. 辅助函数"
}, },
{ {
"metadata": {}, "metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:49.989220Z",
"start_time": "2026-03-11T16:02:49.981542Z"
}
},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [ "source": [
"def create_factors_with_metadata(\n", "def create_factors_with_metadata(\n",
" engine: FactorEngine, factor_definitions: dict, label_factor: dict\n", " engine: FactorEngine,\n",
" selected_factors: List[str],\n",
" factor_definitions: dict,\n",
" label_factor: dict,\n",
") -> List[str]:\n", ") -> List[str]:\n",
" \"\"\"使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)\"\"\"\n", " \"\"\"注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n",
" print(\"=\" * 80)\n", " print(\"=\" * 80)\n",
" print(\"使用 metadata 注册因子\")\n", " print(\"注册因子\")\n",
" print(\"=\" * 80)\n", " print(\"=\" * 80)\n",
"\n", "\n",
" # 注册所有特征因子(通过 metadata 名称)\n", " # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)\n",
" print(\"\\n注册特征因子从 metadata:\")\n", " print(\"\\n注册特征因子从 metadata:\")\n",
" for name in factor_definitions.keys():\n", " for name in selected_factors:\n",
" engine.add_factor_by_name(name)\n", " engine.add_factor(name)\n",
" print(f\" - {name}\")\n", " print(f\" - {name}\")\n",
"\n", "\n",
" # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n", " # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n",
" print(\"\\n注册特征因子表达式:\")\n",
" for name, expr in factor_definitions.items():\n",
" engine.add_factor(name, expr)\n",
" print(f\" - {name}: {expr}\")\n",
"\n",
" # 注册 label 因子(通过表达式)\n",
" print(\"\\n注册 Label 因子(表达式):\")\n", " print(\"\\n注册 Label 因子(表达式):\")\n",
" for name, expr in label_factor.items():\n", " for name, expr in label_factor.items():\n",
" engine.add_factor(name, expr)\n", " engine.add_factor(name, expr)\n",
" print(f\" - {name}: {expr}\")\n", " print(f\" - {name}: {expr}\")\n",
"\n", "\n",
" # 从字典自动获取特征列\n", " # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n",
" feature_cols = list(factor_definitions.keys())\n", " feature_cols = selected_factors + list(factor_definitions.keys())\n",
"\n", "\n",
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n", " print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
" print(f\" - 来自 metadata: {len(selected_factors)}\")\n",
" print(f\" - 来自表达式: {len(factor_definitions)}\")\n",
" print(f\"Label: {list(label_factor.keys())[0]}\")\n", " print(f\"Label: {list(label_factor.keys())[0]}\")\n",
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n", " print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
"\n", "\n",
@@ -251,7 +270,9 @@
"\n", "\n",
" return results\n", " return results\n",
"\n" "\n"
] ],
"outputs": [],
"execution_count": 2
}, },
{ {
"metadata": {}, "metadata": {},
@@ -263,77 +284,86 @@
] ]
}, },
{ {
"metadata": {}, "metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:50.000875Z",
"start_time": "2026-03-11T16:02:49.994082Z"
}
},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [ "source": [
"# 特征因子定义字典(复用 regression.ipynb 的因子定义)\n", "# 特征因子定义字典(复用 regression.ipynb 的因子定义)\n",
"LABEL_NAME = \"future_return_5_rank\"\n", "LABEL_NAME = \"future_return_5_rank\"\n",
"\n", "\n",
"FACTOR_DEFINITIONS = {\n", "# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n",
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n", "SELECTED_FACTORS = [\n",
" \"ma_5\": \"ts_mean(close, 5)\",\n", " # ================= 1. 价格、趋势与路径依赖 =================\n",
" \"ma_20\": \"ts_mean(close, 20)\",\n", " \"ma_5\",\n",
" \"ma_ratio_5_20\": \"ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1\",\n", " \"ma_20\",\n",
" \"bias_10\": \"close / (ts_mean(close, 10) + 1e-8) - 1\",\n", " \"ma_ratio_5_20\",\n",
" \"high_low_ratio\": \"(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)\",\n", " \"bias_10\",\n",
" \"bbi_ratio\": \"(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)\",\n", " \"high_low_ratio\",\n",
" \"return_5\": \"(close / (ts_delay(close, 5) + 1e-8)) - 1\",\n", " \"bbi_ratio\",\n",
" \"return_20\": \"(close / (ts_delay(close, 20) + 1e-8)) - 1\",\n", " \"return_5\",\n",
" \"kaufman_ER_20\": \"abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)\",\n", " \"return_20\",\n",
" \"mom_acceleration_10_20\": \"(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)\",\n", " \"kaufman_ER_20\",\n",
" \"drawdown_from_high_60\": \"close / (ts_max(high, 60) + 1e-8) - 1\",\n", " \"mom_acceleration_10_20\",\n",
" \"up_days_ratio_20\": \"ts_sum(close > ts_delay(close, 1), 20) / 20\",\n", " \"drawdown_from_high_60\",\n",
" \"up_days_ratio_20\",\n",
" # ================= 2. 波动率、风险调整与高阶矩 =================\n", " # ================= 2. 波动率、风险调整与高阶矩 =================\n",
" \"volatility_5\": \"ts_std(close, 5)\",\n", " \"volatility_5\",\n",
" \"volatility_20\": \"ts_std(close, 20)\",\n", " \"volatility_20\",\n",
" \"volatility_ratio\": \"ts_std(close, 5) / (ts_std(close, 20) + 1e-8)\",\n", " \"volatility_ratio\",\n",
" \"std_return_20\": \"ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)\",\n", " \"std_return_20\",\n",
" \"sharpe_ratio_20\": \"ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)\",\n", " \"sharpe_ratio_20\",\n",
" \"min_ret_20\": \"ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n", " \"min_ret_20\",\n",
" \"volatility_squeeze_5_60\": \"ts_std(close, 5) / (ts_std(close, 60) + 1e-8)\",\n", " \"volatility_squeeze_5_60\",\n",
" # ================= 3. 日内微观结构与异象 =================\n", " # ================= 3. 日内微观结构与异象 =================\n",
" \"overnight_intraday_diff\": \"(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)\",\n", " \"overnight_intraday_diff\",\n",
" \"upper_shadow_ratio\": \"(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)\",\n", " \"upper_shadow_ratio\",\n",
" \"capital_retention_20\": \"ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)\",\n", " \"capital_retention_20\",\n",
" \"max_ret_20\": \"ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n", " \"max_ret_20\",\n",
" # ================= 4. 量能、流动性与量价背离 =================\n", " # ================= 4. 量能、流动性与量价背离 =================\n",
" \"volume_ratio_5_20\": \"ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)\",\n", " \"volume_ratio_5_20\",\n",
" \"turnover_rate_mean_5\": \"ts_mean(turnover_rate, 5)\",\n", " \"turnover_rate_mean_5\",\n",
" \"turnover_deviation\": \"(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)\",\n", " \"turnover_deviation\",\n",
" \"amihud_illiq_20\": \"ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)\",\n", " \"amihud_illiq_20\",\n",
" \"turnover_cv_20\": \"ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)\",\n", " \"turnover_cv_20\",\n",
" \"pv_corr_20\": \"ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)\",\n", " \"pv_corr_20\",\n",
" \"close_vwap_deviation\": \"close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1\",\n", " \"close_vwap_deviation\",\n",
" # ================= 5. 基本面财务特征 =================\n", " # ================= 5. 基本面财务特征 =================\n",
" \"roe\": \"n_income / (total_hldr_eqy_exc_min_int + 1e-8)\",\n", " \"roe\",\n",
" \"roa\": \"n_income / (total_assets + 1e-8)\",\n", " \"roa\",\n",
" \"profit_margin\": \"n_income / (revenue + 1e-8)\",\n", " \"profit_margin\",\n",
" \"debt_to_equity\": \"total_liab / (total_hldr_eqy_exc_min_int + 1e-8)\",\n", " \"debt_to_equity\",\n",
" \"current_ratio\": \"total_cur_assets / (total_cur_liab + 1e-8)\",\n", " \"current_ratio\",\n",
" \"net_profit_yoy\": \"(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1\",\n", " \"net_profit_yoy\",\n",
" \"revenue_yoy\": \"(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1\",\n", " \"revenue_yoy\",\n",
" \"healthy_expansion_velocity\": \"(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)\",\n", " \"healthy_expansion_velocity\",\n",
" # ================= 6. 基本面估值与截面动量共振 =================\n", " # ================= 6. 基本面估值与截面动量共振 =================\n",
" \"EP\": \"n_income / (total_mv * 10000 + 1e-8)\",\n", " \"EP\",\n",
" \"BP\": \"total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)\",\n", " \"BP\",\n",
" \"CP\": \"n_cashflow_act / (total_mv * 10000 + 1e-8)\",\n", " \"CP\",\n",
" \"market_cap_rank\": \"cs_rank(total_mv)\",\n", " \"market_cap_rank\",\n",
" \"turnover_rank\": \"cs_rank(turnover_rate)\",\n", " \"turnover_rank\",\n",
" \"return_5_rank\": \"cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)\",\n", " \"return_5_rank\",\n",
" \"EP_rank\": \"cs_rank(n_income / (total_mv + 1e-8))\",\n", " \"EP_rank\",\n",
" \"pe_expansion_trend\": \"(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1\",\n", " \"pe_expansion_trend\",\n",
" \"value_price_divergence\": \"cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))\",\n", " \"value_price_divergence\",\n",
" \"active_market_cap\": \"total_mv * ts_mean(turnover_rate, 20)\",\n", " \"active_market_cap\",\n",
" \"ebit_rank\": \"cs_rank(ebit)\",\n", " \"ebit_rank\",\n",
"}\n", "]\n",
"\n",
"# 因子定义字典(完整因子库)\n",
"FACTOR_DEFINITIONS = {\"turnover_volatility_ratio\": \"log(ts_std(turnover_rate, 20))\"}\n",
"\n", "\n",
"# Label 因子定义(不参与训练,用于计算目标)\n", "# Label 因子定义(不参与训练,用于计算目标)\n",
"LABEL_FACTOR = {\n", "LABEL_FACTOR = {\n",
" LABEL_NAME: \"(ts_delay(close, -5) / ts_delay(open, -1)) - 1\",\n", " LABEL_NAME: \"(ts_delay(close, -5) / ts_delay(open, -1)) - 1\",\n",
"}" "}"
] ],
"outputs": [],
"execution_count": 3
}, },
{ {
"metadata": {}, "metadata": {},
@@ -341,10 +371,13 @@
"source": "### 3.2 训练参数配置" "source": "### 3.2 训练参数配置"
}, },
{ {
"metadata": {}, "metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:50.009081Z",
"start_time": "2026-03-11T16:02:50.005330Z"
}
},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [ "source": [
"# 日期范围配置(正确的 train/val/test 三分法)\n", "# 日期范围配置(正确的 train/val/test 三分法)\n",
"TRAIN_START = \"20200101\"\n", "TRAIN_START = \"20200101\"\n",
@@ -377,7 +410,7 @@
"N_QUANTILES = 20 # 将 label 分为 20 组\n", "N_QUANTILES = 20 # 将 label 分为 20 组\n",
"\n", "\n",
"# 特征列(用于数据处理器)\n", "# 特征列(用于数据处理器)\n",
"FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())\n", "FEATURE_COLS = SELECTED_FACTORS\n",
"\n", "\n",
"# 数据处理器配置\n", "# 数据处理器配置\n",
"PROCESSORS = [\n", "PROCESSORS = [\n",
@@ -421,7 +454,9 @@
"\n", "\n",
"# Top N 配置:每日推荐股票数量\n", "# Top N 配置:每日推荐股票数量\n",
"TOP_N = 5 # 可调整为 10, 20 等" "TOP_N = 5 # 可调整为 10, 20 等"
] ],
"outputs": [],
"execution_count": 4
}, },
{ {
"metadata": {}, "metadata": {},
@@ -429,10 +464,13 @@
"source": "## 4. 训练流程" "source": "## 4. 训练流程"
}, },
{ {
"metadata": {}, "metadata": {
"ExecuteTime": {
"end_time": "2026-03-11T16:02:50.330018Z",
"start_time": "2026-03-11T16:02:50.012964Z"
}
},
"cell_type": "code", "cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [ "source": [
"print(\"\\n\" + \"=\" * 80)\n", "print(\"\\n\" + \"=\" * 80)\n",
"print(\"LightGBM LambdaRank 排序学习训练\")\n", "print(\"LightGBM LambdaRank 排序学习训练\")\n",
@@ -444,7 +482,9 @@
"\n", "\n",
"# 2. 使用 metadata 定义因子\n", "# 2. 使用 metadata 定义因子\n",
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n", "print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
"feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n", "feature_cols = create_factors_with_metadata(\n",
" engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n",
")\n",
"\n", "\n",
"# 3. 准备数据\n", "# 3. 准备数据\n",
"print(\"\\n[3] 准备数据\")\n", "print(\"\\n[3] 准备数据\")\n",
@@ -507,7 +547,49 @@
" feature_cols=feature_cols,\n", " feature_cols=feature_cols,\n",
" persist_model=PERSIST_MODEL,\n", " persist_model=PERSIST_MODEL,\n",
")" ")"
] ],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"================================================================================\n",
"LightGBM LambdaRank 排序学习训练\n",
"================================================================================\n",
"\n",
"[1] 创建 FactorEngine\n",
"\n",
"[2] 定义因子(从 metadata 注册)\n",
"================================================================================\n",
"注册因子\n",
"================================================================================\n",
"\n",
"注册特征因子(从 metadata:\n"
]
},
{
"ename": "QueryError",
"evalue": "查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n ",
"output_type": "error",
"traceback": [
"\u001B[31m---------------------------------------------------------------------------\u001B[39m",
"\u001B[31mBinderException\u001B[39m Traceback (most recent call last)",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:296\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 295\u001B[39m conn = \u001B[38;5;28mself\u001B[39m._get_connection()\n\u001B[32m--> \u001B[39m\u001B[32m296\u001B[39m result = \u001B[43mconn\u001B[49m\u001B[43m.\u001B[49m\u001B[43mexecute\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m.pl()\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n",
"\u001B[31mBinderException\u001B[39m: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001B[31mQueryError\u001B[39m Traceback (most recent call last)",
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[5]\u001B[39m\u001B[32m, line 11\u001B[39m\n\u001B[32m 9\u001B[39m \u001B[38;5;66;03m# 2. 使用 metadata 定义因子\u001B[39;00m\n\u001B[32m 10\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[2] 定义因子(从 metadata 注册)\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m---> \u001B[39m\u001B[32m11\u001B[39m feature_cols = \u001B[43mcreate_factors_with_metadata\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 12\u001B[39m \u001B[43m \u001B[49m\u001B[43mengine\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mSELECTED_FACTORS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mFACTOR_DEFINITIONS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mLABEL_FACTOR\u001B[49m\n\u001B[32m 13\u001B[39m \u001B[43m)\u001B[49m\n\u001B[32m 15\u001B[39m \u001B[38;5;66;03m# 3. 准备数据\u001B[39;00m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[3] 准备数据\u001B[39m\u001B[33m\"\u001B[39m)\n",
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 15\u001B[39m, in \u001B[36mcreate_factors_with_metadata\u001B[39m\u001B[34m(engine, selected_factors, factor_definitions, label_factor)\u001B[39m\n\u001B[32m 13\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m注册特征因子从 metadata:\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 14\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m name \u001B[38;5;129;01min\u001B[39;00m selected_factors:\n\u001B[32m---> \u001B[39m\u001B[32m15\u001B[39m \u001B[43mengine\u001B[49m\u001B[43m.\u001B[49m\u001B[43madd_factor\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m - \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 18\u001B[39m \u001B[38;5;66;03m# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\u001B[39;00m\n",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:225\u001B[39m, in \u001B[36mFactorEngine.add_factor\u001B[39m\u001B[34m(self, name, expression, data_specs)\u001B[39m\n\u001B[32m 182\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"注册因子(支持多种调用方式)。\u001B[39;00m\n\u001B[32m 183\u001B[39m \n\u001B[32m 184\u001B[39m \u001B[33;03m这是 register 方法的增强版,支持以下调用方式:\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 221\u001B[39m \u001B[33;03m ... .add_factor(\"golden_cross\", \"ma5 > ma10\"))\u001B[39;00m\n\u001B[32m 222\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 223\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m expression \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 224\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询表达式\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m225\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_add_factor_from_metadata\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdata_specs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 227\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(expression, \u001B[38;5;28mstr\u001B[39m):\n\u001B[32m 228\u001B[39m \u001B[38;5;66;03m# Fail-Fast立即解析失败立即报错\u001B[39;00m\n\u001B[32m 229\u001B[39m node = \u001B[38;5;28mself\u001B[39m._parser.parse(expression)\n",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:159\u001B[39m, in \u001B[36mFactorEngine._add_factor_from_metadata\u001B[39m\u001B[34m(self, name, factor_name_in_metadata, data_specs)\u001B[39m\n\u001B[32m 153\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mRuntimeError\u001B[39;00m(\n\u001B[32m 154\u001B[39m \u001B[33m\"\u001B[39m\u001B[33m引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 155\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m例如FactorEngine(metadata_path=\u001B[39m\u001B[33m'\u001B[39m\u001B[33mdata/factors.jsonl\u001B[39m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 156\u001B[39m )\n\u001B[32m 158\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询因子\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m159\u001B[39m df = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_metadata\u001B[49m\u001B[43m.\u001B[49m\u001B[43mget_factors_by_name\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfactor_name_in_metadata\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 161\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(df) == \u001B[32m0\u001B[39m:\n\u001B[32m 162\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[32m 163\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m在 metadata 中未找到因子 \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfactor_name_in_metadata\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 164\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m请确认因子名称正确或先使用 FactorManager 添加该因子。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 165\u001B[39m )\n",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:177\u001B[39m, in \u001B[36mFactorManager.get_factors_by_name\u001B[39m\u001B[34m(self, name)\u001B[39m\n\u001B[32m 154\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"根据名称查询因子。\u001B[39;00m\n\u001B[32m 155\u001B[39m \n\u001B[32m 156\u001B[39m \u001B[33;03m使用DuckDB执行SQL查询返回Polars DataFrame。\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 170\u001B[39m \u001B[33;03m ... print(df[\"dsl\"][0])\u001B[39;00m\n\u001B[32m 171\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 172\u001B[39m sql = \u001B[33mf\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m 173\u001B[39m \u001B[33m SELECT *\u001B[39m\n\u001B[32m 174\u001B[39m \u001B[33m FROM read_json_auto(\u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mself\u001B[39m.filepath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\n\u001B[32m 175\u001B[39m \u001B[33m WHERE name = \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\n\u001B[32m 176\u001B[39m \u001B[33m\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m--> \u001B[39m\u001B[32m177\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_execute_query\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m\n",
"\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:299\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n\u001B[32m 298\u001B[39m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[32m--> \u001B[39m\u001B[32m299\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m QueryError(sql, e)\n",
"\u001B[31mQueryError\u001B[39m: 查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n "
]
}
],
"execution_count": 5
}, },
{ {
"metadata": {}, "metadata": {},

View File

@@ -40,29 +40,40 @@ from src.training.config import TrainingConfig
# ## 2. 辅助函数 # ## 2. 辅助函数
# %% # %%
def create_factors_with_metadata( def create_factors_with_metadata(
engine: FactorEngine, factor_definitions: dict, label_factor: dict engine: FactorEngine,
selected_factors: List[str],
factor_definitions: dict,
label_factor: dict,
) -> List[str]: ) -> List[str]:
"""使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)""" """注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)"""
print("=" * 80) print("=" * 80)
print("使用 metadata 注册因子") print("注册因子")
print("=" * 80) print("=" * 80)
# 注册所有特征因子(通过 metadata 名称) # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)
print("\n注册特征因子(从 metadata:") print("\n注册特征因子(从 metadata:")
for name in factor_definitions.keys(): for name in selected_factors:
engine.add_factor_by_name(name) engine.add_factor(name)
print(f" - {name}") print(f" - {name}")
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中) # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
print("\n注册特征因子(表达式):")
for name, expr in factor_definitions.items():
engine.add_factor(name, expr)
print(f" - {name}: {expr}")
# 注册 label 因子(通过表达式)
print("\n注册 Label 因子(表达式):") print("\n注册 Label 因子(表达式):")
for name, expr in label_factor.items(): for name, expr in label_factor.items():
engine.add_factor(name, expr) engine.add_factor(name, expr)
print(f" - {name}: {expr}") print(f" - {name}: {expr}")
# 从字典自动获取特征列 # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
feature_cols = list(factor_definitions.keys()) feature_cols = selected_factors + list(factor_definitions.keys())
print(f"\n特征因子数: {len(feature_cols)}") print(f"\n特征因子数: {len(feature_cols)}")
print(f" - 来自 metadata: {len(selected_factors)}")
print(f" - 来自表达式: {len(factor_definitions)}")
print(f"Label: {list(label_factor.keys())[0]}") print(f"Label: {list(label_factor.keys())[0]}")
print(f"已注册因子总数: {len(engine.list_registered())}") print(f"已注册因子总数: {len(engine.list_registered())}")
@@ -236,62 +247,68 @@ def evaluate_ndcg_at_k(
# 特征因子定义字典(复用 regression.ipynb 的因子定义) # 特征因子定义字典(复用 regression.ipynb 的因子定义)
LABEL_NAME = "future_return_5_rank" LABEL_NAME = "future_return_5_rank"
FACTOR_DEFINITIONS = { # 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) ================= SELECTED_FACTORS = [
"ma_5": "ts_mean(close, 5)", # ================= 1. 价格、趋势与路径依赖 =================
"ma_20": "ts_mean(close, 20)", "ma_5",
"ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1", "ma_20",
"bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1", "ma_ratio_5_20",
"high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)", "bias_10",
"bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)", "high_low_ratio",
"return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1", "bbi_ratio",
"return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1", "return_5",
"kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)", "return_20",
"mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)", "kaufman_ER_20",
"drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1", "mom_acceleration_10_20",
"up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20", "drawdown_from_high_60",
"up_days_ratio_20",
# ================= 2. 波动率、风险调整与高阶矩 ================= # ================= 2. 波动率、风险调整与高阶矩 =================
"volatility_5": "ts_std(close, 5)", "volatility_5",
"volatility_20": "ts_std(close, 20)", "volatility_20",
"volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)", "volatility_ratio",
"std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)", "std_return_20",
"sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)", "sharpe_ratio_20",
"min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)", "min_ret_20",
"volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)", "volatility_squeeze_5_60",
# ================= 3. 日内微观结构与异象 ================= # ================= 3. 日内微观结构与异象 =================
"overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)", "overnight_intraday_diff",
"upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)", "upper_shadow_ratio",
"capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)", "capital_retention_20",
"max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)", "max_ret_20",
# ================= 4. 量能、流动性与量价背离 ================= # ================= 4. 量能、流动性与量价背离 =================
"volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)", "volume_ratio_5_20",
"turnover_rate_mean_5": "ts_mean(turnover_rate, 5)", "turnover_rate_mean_5",
"turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)", "turnover_deviation",
"amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)", "amihud_illiq_20",
"turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)", "turnover_cv_20",
"pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)", "pv_corr_20",
"close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1", "close_vwap_deviation",
# ================= 5. 基本面财务特征 ================= # ================= 5. 基本面财务特征 =================
"roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)", "roe",
"roa": "n_income / (total_assets + 1e-8)", "roa",
"profit_margin": "n_income / (revenue + 1e-8)", "profit_margin",
"debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)", "debt_to_equity",
"current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)", "current_ratio",
"net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1", "net_profit_yoy",
"revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1", "revenue_yoy",
"healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)", "healthy_expansion_velocity",
# ================= 6. 基本面估值与截面动量共振 ================= # ================= 6. 基本面估值与截面动量共振 =================
"EP": "n_income / (total_mv * 10000 + 1e-8)", "EP",
"BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)", "BP",
"CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)", "CP",
"market_cap_rank": "cs_rank(total_mv)", "market_cap_rank",
"turnover_rank": "cs_rank(turnover_rate)", "turnover_rank",
"return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)", "return_5_rank",
"EP_rank": "cs_rank(n_income / (total_mv + 1e-8))", "EP_rank",
"pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1", "pe_expansion_trend",
"value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))", "value_price_divergence",
"active_market_cap": "total_mv * ts_mean(turnover_rate, 20)", "active_market_cap",
"ebit_rank": "cs_rank(ebit)", "ebit_rank",
]
# 因子定义字典(完整因子库)
FACTOR_DEFINITIONS = {
# "turnover_volatility_ratio": "log(ts_std(turnover_rate, 20))"
} }
# Label 因子定义(不参与训练,用于计算目标) # Label 因子定义(不参与训练,用于计算目标)
@@ -332,7 +349,7 @@ MODEL_PARAMS = {
N_QUANTILES = 20 # 将 label 分为 20 组 N_QUANTILES = 20 # 将 label 分为 20 组
# 特征列(用于数据处理器) # 特征列(用于数据处理器)
FEATURE_COLS = list(FACTOR_DEFINITIONS.keys()) FEATURE_COLS = SELECTED_FACTORS
# 数据处理器配置 # 数据处理器配置
PROCESSORS = [ PROCESSORS = [
@@ -385,11 +402,13 @@ print("=" * 80)
# 1. 创建 FactorEngine(启用 metadata 功能) # 1. 创建 FactorEngine(启用 metadata 功能)
print("\n[1] 创建 FactorEngine") print("\n[1] 创建 FactorEngine")
engine = FactorEngine(metadata_path="data/factors.jsonl") engine = FactorEngine()
# 2. 使用 metadata 定义因子 # 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)") print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR) feature_cols = create_factors_with_metadata(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
)
# 3. 准备数据 # 3. 准备数据
print("\n[3] 准备数据") print("\n[3] 准备数据")

View File

@@ -47,29 +47,40 @@
"execution_count": null, "execution_count": null,
"source": [ "source": [
"def create_factors_with_metadata(\n", "def create_factors_with_metadata(\n",
" engine: FactorEngine, factor_definitions: dict, label_factor: dict\n", " engine: FactorEngine,\n",
" selected_factors: List[str],\n",
" factor_definitions: dict,\n",
" label_factor: dict,\n",
") -> List[str]:\n", ") -> List[str]:\n",
" \"\"\"使用 metadata 注册因子特征因子通过名称注册label 因子通过表达式注册)\"\"\"\n", " \"\"\"注册因子SELECTED_FACTORS 从 metadata 查询FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n",
" print(\"=\" * 80)\n", " print(\"=\" * 80)\n",
" print(\"使用 metadata 注册因子\")\n", " print(\"注册因子\")\n",
" print(\"=\" * 80)\n", " print(\"=\" * 80)\n",
"\n", "\n",
" # 注册所有特征因子(通过 metadata 名称\n", " # 注册 SELECTED_FACTORS 中的因子(已在 metadata \n",
" print(\"\\n注册特征因子从 metadata:\")\n", " print(\"\\n注册特征因子从 metadata:\")\n",
" for name in factor_definitions.keys():\n", " for name in selected_factors:\n",
" engine.add_factor_by_name(name)\n", " engine.add_factor(name)\n",
" print(f\" - {name}\")\n", " print(f\" - {name}\")\n",
"\n", "\n",
" # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n", " # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n",
" print(\"\\n注册特征因子表达式:\")\n",
" for name, expr in factor_definitions.items():\n",
" engine.add_factor(name, expr)\n",
" print(f\" - {name}: {expr}\")\n",
"\n",
" # 注册 label 因子(通过表达式)\n",
" print(\"\\n注册 Label 因子(表达式):\")\n", " print(\"\\n注册 Label 因子(表达式):\")\n",
" for name, expr in label_factor.items():\n", " for name, expr in label_factor.items():\n",
" engine.add_factor(name, expr)\n", " engine.add_factor(name, expr)\n",
" print(f\" - {name}: {expr}\")\n", " print(f\" - {name}: {expr}\")\n",
"\n", "\n",
" # 从字典自动获取特征列\n", " # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n",
" feature_cols = list(factor_definitions.keys())\n", " feature_cols = selected_factors + list(factor_definitions.keys())\n",
"\n", "\n",
" print(f\"\\n特征因子数: {len(feature_cols)}\")\n", " print(f\"\\n特征因子数: {len(feature_cols)}\")\n",
" print(f\" - 来自 metadata: {len(selected_factors)}\")\n",
" print(f\" - 来自表达式: {len(factor_definitions)}\")\n",
" print(f\"Label: {list(label_factor.keys())[0]}\")\n", " print(f\"Label: {list(label_factor.keys())[0]}\")\n",
" print(f\"已注册因子总数: {len(engine.list_registered())}\")\n", " print(f\"已注册因子总数: {len(engine.list_registered())}\")\n",
"\n", "\n",
@@ -123,7 +134,67 @@
"# 特征因子定义字典:新增因子只需在此处添加一行\n", "# 特征因子定义字典:新增因子只需在此处添加一行\n",
"LABEL_NAME = \"future_return_5\"\n", "LABEL_NAME = \"future_return_5\"\n",
"\n", "\n",
"FACTOR_DEFINITIONS = FACTOR_DICT = {\n", "# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n",
"SELECTED_FACTORS = [\n",
" # ================= 1. 价格、趋势与路径依赖 =================\n",
" \"ma_5\",\n",
" \"ma_20\",\n",
" \"ma_ratio_5_20\",\n",
" \"bias_10\",\n",
" \"high_low_ratio\",\n",
" \"bbi_ratio\",\n",
" \"return_5\",\n",
" \"return_20\",\n",
" \"kaufman_ER_20\",\n",
" \"mom_acceleration_10_20\",\n",
" \"drawdown_from_high_60\",\n",
" \"up_days_ratio_20\",\n",
" # ================= 2. 波动率、风险调整与高阶矩 =================\n",
" \"volatility_5\",\n",
" \"volatility_20\",\n",
" \"volatility_ratio\",\n",
" \"std_return_20\",\n",
" \"sharpe_ratio_20\",\n",
" \"min_ret_20\",\n",
" \"volatility_squeeze_5_60\",\n",
" # ================= 3. 日内微观结构与异象 =================\n",
" \"overnight_intraday_diff\",\n",
" \"upper_shadow_ratio\",\n",
" \"capital_retention_20\",\n",
" \"max_ret_20\",\n",
" # ================= 4. 量能、流动性与量价背离 =================\n",
" \"volume_ratio_5_20\",\n",
" \"turnover_rate_mean_5\",\n",
" \"turnover_deviation\",\n",
" \"amihud_illiq_20\",\n",
" \"turnover_cv_20\",\n",
" \"pv_corr_20\",\n",
" \"close_vwap_deviation\",\n",
" # ================= 5. 基本面财务特征 =================\n",
" \"roe\",\n",
" \"roa\",\n",
" \"profit_margin\",\n",
" \"debt_to_equity\",\n",
" \"current_ratio\",\n",
" \"net_profit_yoy\",\n",
" \"revenue_yoy\",\n",
" \"healthy_expansion_velocity\",\n",
" # ================= 6. 基本面估值与截面动量共振 =================\n",
" \"EP\",\n",
" \"BP\",\n",
" \"CP\",\n",
" \"market_cap_rank\",\n",
" \"turnover_rank\",\n",
" \"return_5_rank\",\n",
" \"EP_rank\",\n",
" \"pe_expansion_trend\",\n",
" \"value_price_divergence\",\n",
" \"active_market_cap\",\n",
" \"ebit_rank\",\n",
"]\n",
"\n",
"# 因子定义字典(完整因子库)\n",
"FACTOR_DEFINITIONS = {\n",
" # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n", " # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n",
" \"ma_5\": \"ts_mean(close, 5)\",\n", " \"ma_5\": \"ts_mean(close, 5)\",\n",
" \"ma_20\": \"ts_mean(close, 20)\",\n", " \"ma_20\": \"ts_mean(close, 20)\",\n",
@@ -338,7 +409,9 @@
"\n", "\n",
"# 2. 使用 metadata 定义因子\n", "# 2. 使用 metadata 定义因子\n",
"print(\"\\n[2] 定义因子(从 metadata 注册)\")\n", "print(\"\\n[2] 定义因子(从 metadata 注册)\")\n",
"feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n", "feature_cols = create_factors_with_metadata(\n",
" engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n",
")\n",
"target_col = LABEL_NAME\n", "target_col = LABEL_NAME\n",
"\n", "\n",
"# 3. 准备数据(使用模块级别的日期配置)\n", "# 3. 准备数据(使用模块级别的日期配置)\n",

View File

@@ -26,29 +26,40 @@ from src.training.config import TrainingConfig
# ## 2. 定义辅助函数 # ## 2. 定义辅助函数
# %% # %%
def create_factors_with_metadata( def create_factors_with_metadata(
engine: FactorEngine, factor_definitions: dict, label_factor: dict engine: FactorEngine,
selected_factors: List[str],
factor_definitions: dict,
label_factor: dict,
) -> List[str]: ) -> List[str]:
"""使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)""" """注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)"""
print("=" * 80) print("=" * 80)
print("使用 metadata 注册因子") print("注册因子")
print("=" * 80) print("=" * 80)
# 注册所有特征因子(通过 metadata 名称) # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)
print("\n注册特征因子(从 metadata):") print("\n注册特征因子(从 metadata):")
for name in factor_definitions.keys(): for name in selected_factors:
engine.add_factor_by_name(name) engine.add_factor(name)
print(f" - {name}") print(f" - {name}")
# 注册 label 因子(通过表达式,因为 label 不在 metadata 中) # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)
print("\n注册特征因子(表达式):")
for name, expr in factor_definitions.items():
engine.add_factor(name, expr)
print(f" - {name}: {expr}")
# 注册 label 因子(通过表达式)
print("\n注册 Label 因子(表达式):") print("\n注册 Label 因子(表达式):")
for name, expr in label_factor.items(): for name, expr in label_factor.items():
engine.add_factor(name, expr) engine.add_factor(name, expr)
print(f" - {name}: {expr}") print(f" - {name}: {expr}")
# 从字典自动获取特征列 # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys
feature_cols = list(factor_definitions.keys()) feature_cols = selected_factors + list(factor_definitions.keys())
print(f"\n特征因子数: {len(feature_cols)}") print(f"\n特征因子数: {len(feature_cols)}")
print(f" - 来自 metadata: {len(selected_factors)}")
print(f" - 来自表达式: {len(factor_definitions)}")
print(f"Label: {list(label_factor.keys())[0]}") print(f"Label: {list(label_factor.keys())[0]}")
print(f"已注册因子总数: {len(engine.list_registered())}") print(f"已注册因子总数: {len(engine.list_registered())}")
@@ -91,7 +102,67 @@ def prepare_data(
# 特征因子定义字典:新增因子只需在此处添加一行 # 特征因子定义字典:新增因子只需在此处添加一行
LABEL_NAME = "future_return_5" LABEL_NAME = "future_return_5"
FACTOR_DEFINITIONS = FACTOR_DICT = { # 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)
SELECTED_FACTORS = [
# ================= 1. 价格、趋势与路径依赖 =================
"ma_5",
"ma_20",
"ma_ratio_5_20",
"bias_10",
"high_low_ratio",
"bbi_ratio",
"return_5",
"return_20",
"kaufman_ER_20",
"mom_acceleration_10_20",
"drawdown_from_high_60",
"up_days_ratio_20",
# ================= 2. 波动率、风险调整与高阶矩 =================
"volatility_5",
"volatility_20",
"volatility_ratio",
"std_return_20",
"sharpe_ratio_20",
"min_ret_20",
"volatility_squeeze_5_60",
# ================= 3. 日内微观结构与异象 =================
"overnight_intraday_diff",
"upper_shadow_ratio",
"capital_retention_20",
"max_ret_20",
# ================= 4. 量能、流动性与量价背离 =================
"volume_ratio_5_20",
"turnover_rate_mean_5",
"turnover_deviation",
"amihud_illiq_20",
"turnover_cv_20",
"pv_corr_20",
"close_vwap_deviation",
# ================= 5. 基本面财务特征 =================
"roe",
"roa",
"profit_margin",
"debt_to_equity",
"current_ratio",
"net_profit_yoy",
"revenue_yoy",
"healthy_expansion_velocity",
# ================= 6. 基本面估值与截面动量共振 =================
"EP",
"BP",
"CP",
"market_cap_rank",
"turnover_rank",
"return_5_rank",
"EP_rank",
"pe_expansion_trend",
"value_price_divergence",
"active_market_cap",
"ebit_rank",
]
# 因子定义字典(完整因子库)
FACTOR_DEFINITIONS = {
# ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) ================= # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================
"ma_5": "ts_mean(close, 5)", "ma_5": "ts_mean(close, 5)",
"ma_20": "ts_mean(close, 20)", "ma_20": "ts_mean(close, 20)",
@@ -284,7 +355,9 @@ engine = FactorEngine(metadata_path="data/factors.jsonl")
# 2. 使用 metadata 定义因子 # 2. 使用 metadata 定义因子
print("\n[2] 定义因子(从 metadata 注册)") print("\n[2] 定义因子(从 metadata 注册)")
feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR) feature_cols = create_factors_with_metadata(
engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR
)
target_col = LABEL_NAME target_col = LABEL_NAME
# 3. 准备数据(使用模块级别的日期配置) # 3. 准备数据(使用模块级别的日期配置)

View File

@@ -81,12 +81,16 @@ class FactorEngine:
self._registry = registry if registry is not None else FunctionRegistry() self._registry = registry if registry is not None else FunctionRegistry()
self._parser = FormulaParser(self._registry) self._parser = FormulaParser(self._registry)
# 初始化 metadata 管理器(可选) # 初始化 metadata 管理器(可选,默认启用)
self._metadata: Optional["FactorManager"] = None
if metadata_path is not None: if metadata_path is not None:
from src.factors.metadata import FactorManager from src.factors.metadata import FactorManager
self._metadata = FactorManager(metadata_path) self._metadata = FactorManager(metadata_path)
else:
# 使用 FactorManager 的默认路径
from src.factors.metadata import FactorManager
self._metadata = FactorManager()
def register( def register(
self, self,
@@ -128,22 +132,68 @@ class FactorEngine:
return self return self
def _add_factor_from_metadata(
self,
name: str,
factor_name_in_metadata: str,
data_specs: Optional[List[DataSpec]] = None,
) -> "FactorEngine":
"""从 metadata 中查询并注册因子(内部方法)。
Args:
name: 要注册的因子名称(引擎中使用的名称)
factor_name_in_metadata: metadata 中的因子名称
data_specs: 可选的数据规格
Returns:
self,支持链式调用
Raises:
RuntimeError: 当引擎未配置 metadata 路径时
ValueError: 当在 metadata 中未找到因子时
FormulaParseError: 当 DSL 表达式解析失败时
"""
if self._metadata is None:
raise RuntimeError(
"引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,"
+ "例如FactorEngine(metadata_path='data/factors.jsonl')"
)
# 从 metadata 查询因子
df = self._metadata.get_factors_by_name(factor_name_in_metadata)
if len(df) == 0:
raise ValueError(
f"在 metadata 中未找到因子 '{factor_name_in_metadata}'"
+ "请确认因子名称正确,或先使用 FactorManager 添加该因子。"
)
# 获取 DSL 表达式
dsl_expr = df["dsl"][0]
# 解析表达式为 Node
node = self._parser.parse(dsl_expr)
# 委托给 register 方法
return self.register(name, node, data_specs)
def add_factor( def add_factor(
self, self,
name: str, name: str,
expression: Union[str, Node], expression: Optional[Union[str, Node]] = None,
data_specs: Optional[List[DataSpec]] = None, data_specs: Optional[List[DataSpec]] = None,
) -> "FactorEngine": ) -> "FactorEngine":
"""注册因子(支持字符串或 Node 表达式)。 """注册因子(支持多种调用方式)。
这是 register 方法的增强版,支持字符串表达式解析。 这是 register 方法的增强版,支持以下调用方式:
向后兼容register 方法保持不变,继续只接受 Node 类型。 1. 传入 name 和 expression直接注册表达式字符串或 Node
2. 只传入 name从 metadata 中查询表达式并注册
遵循 Fail-Fast 原则:字符串表达式会立即解析,失败时立即抛出异常。 遵循 Fail-Fast 原则:字符串表达式会立即解析,失败时立即抛出异常。
Args: Args:
name: 因子名称 name: 因子名称(引擎中使用的名称)
expression: 字符串表达式或 Node 对象 expression: 字符串表达式或 Node 对象,为 None 时从 metadata 查询
data_specs: 可选的数据规格 data_specs: 可选的数据规格
Returns: Returns:
@@ -152,19 +202,21 @@ class FactorEngine:
Raises: Raises:
TypeError: 当 expression 类型不支持时 TypeError: 当 expression 类型不支持时
FormulaParseError: 当字符串解析失败时(立即报错) FormulaParseError: 当字符串解析失败时(立即报错)
RuntimeError: 当 expression 为 None 但未配置 metadata 时
ValueError: 当在 metadata 中未找到因子时
Example: Example:
>>> engine = FactorEngine() >>> engine = FactorEngine()
>>> >>>
>>> # 字符串方式(新功能) >>> # 方式1:字符串表达式
>>> engine.add_factor("ma20", "ts_mean(close, 20)") >>> engine.add_factor("ma20", "ts_mean(close, 20)")
>>> >>>
>>> # Node 方式(与 register 相同) >>> # 方式2:Node 表达式
>>> from src.factors.api import close, ts_mean >>> from src.factors.api import close, ts_mean
>>> engine.add_factor("ma20", ts_mean(close, 20)) >>> engine.add_factor("ma20", ts_mean(close, 20))
>>> >>>
>>> # 复杂表达式 >>> # 方式3:从 metadata 查询(需要初始化时配置 metadata_path)
>>> engine.add_factor("alpha1", "cs_rank(close / open)") >>> engine.add_factor("return_5") # 从 metadata 查询名为 return_5 的因子
>>> >>>
>>> # 链式调用 >>> # 链式调用
>>> (engine >>> (engine
@@ -172,6 +224,10 @@ class FactorEngine:
... .add_factor("ma10", "ts_mean(close, 10)") ... .add_factor("ma10", "ts_mean(close, 10)")
... .add_factor("golden_cross", "ma5 > ma10")) ... .add_factor("golden_cross", "ma5 > ma10"))
""" """
if expression is None:
# 从 metadata 查询表达式
return self._add_factor_from_metadata(name, name, data_specs)
if isinstance(expression, str): if isinstance(expression, str):
# Fail-Fast:立即解析,失败立即报错 # Fail-Fast:立即解析,失败立即报错
node = self._parser.parse(expression) node = self._parser.parse(expression)
@@ -185,76 +241,6 @@ class FactorEngine:
# 委托给现有的 register 方法 # 委托给现有的 register 方法
return self.register(name, node, data_specs) return self.register(name, node, data_specs)
def add_factor_by_name(
self,
name: str,
factor_name_in_metadata: Optional[str] = None,
data_specs: Optional[List[DataSpec]] = None,
) -> "FactorEngine":
"""根据 metadata 中的因子名称注册因子。
从 metadata 管理器中根据因子名称查询 DSL 表达式,
然后解析并注册到引擎中。
Args:
name: 要注册的因子名称(引擎中使用的名称)
factor_name_in_metadata: metadata 中的因子名称,
为 None 时默认使用 name 参数
data_specs: 可选的数据规格
Returns:
self,支持链式调用
Raises:
RuntimeError: 当引擎未配置 metadata 路径时
ValueError: 当在 metadata 中未找到因子时
FormulaParseError: 当 DSL 表达式解析失败时
Example:
>>> # 初始化时启用 metadata
>>> engine = FactorEngine(metadata_path="data/factors.jsonl")
>>>
>>> # 注册 metadata 中的因子(使用相同名称)
>>> engine.add_factor_by_name("return_5")
>>>
>>> # 使用不同名称注册
>>> engine.add_factor_by_name("my_mom", "momentum_5d")
>>>
>>> # 链式调用
>>> (engine
... .add_factor_by_name("ma20")
... .add_factor_by_name("rsi14")
... .compute(["ma20", "rsi14"], "20240101", "20240131"))
"""
if self._metadata is None:
raise RuntimeError(
"引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,"
+ "例如FactorEngine(metadata_path='data/factors.jsonl')"
)
# 使用传入的名称或默认使用 name
query_name = (
factor_name_in_metadata if factor_name_in_metadata is not None else name
)
# 从 metadata 查询因子
df = self._metadata.get_factors_by_name(query_name)
if len(df) == 0:
raise ValueError(
f"在 metadata 中未找到因子 '{query_name}'"
+ "请确认因子名称正确,或先使用 FactorManager 添加该因子。"
)
# 获取 DSL 表达式
dsl_expr = df["dsl"][0]
# 解析表达式为 Node
node = self._parser.parse(dsl_expr)
# 委托给 register 方法
return self.register(name, node, data_specs)
def compute( def compute(
self, self,
factor_names: Union[str, List[str]], factor_names: Union[str, List[str]],

View File

@@ -53,23 +53,32 @@ class FactorManager:
_conn: DuckDB连接对象(懒加载) _conn: DuckDB连接对象(懒加载)
Example: Example:
>>> manager = FactorManager("data/factors.jsonl") >>> manager = FactorManager() # 使用默认路径
>>> df = manager.get_factors_by_name("mom_5d") >>> df = manager.get_factors_by_name("mom_5d")
>>> print(df["dsl"][0]) >>> print(df["dsl"][0])
""" """
def __init__(self, filepath: str) -> None: _DEFAULT_FILENAME = "factors.jsonl"
def __init__(self, filepath: Optional[str] = None) -> None:
"""初始化因子管理器。 """初始化因子管理器。
如果文件不存在会自动创建空的JSONL文件。 如果文件不存在会自动创建空的JSONL文件。
Args: Args:
filepath: JSONL文件路径(相对或绝对路径) filepath: JSONL文件路径(相对或绝对路径),为None时使用默认路径
Raises: Raises:
FileOperationError: 当文件创建失败时 FileOperationError: 当文件创建失败时
""" """
self.filepath = Path(filepath).resolve() if filepath is None:
# 使用默认路径:从配置读取数据目录
from src.config.settings import settings
self.filepath = settings.data_path_resolved / self._DEFAULT_FILENAME
else:
self.filepath = Path(filepath).resolve()
self._conn: Optional[duckdb.DuckDBPyConnection] = None self._conn: Optional[duckdb.DuckDBPyConnection] = None
# 确保文件存在 # 确保文件存在