diff --git a/src/experiment/data/factors.jsonl b/src/experiment/data/factors.jsonl new file mode 100644 index 0000000..e69de29 diff --git a/src/experiment/learn_to_rank.ipynb b/src/experiment/learn_to_rank.ipynb index 22bfea2..983aada 100644 --- a/src/experiment/learn_to_rank.ipynb +++ b/src/experiment/learn_to_rank.ipynb @@ -22,10 +22,13 @@ "source": "## 1. 导入依赖" }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-11T16:02:49.975545Z", + "start_time": "2026-03-11T16:02:48.487347Z" + } + }, "cell_type": "code", - "outputs": [], - "execution_count": null, "source": [ "import os\n", "from datetime import datetime\n", @@ -50,7 +53,9 @@ "from src.training.components.models import LightGBMLambdaRankModel\n", "from src.training.config import TrainingConfig\n", "\n" - ] + ], + "outputs": [], + "execution_count": 1 }, { "metadata": {}, @@ -58,35 +63,49 @@ "source": "## 2. 辅助函数" }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-11T16:02:49.989220Z", + "start_time": "2026-03-11T16:02:49.981542Z" + } + }, "cell_type": "code", - "outputs": [], - "execution_count": null, "source": [ "def create_factors_with_metadata(\n", - " engine: FactorEngine, factor_definitions: dict, label_factor: dict\n", + " engine: FactorEngine,\n", + " selected_factors: List[str],\n", + " factor_definitions: dict,\n", + " label_factor: dict,\n", ") -> List[str]:\n", - " \"\"\"使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)\"\"\"\n", + " \"\"\"注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n", " print(\"=\" * 80)\n", - " print(\"使用 metadata 注册因子\")\n", + " print(\"注册因子\")\n", " print(\"=\" * 80)\n", "\n", - " # 注册所有特征因子(通过 metadata 名称)\n", + " # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)\n", " print(\"\\n注册特征因子(从 metadata):\")\n", - " for name in factor_definitions.keys():\n", - " engine.add_factor_by_name(name)\n", + " for name in selected_factors:\n", + " engine.add_factor(name)\n", " print(f\" - {name}\")\n", "\n", - " # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n", + " # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n", + " print(\"\\n注册特征因子(表达式):\")\n", + " for name, expr in factor_definitions.items():\n", + " engine.add_factor(name, expr)\n", + " print(f\" - {name}: {expr}\")\n", + "\n", + " # 注册 label 因子(通过表达式)\n", " print(\"\\n注册 Label 因子(表达式):\")\n", " for name, expr in label_factor.items():\n", " engine.add_factor(name, expr)\n", " print(f\" - {name}: {expr}\")\n", "\n", - " # 从字典自动获取特征列\n", - " feature_cols = list(factor_definitions.keys())\n", + " # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n", + " feature_cols = selected_factors + list(factor_definitions.keys())\n", "\n", " print(f\"\\n特征因子数: {len(feature_cols)}\")\n", + " print(f\" - 来自 metadata: {len(selected_factors)}\")\n", + " print(f\" - 来自表达式: {len(factor_definitions)}\")\n", " print(f\"Label: {list(label_factor.keys())[0]}\")\n", " print(f\"已注册因子总数: {len(engine.list_registered())}\")\n", "\n", @@ -251,7 +270,9 @@ "\n", " return results\n", "\n" - ] + ], + "outputs": [], + "execution_count": 2 }, { "metadata": {}, @@ -263,77 +284,86 @@ ] }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-11T16:02:50.000875Z", + "start_time": "2026-03-11T16:02:49.994082Z" + } + }, "cell_type": "code", - "outputs": [], - "execution_count": null, "source": [ "# 特征因子定义字典(复用 regression.ipynb 的因子定义)\n", "LABEL_NAME = \"future_return_5_rank\"\n", "\n", - "FACTOR_DEFINITIONS = {\n", - " # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n", - " \"ma_5\": \"ts_mean(close, 5)\",\n", - " \"ma_20\": \"ts_mean(close, 20)\",\n", - " \"ma_ratio_5_20\": \"ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1\",\n", - " \"bias_10\": \"close / (ts_mean(close, 10) + 1e-8) - 1\",\n", - " \"high_low_ratio\": \"(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)\",\n", - " \"bbi_ratio\": \"(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)\",\n", - " \"return_5\": \"(close / (ts_delay(close, 5) + 1e-8)) - 1\",\n", - " \"return_20\": \"(close / (ts_delay(close, 20) + 1e-8)) - 1\",\n", - " \"kaufman_ER_20\": \"abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)\",\n", - " \"mom_acceleration_10_20\": \"(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)\",\n", - " \"drawdown_from_high_60\": \"close / (ts_max(high, 60) + 1e-8) - 1\",\n", - " \"up_days_ratio_20\": \"ts_sum(close > ts_delay(close, 1), 20) / 20\",\n", + "# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n", + "SELECTED_FACTORS = [\n", + " # ================= 1. 价格、趋势与路径依赖 =================\n", + " \"ma_5\",\n", + " \"ma_20\",\n", + " \"ma_ratio_5_20\",\n", + " \"bias_10\",\n", + " \"high_low_ratio\",\n", + " \"bbi_ratio\",\n", + " \"return_5\",\n", + " \"return_20\",\n", + " \"kaufman_ER_20\",\n", + " \"mom_acceleration_10_20\",\n", + " \"drawdown_from_high_60\",\n", + " \"up_days_ratio_20\",\n", " # ================= 2. 波动率、风险调整与高阶矩 =================\n", - " \"volatility_5\": \"ts_std(close, 5)\",\n", - " \"volatility_20\": \"ts_std(close, 20)\",\n", - " \"volatility_ratio\": \"ts_std(close, 5) / (ts_std(close, 20) + 1e-8)\",\n", - " \"std_return_20\": \"ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)\",\n", - " \"sharpe_ratio_20\": \"ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)\",\n", - " \"min_ret_20\": \"ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n", - " \"volatility_squeeze_5_60\": \"ts_std(close, 5) / (ts_std(close, 60) + 1e-8)\",\n", + " \"volatility_5\",\n", + " \"volatility_20\",\n", + " \"volatility_ratio\",\n", + " \"std_return_20\",\n", + " \"sharpe_ratio_20\",\n", + " \"min_ret_20\",\n", + " \"volatility_squeeze_5_60\",\n", " # ================= 3. 日内微观结构与异象 =================\n", - " \"overnight_intraday_diff\": \"(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)\",\n", - " \"upper_shadow_ratio\": \"(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)\",\n", - " \"capital_retention_20\": \"ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)\",\n", - " \"max_ret_20\": \"ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)\",\n", + " \"overnight_intraday_diff\",\n", + " \"upper_shadow_ratio\",\n", + " \"capital_retention_20\",\n", + " \"max_ret_20\",\n", " # ================= 4. 量能、流动性与量价背离 =================\n", - " \"volume_ratio_5_20\": \"ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)\",\n", - " \"turnover_rate_mean_5\": \"ts_mean(turnover_rate, 5)\",\n", - " \"turnover_deviation\": \"(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)\",\n", - " \"amihud_illiq_20\": \"ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)\",\n", - " \"turnover_cv_20\": \"ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)\",\n", - " \"pv_corr_20\": \"ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)\",\n", - " \"close_vwap_deviation\": \"close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1\",\n", + " \"volume_ratio_5_20\",\n", + " \"turnover_rate_mean_5\",\n", + " \"turnover_deviation\",\n", + " \"amihud_illiq_20\",\n", + " \"turnover_cv_20\",\n", + " \"pv_corr_20\",\n", + " \"close_vwap_deviation\",\n", " # ================= 5. 基本面财务特征 =================\n", - " \"roe\": \"n_income / (total_hldr_eqy_exc_min_int + 1e-8)\",\n", - " \"roa\": \"n_income / (total_assets + 1e-8)\",\n", - " \"profit_margin\": \"n_income / (revenue + 1e-8)\",\n", - " \"debt_to_equity\": \"total_liab / (total_hldr_eqy_exc_min_int + 1e-8)\",\n", - " \"current_ratio\": \"total_cur_assets / (total_cur_liab + 1e-8)\",\n", - " \"net_profit_yoy\": \"(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1\",\n", - " \"revenue_yoy\": \"(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1\",\n", - " \"healthy_expansion_velocity\": \"(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)\",\n", + " \"roe\",\n", + " \"roa\",\n", + " \"profit_margin\",\n", + " \"debt_to_equity\",\n", + " \"current_ratio\",\n", + " \"net_profit_yoy\",\n", + " \"revenue_yoy\",\n", + " \"healthy_expansion_velocity\",\n", " # ================= 6. 基本面估值与截面动量共振 =================\n", - " \"EP\": \"n_income / (total_mv * 10000 + 1e-8)\",\n", - " \"BP\": \"total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)\",\n", - " \"CP\": \"n_cashflow_act / (total_mv * 10000 + 1e-8)\",\n", - " \"market_cap_rank\": \"cs_rank(total_mv)\",\n", - " \"turnover_rank\": \"cs_rank(turnover_rate)\",\n", - " \"return_5_rank\": \"cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)\",\n", - " \"EP_rank\": \"cs_rank(n_income / (total_mv + 1e-8))\",\n", - " \"pe_expansion_trend\": \"(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1\",\n", - " \"value_price_divergence\": \"cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))\",\n", - " \"active_market_cap\": \"total_mv * ts_mean(turnover_rate, 20)\",\n", - " \"ebit_rank\": \"cs_rank(ebit)\",\n", - "}\n", + " \"EP\",\n", + " \"BP\",\n", + " \"CP\",\n", + " \"market_cap_rank\",\n", + " \"turnover_rank\",\n", + " \"return_5_rank\",\n", + " \"EP_rank\",\n", + " \"pe_expansion_trend\",\n", + " \"value_price_divergence\",\n", + " \"active_market_cap\",\n", + " \"ebit_rank\",\n", + "]\n", + "\n", + "# 因子定义字典(完整因子库)\n", + "FACTOR_DEFINITIONS = {\"turnover_volatility_ratio\": \"log(ts_std(turnover_rate, 20))\"}\n", "\n", "# Label 因子定义(不参与训练,用于计算目标)\n", "LABEL_FACTOR = {\n", " LABEL_NAME: \"(ts_delay(close, -5) / ts_delay(open, -1)) - 1\",\n", "}" - ] + ], + "outputs": [], + "execution_count": 3 }, { "metadata": {}, @@ -341,10 +371,13 @@ "source": "### 3.2 训练参数配置" }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-11T16:02:50.009081Z", + "start_time": "2026-03-11T16:02:50.005330Z" + } + }, "cell_type": "code", - "outputs": [], - "execution_count": null, "source": [ "# 日期范围配置(正确的 train/val/test 三分法)\n", "TRAIN_START = \"20200101\"\n", @@ -377,7 +410,7 @@ "N_QUANTILES = 20 # 将 label 分为 20 组\n", "\n", "# 特征列(用于数据处理器)\n", - "FEATURE_COLS = list(FACTOR_DEFINITIONS.keys())\n", + "FEATURE_COLS = SELECTED_FACTORS\n", "\n", "# 数据处理器配置\n", "PROCESSORS = [\n", @@ -421,7 +454,9 @@ "\n", "# Top N 配置:每日推荐股票数量\n", "TOP_N = 5 # 可调整为 10, 20 等" - ] + ], + "outputs": [], + "execution_count": 4 }, { "metadata": {}, @@ -429,10 +464,13 @@ "source": "## 4. 训练流程" }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-03-11T16:02:50.330018Z", + "start_time": "2026-03-11T16:02:50.012964Z" + } + }, "cell_type": "code", - "outputs": [], - "execution_count": null, "source": [ "print(\"\\n\" + \"=\" * 80)\n", "print(\"LightGBM LambdaRank 排序学习训练\")\n", @@ -444,7 +482,9 @@ "\n", "# 2. 使用 metadata 定义因子\n", "print(\"\\n[2] 定义因子(从 metadata 注册)\")\n", - "feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n", + "feature_cols = create_factors_with_metadata(\n", + " engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n", + ")\n", "\n", "# 3. 准备数据\n", "print(\"\\n[3] 准备数据\")\n", @@ -507,7 +547,49 @@ " feature_cols=feature_cols,\n", " persist_model=PERSIST_MODEL,\n", ")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "================================================================================\n", + "LightGBM LambdaRank 排序学习训练\n", + "================================================================================\n", + "\n", + "[1] 创建 FactorEngine\n", + "\n", + "[2] 定义因子(从 metadata 注册)\n", + "================================================================================\n", + "注册因子\n", + "================================================================================\n", + "\n", + "注册特征因子(从 metadata):\n" + ] + }, + { + "ename": "QueryError", + "evalue": "查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n ", + "output_type": "error", + "traceback": [ + "\u001B[31m---------------------------------------------------------------------------\u001B[39m", + "\u001B[31mBinderException\u001B[39m Traceback (most recent call last)", + "\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:296\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 295\u001B[39m conn = \u001B[38;5;28mself\u001B[39m._get_connection()\n\u001B[32m--> \u001B[39m\u001B[32m296\u001B[39m result = \u001B[43mconn\u001B[49m\u001B[43m.\u001B[49m\u001B[43mexecute\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m.pl()\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n", + "\u001B[31mBinderException\u001B[39m: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001B[31mQueryError\u001B[39m Traceback (most recent call last)", + "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[5]\u001B[39m\u001B[32m, line 11\u001B[39m\n\u001B[32m 9\u001B[39m \u001B[38;5;66;03m# 2. 使用 metadata 定义因子\u001B[39;00m\n\u001B[32m 10\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[2] 定义因子(从 metadata 注册)\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m---> \u001B[39m\u001B[32m11\u001B[39m feature_cols = \u001B[43mcreate_factors_with_metadata\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 12\u001B[39m \u001B[43m \u001B[49m\u001B[43mengine\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mSELECTED_FACTORS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mFACTOR_DEFINITIONS\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mLABEL_FACTOR\u001B[49m\n\u001B[32m 13\u001B[39m \u001B[43m)\u001B[49m\n\u001B[32m 15\u001B[39m \u001B[38;5;66;03m# 3. 准备数据\u001B[39;00m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m[3] 准备数据\u001B[39m\u001B[33m\"\u001B[39m)\n", + "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 15\u001B[39m, in \u001B[36mcreate_factors_with_metadata\u001B[39m\u001B[34m(engine, selected_factors, factor_definitions, label_factor)\u001B[39m\n\u001B[32m 13\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[33m注册特征因子(从 metadata):\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 14\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m name \u001B[38;5;129;01min\u001B[39;00m selected_factors:\n\u001B[32m---> \u001B[39m\u001B[32m15\u001B[39m \u001B[43mengine\u001B[49m\u001B[43m.\u001B[49m\u001B[43madd_factor\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 16\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m - \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 18\u001B[39m \u001B[38;5;66;03m# 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\u001B[39;00m\n", + "\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:225\u001B[39m, in \u001B[36mFactorEngine.add_factor\u001B[39m\u001B[34m(self, name, expression, data_specs)\u001B[39m\n\u001B[32m 182\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"注册因子(支持多种调用方式)。\u001B[39;00m\n\u001B[32m 183\u001B[39m \n\u001B[32m 184\u001B[39m \u001B[33;03m这是 register 方法的增强版,支持以下调用方式:\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 221\u001B[39m \u001B[33;03m ... .add_factor(\"golden_cross\", \"ma5 > ma10\"))\u001B[39;00m\n\u001B[32m 222\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 223\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m expression \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 224\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询表达式\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m225\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_add_factor_from_metadata\u001B[49m\u001B[43m(\u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdata_specs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 227\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(expression, \u001B[38;5;28mstr\u001B[39m):\n\u001B[32m 228\u001B[39m \u001B[38;5;66;03m# Fail-Fast:立即解析,失败立即报错\u001B[39;00m\n\u001B[32m 229\u001B[39m node = \u001B[38;5;28mself\u001B[39m._parser.parse(expression)\n", + "\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\engine\\factor_engine.py:159\u001B[39m, in \u001B[36mFactorEngine._add_factor_from_metadata\u001B[39m\u001B[34m(self, name, factor_name_in_metadata, data_specs)\u001B[39m\n\u001B[32m 153\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mRuntimeError\u001B[39;00m(\n\u001B[32m 154\u001B[39m \u001B[33m\"\u001B[39m\u001B[33m引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数,\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 155\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m例如:FactorEngine(metadata_path=\u001B[39m\u001B[33m'\u001B[39m\u001B[33mdata/factors.jsonl\u001B[39m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 156\u001B[39m )\n\u001B[32m 158\u001B[39m \u001B[38;5;66;03m# 从 metadata 查询因子\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m159\u001B[39m df = \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_metadata\u001B[49m\u001B[43m.\u001B[49m\u001B[43mget_factors_by_name\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfactor_name_in_metadata\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 161\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(df) == \u001B[32m0\u001B[39m:\n\u001B[32m 162\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[32m 163\u001B[39m \u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33m在 metadata 中未找到因子 \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfactor_name_in_metadata\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 164\u001B[39m + \u001B[33m\"\u001B[39m\u001B[33m请确认因子名称正确,或先使用 FactorManager 添加该因子。\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 165\u001B[39m )\n", + "\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:177\u001B[39m, in \u001B[36mFactorManager.get_factors_by_name\u001B[39m\u001B[34m(self, name)\u001B[39m\n\u001B[32m 154\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"根据名称查询因子。\u001B[39;00m\n\u001B[32m 155\u001B[39m \n\u001B[32m 156\u001B[39m \u001B[33;03m使用DuckDB执行SQL查询,返回Polars DataFrame。\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 170\u001B[39m \u001B[33;03m ... print(df[\"dsl\"][0])\u001B[39;00m\n\u001B[32m 171\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 172\u001B[39m sql = \u001B[33mf\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m 173\u001B[39m \u001B[33m SELECT *\u001B[39m\n\u001B[32m 174\u001B[39m \u001B[33m FROM read_json_auto(\u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mself\u001B[39m.filepath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\u001B[33m)\u001B[39m\n\u001B[32m 175\u001B[39m \u001B[33m WHERE name = \u001B[39m\u001B[33m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mname\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m'\u001B[39m\n\u001B[32m 176\u001B[39m \u001B[33m\u001B[39m\u001B[33m\"\"\"\u001B[39m\n\u001B[32m--> \u001B[39m\u001B[32m177\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_execute_query\u001B[49m\u001B[43m(\u001B[49m\u001B[43msql\u001B[49m\u001B[43m)\u001B[49m\n", + "\u001B[36mFile \u001B[39m\u001B[32mD:\\PyProject\\ProStock\\src\\factors\\metadata\\manager.py:299\u001B[39m, in \u001B[36mFactorManager._execute_query\u001B[39m\u001B[34m(self, sql)\u001B[39m\n\u001B[32m 297\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m result\n\u001B[32m 298\u001B[39m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[32m--> \u001B[39m\u001B[32m299\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m QueryError(sql, e)\n", + "\u001B[31mQueryError\u001B[39m: 查询执行失败: Binder Error: Referenced column \"name\" not found in FROM clause!\nCandidate bindings: \"json\"\n\nLINE 4: WHERE name = 'ma_5'\n ^\nSQL: \n SELECT *\n FROM read_json_auto('D:\\PyProject\\ProStock\\src\\experiment\\data\\factors.jsonl')\n WHERE name = 'ma_5'\n " + ] + } + ], + "execution_count": 5 }, { "metadata": {}, diff --git a/src/experiment/learn_to_rank.py b/src/experiment/learn_to_rank.py index 5a22b14..37a895f 100644 --- a/src/experiment/learn_to_rank.py +++ b/src/experiment/learn_to_rank.py @@ -40,29 +40,40 @@ from src.training.config import TrainingConfig # ## 2. 辅助函数 # %% def create_factors_with_metadata( - engine: FactorEngine, factor_definitions: dict, label_factor: dict + engine: FactorEngine, + selected_factors: List[str], + factor_definitions: dict, + label_factor: dict, ) -> List[str]: - """使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)""" + """注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)""" print("=" * 80) - print("使用 metadata 注册因子") + print("注册因子") print("=" * 80) - # 注册所有特征因子(通过 metadata 名称) + # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中) print("\n注册特征因子(从 metadata):") - for name in factor_definitions.keys(): - engine.add_factor_by_name(name) + for name in selected_factors: + engine.add_factor(name) print(f" - {name}") - # 注册 label 因子(通过表达式,因为 label 不在 metadata 中) + # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中) + print("\n注册特征因子(表达式):") + for name, expr in factor_definitions.items(): + engine.add_factor(name, expr) + print(f" - {name}: {expr}") + + # 注册 label 因子(通过表达式) print("\n注册 Label 因子(表达式):") for name, expr in label_factor.items(): engine.add_factor(name, expr) print(f" - {name}: {expr}") - # 从字典自动获取特征列 - feature_cols = list(factor_definitions.keys()) + # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys + feature_cols = selected_factors + list(factor_definitions.keys()) print(f"\n特征因子数: {len(feature_cols)}") + print(f" - 来自 metadata: {len(selected_factors)}") + print(f" - 来自表达式: {len(factor_definitions)}") print(f"Label: {list(label_factor.keys())[0]}") print(f"已注册因子总数: {len(engine.list_registered())}") @@ -236,62 +247,68 @@ def evaluate_ndcg_at_k( # 特征因子定义字典(复用 regression.ipynb 的因子定义) LABEL_NAME = "future_return_5_rank" -FACTOR_DEFINITIONS = { - # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) ================= - "ma_5": "ts_mean(close, 5)", - "ma_20": "ts_mean(close, 20)", - "ma_ratio_5_20": "ts_mean(close, 5) / (ts_mean(close, 20) + 1e-8) - 1", - "bias_10": "close / (ts_mean(close, 10) + 1e-8) - 1", - "high_low_ratio": "(close - ts_min(low, 20)) / (ts_max(high, 20) - ts_min(low, 20) + 1e-8)", - "bbi_ratio": "(ts_mean(close, 3) + ts_mean(close, 6) + ts_mean(close, 12) + ts_mean(close, 24)) / (4 * close + 1e-8)", - "return_5": "(close / (ts_delay(close, 5) + 1e-8)) - 1", - "return_20": "(close / (ts_delay(close, 20) + 1e-8)) - 1", - "kaufman_ER_20": "abs(close - ts_delay(close, 20)) / (ts_sum(abs(close - ts_delay(close, 1)), 20) + 1e-8)", - "mom_acceleration_10_20": "(close / (ts_delay(close, 10) + 1e-8) - 1) - (ts_delay(close, 10) / (ts_delay(close, 20) + 1e-8) - 1)", - "drawdown_from_high_60": "close / (ts_max(high, 60) + 1e-8) - 1", - "up_days_ratio_20": "ts_sum(close > ts_delay(close, 1), 20) / 20", +# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子) +SELECTED_FACTORS = [ + # ================= 1. 价格、趋势与路径依赖 ================= + "ma_5", + "ma_20", + "ma_ratio_5_20", + "bias_10", + "high_low_ratio", + "bbi_ratio", + "return_5", + "return_20", + "kaufman_ER_20", + "mom_acceleration_10_20", + "drawdown_from_high_60", + "up_days_ratio_20", # ================= 2. 波动率、风险调整与高阶矩 ================= - "volatility_5": "ts_std(close, 5)", - "volatility_20": "ts_std(close, 20)", - "volatility_ratio": "ts_std(close, 5) / (ts_std(close, 20) + 1e-8)", - "std_return_20": "ts_std((close / (ts_delay(close, 1) + 1e-8)) - 1, 20)", - "sharpe_ratio_20": "ts_mean(close / (ts_delay(close, 1) + 1e-8) - 1, 20) / (ts_std(close / (ts_delay(close, 1) + 1e-8) - 1, 20) + 1e-8)", - "min_ret_20": "ts_min(close / (ts_delay(close, 1) + 1e-8) - 1, 20)", - "volatility_squeeze_5_60": "ts_std(close, 5) / (ts_std(close, 60) + 1e-8)", + "volatility_5", + "volatility_20", + "volatility_ratio", + "std_return_20", + "sharpe_ratio_20", + "min_ret_20", + "volatility_squeeze_5_60", # ================= 3. 日内微观结构与异象 ================= - "overnight_intraday_diff": "(open / (ts_delay(close, 1) + 1e-8) - 1) - (close / (open + 1e-8) - 1)", - "upper_shadow_ratio": "(high - ((open + close + abs(open - close)) / 2)) / (high - low + 1e-8)", - "capital_retention_20": "ts_sum(abs(close - open), 20) / (ts_sum(high - low, 20) + 1e-8)", - "max_ret_20": "ts_max(close / (ts_delay(close, 1) + 1e-8) - 1, 20)", + "overnight_intraday_diff", + "upper_shadow_ratio", + "capital_retention_20", + "max_ret_20", # ================= 4. 量能、流动性与量价背离 ================= - "volume_ratio_5_20": "ts_mean(vol, 5) / (ts_mean(vol, 20) + 1e-8)", - "turnover_rate_mean_5": "ts_mean(turnover_rate, 5)", - "turnover_deviation": "(turnover_rate - ts_mean(turnover_rate, 10)) / (ts_std(turnover_rate, 10) + 1e-8)", - "amihud_illiq_20": "ts_mean(abs(close / (ts_delay(close, 1) + 1e-8) - 1) / (amount + 1e-8), 20)", - "turnover_cv_20": "ts_std(turnover_rate, 20) / (ts_mean(turnover_rate, 20) + 1e-8)", - "pv_corr_20": "ts_corr(close / (ts_delay(close, 1) + 1e-8) - 1, vol, 20)", - "close_vwap_deviation": "close / (amount / (vol * 100 + 1e-8) + 1e-8) - 1", + "volume_ratio_5_20", + "turnover_rate_mean_5", + "turnover_deviation", + "amihud_illiq_20", + "turnover_cv_20", + "pv_corr_20", + "close_vwap_deviation", # ================= 5. 基本面财务特征 ================= - "roe": "n_income / (total_hldr_eqy_exc_min_int + 1e-8)", - "roa": "n_income / (total_assets + 1e-8)", - "profit_margin": "n_income / (revenue + 1e-8)", - "debt_to_equity": "total_liab / (total_hldr_eqy_exc_min_int + 1e-8)", - "current_ratio": "total_cur_assets / (total_cur_liab + 1e-8)", - "net_profit_yoy": "(n_income / (ts_delay(n_income, 252) + 1e-8)) - 1", - "revenue_yoy": "(revenue / (ts_delay(revenue, 252) + 1e-8)) - 1", - "healthy_expansion_velocity": "(total_assets / (ts_delay(total_assets, 252) + 1e-8) - 1) - (total_liab / (ts_delay(total_liab, 252) + 1e-8) - 1)", + "roe", + "roa", + "profit_margin", + "debt_to_equity", + "current_ratio", + "net_profit_yoy", + "revenue_yoy", + "healthy_expansion_velocity", # ================= 6. 基本面估值与截面动量共振 ================= - "EP": "n_income / (total_mv * 10000 + 1e-8)", - "BP": "total_hldr_eqy_exc_min_int / (total_mv * 10000 + 1e-8)", - "CP": "n_cashflow_act / (total_mv * 10000 + 1e-8)", - "market_cap_rank": "cs_rank(total_mv)", - "turnover_rank": "cs_rank(turnover_rate)", - "return_5_rank": "cs_rank((close / (ts_delay(close, 5) + 1e-8)) - 1)", - "EP_rank": "cs_rank(n_income / (total_mv + 1e-8))", - "pe_expansion_trend": "(total_mv / (n_income + 1e-8)) / (ts_delay(total_mv, 60) / (ts_delay(n_income, 60) + 1e-8) + 1e-8) - 1", - "value_price_divergence": "cs_rank((n_income - ts_delay(n_income, 252)) / (abs(ts_delay(n_income, 252)) + 1e-8)) - cs_rank(close / (ts_delay(close, 20) + 1e-8))", - "active_market_cap": "total_mv * ts_mean(turnover_rate, 20)", - "ebit_rank": "cs_rank(ebit)", + "EP", + "BP", + "CP", + "market_cap_rank", + "turnover_rank", + "return_5_rank", + "EP_rank", + "pe_expansion_trend", + "value_price_divergence", + "active_market_cap", + "ebit_rank", +] + +# 因子定义字典(完整因子库) +FACTOR_DEFINITIONS = { + # "turnover_volatility_ratio": "log(ts_std(turnover_rate, 20))" } # Label 因子定义(不参与训练,用于计算目标) @@ -332,7 +349,7 @@ MODEL_PARAMS = { N_QUANTILES = 20 # 将 label 分为 20 组 # 特征列(用于数据处理器) -FEATURE_COLS = list(FACTOR_DEFINITIONS.keys()) +FEATURE_COLS = SELECTED_FACTORS # 数据处理器配置 PROCESSORS = [ @@ -385,11 +402,13 @@ print("=" * 80) # 1. 创建 FactorEngine(启用 metadata 功能) print("\n[1] 创建 FactorEngine") -engine = FactorEngine(metadata_path="data/factors.jsonl") +engine = FactorEngine() # 2. 使用 metadata 定义因子 print("\n[2] 定义因子(从 metadata 注册)") -feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR) +feature_cols = create_factors_with_metadata( + engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR +) # 3. 准备数据 print("\n[3] 准备数据") diff --git a/src/experiment/regression.ipynb b/src/experiment/regression.ipynb index fa8e0b7..79a78cd 100644 --- a/src/experiment/regression.ipynb +++ b/src/experiment/regression.ipynb @@ -47,29 +47,40 @@ "execution_count": null, "source": [ "def create_factors_with_metadata(\n", - " engine: FactorEngine, factor_definitions: dict, label_factor: dict\n", + " engine: FactorEngine,\n", + " selected_factors: List[str],\n", + " factor_definitions: dict,\n", + " label_factor: dict,\n", ") -> List[str]:\n", - " \"\"\"使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)\"\"\"\n", + " \"\"\"注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)\"\"\"\n", " print(\"=\" * 80)\n", - " print(\"使用 metadata 注册因子\")\n", + " print(\"注册因子\")\n", " print(\"=\" * 80)\n", "\n", - " # 注册所有特征因子(通过 metadata 名称)\n", + " # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中)\n", " print(\"\\n注册特征因子(从 metadata):\")\n", - " for name in factor_definitions.keys():\n", - " engine.add_factor_by_name(name)\n", + " for name in selected_factors:\n", + " engine.add_factor(name)\n", " print(f\" - {name}\")\n", "\n", - " # 注册 label 因子(通过表达式,因为 label 不在 metadata 中)\n", + " # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中)\n", + " print(\"\\n注册特征因子(表达式):\")\n", + " for name, expr in factor_definitions.items():\n", + " engine.add_factor(name, expr)\n", + " print(f\" - {name}: {expr}\")\n", + "\n", + " # 注册 label 因子(通过表达式)\n", " print(\"\\n注册 Label 因子(表达式):\")\n", " for name, expr in label_factor.items():\n", " engine.add_factor(name, expr)\n", " print(f\" - {name}: {expr}\")\n", "\n", - " # 从字典自动获取特征列\n", - " feature_cols = list(factor_definitions.keys())\n", + " # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys\n", + " feature_cols = selected_factors + list(factor_definitions.keys())\n", "\n", " print(f\"\\n特征因子数: {len(feature_cols)}\")\n", + " print(f\" - 来自 metadata: {len(selected_factors)}\")\n", + " print(f\" - 来自表达式: {len(factor_definitions)}\")\n", " print(f\"Label: {list(label_factor.keys())[0]}\")\n", " print(f\"已注册因子总数: {len(engine.list_registered())}\")\n", "\n", @@ -123,7 +134,67 @@ "# 特征因子定义字典:新增因子只需在此处添加一行\n", "LABEL_NAME = \"future_return_5\"\n", "\n", - "FACTOR_DEFINITIONS = FACTOR_DICT = {\n", + "# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子)\n", + "SELECTED_FACTORS = [\n", + " # ================= 1. 价格、趋势与路径依赖 =================\n", + " \"ma_5\",\n", + " \"ma_20\",\n", + " \"ma_ratio_5_20\",\n", + " \"bias_10\",\n", + " \"high_low_ratio\",\n", + " \"bbi_ratio\",\n", + " \"return_5\",\n", + " \"return_20\",\n", + " \"kaufman_ER_20\",\n", + " \"mom_acceleration_10_20\",\n", + " \"drawdown_from_high_60\",\n", + " \"up_days_ratio_20\",\n", + " # ================= 2. 波动率、风险调整与高阶矩 =================\n", + " \"volatility_5\",\n", + " \"volatility_20\",\n", + " \"volatility_ratio\",\n", + " \"std_return_20\",\n", + " \"sharpe_ratio_20\",\n", + " \"min_ret_20\",\n", + " \"volatility_squeeze_5_60\",\n", + " # ================= 3. 日内微观结构与异象 =================\n", + " \"overnight_intraday_diff\",\n", + " \"upper_shadow_ratio\",\n", + " \"capital_retention_20\",\n", + " \"max_ret_20\",\n", + " # ================= 4. 量能、流动性与量价背离 =================\n", + " \"volume_ratio_5_20\",\n", + " \"turnover_rate_mean_5\",\n", + " \"turnover_deviation\",\n", + " \"amihud_illiq_20\",\n", + " \"turnover_cv_20\",\n", + " \"pv_corr_20\",\n", + " \"close_vwap_deviation\",\n", + " # ================= 5. 基本面财务特征 =================\n", + " \"roe\",\n", + " \"roa\",\n", + " \"profit_margin\",\n", + " \"debt_to_equity\",\n", + " \"current_ratio\",\n", + " \"net_profit_yoy\",\n", + " \"revenue_yoy\",\n", + " \"healthy_expansion_velocity\",\n", + " # ================= 6. 基本面估值与截面动量共振 =================\n", + " \"EP\",\n", + " \"BP\",\n", + " \"CP\",\n", + " \"market_cap_rank\",\n", + " \"turnover_rank\",\n", + " \"return_5_rank\",\n", + " \"EP_rank\",\n", + " \"pe_expansion_trend\",\n", + " \"value_price_divergence\",\n", + " \"active_market_cap\",\n", + " \"ebit_rank\",\n", + "]\n", + "\n", + "# 因子定义字典(完整因子库)\n", + "FACTOR_DEFINITIONS = {\n", " # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) =================\n", " \"ma_5\": \"ts_mean(close, 5)\",\n", " \"ma_20\": \"ts_mean(close, 20)\",\n", @@ -338,7 +409,9 @@ "\n", "# 2. 使用 metadata 定义因子\n", "print(\"\\n[2] 定义因子(从 metadata 注册)\")\n", - "feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR)\n", + "feature_cols = create_factors_with_metadata(\n", + " engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR\n", + ")\n", "target_col = LABEL_NAME\n", "\n", "# 3. 准备数据(使用模块级别的日期配置)\n", diff --git a/src/experiment/regression.py b/src/experiment/regression.py index 57c054c..8b4e780 100644 --- a/src/experiment/regression.py +++ b/src/experiment/regression.py @@ -26,29 +26,40 @@ from src.training.config import TrainingConfig # ## 2. 定义辅助函数 # %% def create_factors_with_metadata( - engine: FactorEngine, factor_definitions: dict, label_factor: dict + engine: FactorEngine, + selected_factors: List[str], + factor_definitions: dict, + label_factor: dict, ) -> List[str]: - """使用 metadata 注册因子(特征因子通过名称注册,label 因子通过表达式注册)""" + """注册因子(SELECTED_FACTORS 从 metadata 查询,FACTOR_DEFINITIONS 用表达式注册)""" print("=" * 80) - print("使用 metadata 注册因子") + print("注册因子") print("=" * 80) - # 注册所有特征因子(通过 metadata 名称) + # 注册 SELECTED_FACTORS 中的因子(已在 metadata 中) print("\n注册特征因子(从 metadata):") - for name in factor_definitions.keys(): - engine.add_factor_by_name(name) + for name in selected_factors: + engine.add_factor(name) print(f" - {name}") - # 注册 label 因子(通过表达式,因为 label 不在 metadata 中) + # 注册 FACTOR_DEFINITIONS 中的因子(通过表达式,尚未在 metadata 中) + print("\n注册特征因子(表达式):") + for name, expr in factor_definitions.items(): + engine.add_factor(name, expr) + print(f" - {name}: {expr}") + + # 注册 label 因子(通过表达式) print("\n注册 Label 因子(表达式):") for name, expr in label_factor.items(): engine.add_factor(name, expr) print(f" - {name}: {expr}") - # 从字典自动获取特征列 - feature_cols = list(factor_definitions.keys()) + # 特征列 = SELECTED_FACTORS + FACTOR_DEFINITIONS 的 keys + feature_cols = selected_factors + list(factor_definitions.keys()) print(f"\n特征因子数: {len(feature_cols)}") + print(f" - 来自 metadata: {len(selected_factors)}") + print(f" - 来自表达式: {len(factor_definitions)}") print(f"Label: {list(label_factor.keys())[0]}") print(f"已注册因子总数: {len(engine.list_registered())}") @@ -91,7 +102,67 @@ def prepare_data( # 特征因子定义字典:新增因子只需在此处添加一行 LABEL_NAME = "future_return_5" -FACTOR_DEFINITIONS = FACTOR_DICT = { +# 当前选择的因子列表(从 FACTOR_DEFINITIONS 中选择要使用的因子) +SELECTED_FACTORS = [ + # ================= 1. 价格、趋势与路径依赖 ================= + "ma_5", + "ma_20", + "ma_ratio_5_20", + "bias_10", + "high_low_ratio", + "bbi_ratio", + "return_5", + "return_20", + "kaufman_ER_20", + "mom_acceleration_10_20", + "drawdown_from_high_60", + "up_days_ratio_20", + # ================= 2. 波动率、风险调整与高阶矩 ================= + "volatility_5", + "volatility_20", + "volatility_ratio", + "std_return_20", + "sharpe_ratio_20", + "min_ret_20", + "volatility_squeeze_5_60", + # ================= 3. 日内微观结构与异象 ================= + "overnight_intraday_diff", + "upper_shadow_ratio", + "capital_retention_20", + "max_ret_20", + # ================= 4. 量能、流动性与量价背离 ================= + "volume_ratio_5_20", + "turnover_rate_mean_5", + "turnover_deviation", + "amihud_illiq_20", + "turnover_cv_20", + "pv_corr_20", + "close_vwap_deviation", + # ================= 5. 基本面财务特征 ================= + "roe", + "roa", + "profit_margin", + "debt_to_equity", + "current_ratio", + "net_profit_yoy", + "revenue_yoy", + "healthy_expansion_velocity", + # ================= 6. 基本面估值与截面动量共振 ================= + "EP", + "BP", + "CP", + "market_cap_rank", + "turnover_rank", + "return_5_rank", + "EP_rank", + "pe_expansion_trend", + "value_price_divergence", + "active_market_cap", + "ebit_rank", +] + +# 因子定义字典(完整因子库) +FACTOR_DEFINITIONS = { # ================= 1. 价格、趋势与路径依赖 (Trend, Momentum & Path Dependency) ================= "ma_5": "ts_mean(close, 5)", "ma_20": "ts_mean(close, 20)", @@ -284,7 +355,9 @@ engine = FactorEngine(metadata_path="data/factors.jsonl") # 2. 使用 metadata 定义因子 print("\n[2] 定义因子(从 metadata 注册)") -feature_cols = create_factors_with_metadata(engine, FACTOR_DEFINITIONS, LABEL_FACTOR) +feature_cols = create_factors_with_metadata( + engine, SELECTED_FACTORS, FACTOR_DEFINITIONS, LABEL_FACTOR +) target_col = LABEL_NAME # 3. 准备数据(使用模块级别的日期配置) diff --git a/src/factors/engine/factor_engine.py b/src/factors/engine/factor_engine.py index a931d0f..f7452e4 100644 --- a/src/factors/engine/factor_engine.py +++ b/src/factors/engine/factor_engine.py @@ -81,12 +81,16 @@ class FactorEngine: self._registry = registry if registry is not None else FunctionRegistry() self._parser = FormulaParser(self._registry) - # 初始化 metadata 管理器(可选) - self._metadata: Optional["FactorManager"] = None + # 初始化 metadata 管理器(可选,默认启用) if metadata_path is not None: from src.factors.metadata import FactorManager self._metadata = FactorManager(metadata_path) + else: + # 使用 FactorManager 的默认路径 + from src.factors.metadata import FactorManager + + self._metadata = FactorManager() def register( self, @@ -128,22 +132,68 @@ class FactorEngine: return self + def _add_factor_from_metadata( + self, + name: str, + factor_name_in_metadata: str, + data_specs: Optional[List[DataSpec]] = None, + ) -> "FactorEngine": + """从 metadata 中查询并注册因子(内部方法)。 + + Args: + name: 要注册的因子名称(引擎中使用的名称) + factor_name_in_metadata: metadata 中的因子名称 + data_specs: 可选的数据规格 + + Returns: + self,支持链式调用 + + Raises: + RuntimeError: 当引擎未配置 metadata 路径时 + ValueError: 当在 metadata 中未找到因子时 + FormulaParseError: 当 DSL 表达式解析失败时 + """ + if self._metadata is None: + raise RuntimeError( + "引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数," + + "例如:FactorEngine(metadata_path='data/factors.jsonl')" + ) + + # 从 metadata 查询因子 + df = self._metadata.get_factors_by_name(factor_name_in_metadata) + + if len(df) == 0: + raise ValueError( + f"在 metadata 中未找到因子 '{factor_name_in_metadata}'。" + + "请确认因子名称正确,或先使用 FactorManager 添加该因子。" + ) + + # 获取 DSL 表达式 + dsl_expr = df["dsl"][0] + + # 解析表达式为 Node + node = self._parser.parse(dsl_expr) + + # 委托给 register 方法 + return self.register(name, node, data_specs) + def add_factor( self, name: str, - expression: Union[str, Node], + expression: Optional[Union[str, Node]] = None, data_specs: Optional[List[DataSpec]] = None, ) -> "FactorEngine": - """注册因子(支持字符串或 Node 表达式)。 + """注册因子(支持多种调用方式)。 - 这是 register 方法的增强版,支持字符串表达式解析。 - 向后兼容:register 方法保持不变,继续只接受 Node 类型。 + 这是 register 方法的增强版,支持以下调用方式: + 1. 传入 name 和 expression:直接注册表达式(字符串或 Node) + 2. 只传入 name:从 metadata 中查询表达式并注册 遵循 Fail-Fast 原则:字符串表达式会立即解析,失败时立即抛出异常。 Args: - name: 因子名称 - expression: 字符串表达式或 Node 对象 + name: 因子名称(引擎中使用的名称) + expression: 字符串表达式或 Node 对象,为 None 时从 metadata 查询 data_specs: 可选的数据规格 Returns: @@ -152,19 +202,21 @@ class FactorEngine: Raises: TypeError: 当 expression 类型不支持时 FormulaParseError: 当字符串解析失败时(立即报错) + RuntimeError: 当 expression 为 None 但未配置 metadata 时 + ValueError: 当在 metadata 中未找到因子时 Example: >>> engine = FactorEngine() >>> - >>> # 字符串方式(新功能) + >>> # 方式1:字符串表达式 >>> engine.add_factor("ma20", "ts_mean(close, 20)") >>> - >>> # Node 方式(与 register 相同) + >>> # 方式2:Node 表达式 >>> from src.factors.api import close, ts_mean >>> engine.add_factor("ma20", ts_mean(close, 20)) >>> - >>> # 复杂表达式 - >>> engine.add_factor("alpha1", "cs_rank(close / open)") + >>> # 方式3:从 metadata 查询(需要初始化时配置 metadata_path) + >>> engine.add_factor("return_5") # 从 metadata 查询名为 return_5 的因子 >>> >>> # 链式调用 >>> (engine @@ -172,6 +224,10 @@ class FactorEngine: ... .add_factor("ma10", "ts_mean(close, 10)") ... .add_factor("golden_cross", "ma5 > ma10")) """ + if expression is None: + # 从 metadata 查询表达式 + return self._add_factor_from_metadata(name, name, data_specs) + if isinstance(expression, str): # Fail-Fast:立即解析,失败立即报错 node = self._parser.parse(expression) @@ -185,76 +241,6 @@ class FactorEngine: # 委托给现有的 register 方法 return self.register(name, node, data_specs) - def add_factor_by_name( - self, - name: str, - factor_name_in_metadata: Optional[str] = None, - data_specs: Optional[List[DataSpec]] = None, - ) -> "FactorEngine": - """根据 metadata 中的因子名称注册因子。 - - 从 metadata 管理器中根据因子名称查询 DSL 表达式, - 然后解析并注册到引擎中。 - - Args: - name: 要注册的因子名称(引擎中使用的名称) - factor_name_in_metadata: metadata 中的因子名称, - 为 None 时默认使用 name 参数 - data_specs: 可选的数据规格 - - Returns: - self,支持链式调用 - - Raises: - RuntimeError: 当引擎未配置 metadata 路径时 - ValueError: 当在 metadata 中未找到因子时 - FormulaParseError: 当 DSL 表达式解析失败时 - - Example: - >>> # 初始化时启用 metadata - >>> engine = FactorEngine(metadata_path="data/factors.jsonl") - >>> - >>> # 注册 metadata 中的因子(使用相同名称) - >>> engine.add_factor_by_name("return_5") - >>> - >>> # 使用不同名称注册 - >>> engine.add_factor_by_name("my_mom", "momentum_5d") - >>> - >>> # 链式调用 - >>> (engine - ... .add_factor_by_name("ma20") - ... .add_factor_by_name("rsi14") - ... .compute(["ma20", "rsi14"], "20240101", "20240131")) - """ - if self._metadata is None: - raise RuntimeError( - "引擎未配置 metadata 路径。请在初始化时传入 metadata_path 参数," - + "例如:FactorEngine(metadata_path='data/factors.jsonl')" - ) - - # 使用传入的名称或默认使用 name - query_name = ( - factor_name_in_metadata if factor_name_in_metadata is not None else name - ) - - # 从 metadata 查询因子 - df = self._metadata.get_factors_by_name(query_name) - - if len(df) == 0: - raise ValueError( - f"在 metadata 中未找到因子 '{query_name}'。" - + "请确认因子名称正确,或先使用 FactorManager 添加该因子。" - ) - - # 获取 DSL 表达式 - dsl_expr = df["dsl"][0] - - # 解析表达式为 Node - node = self._parser.parse(dsl_expr) - - # 委托给 register 方法 - return self.register(name, node, data_specs) - def compute( self, factor_names: Union[str, List[str]], diff --git a/src/factors/metadata/manager.py b/src/factors/metadata/manager.py index dca2988..5942d5f 100644 --- a/src/factors/metadata/manager.py +++ b/src/factors/metadata/manager.py @@ -53,23 +53,32 @@ class FactorManager: _conn: DuckDB连接对象(懒加载) Example: - >>> manager = FactorManager("data/factors.jsonl") + >>> manager = FactorManager() # 使用默认路径 >>> df = manager.get_factors_by_name("mom_5d") >>> print(df["dsl"][0]) """ - def __init__(self, filepath: str) -> None: + _DEFAULT_FILENAME = "factors.jsonl" + + def __init__(self, filepath: Optional[str] = None) -> None: """初始化因子管理器。 如果文件不存在,会自动创建空的JSONL文件。 Args: - filepath: JSONL文件路径(相对或绝对路径) + filepath: JSONL文件路径(相对或绝对路径),为None时使用默认路径 Raises: FileOperationError: 当文件创建失败时 """ - self.filepath = Path(filepath).resolve() + if filepath is None: + # 使用默认路径:从配置读取数据目录 + from src.config.settings import settings + + self.filepath = settings.data_path_resolved / self._DEFAULT_FILENAME + else: + self.filepath = Path(filepath).resolve() + self._conn: Optional[duckdb.DuckDBPyConnection] = None # 确保文件存在