parallel测试

2025-05-26 21:34:36 +08:00
parent a4b05bb62f
commit bf86fd9415
28 changed files with 15054 additions and 7886 deletions
--- a/main/factor/factor.py
+++ b/main/factor/factor.py
@@ -2,6 +2,9 @@ import numpy as np
 import pandas as pd
 import talib

+from pandarallel import pandarallel
+
+# Module-level side effect: runs pandarallel's initialisation whenever this
+# module is imported. NOTE(review): presumably this enables the
+# `parallel_apply` helpers on pandas objects; no call site is visible in this
+# hunk -- confirm it is actually needed at import time.
+pandarallel.initialize()

 def get_rolling_factor(df):
    old_columns = df.columns.tolist()[:]
@@ -2747,7 +2750,7 @@ def sentiment_panic_greed_index(df: pd.DataFrame, window_atr: int = 14, window_s
        print(f"Error: DataFrame 缺少必需的 OHLCV 列。将为 {factor_name} 填充 NaN。")
        df[factor_name] = np.nan
        return
-
+    
    try:
        df['_prev_close'] = df['close'].shift(1)

@@ -2848,4 +2851,186 @@ def sentiment_reversal_indicator(df: pd.DataFrame, window_ret: int = 5, window_v
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)
        print(f"Finished {factor_name}.")
-        return df
+        return df
+
+
+def price_minus_deduction_price(df, n=10):
+    """
+    Factor 1 (quantitative): close minus the moving-average "deduction price".
+
+    The deduction price is the oldest close still inside the current n-period
+    moving-average window, i.e. the close ``n - 1`` rows earlier within the
+    same ``ts_code`` group. A positive value means the current close is above
+    the price about to drop out of the window, so the next average tends to
+    rise if the price stays unchanged.
+
+    Args:
+        df (pd.DataFrame): Input data; must contain 'ts_code' and 'close'.
+            Modified in place. Assumes rows are in chronological order within
+            each 'ts_code' -- TODO confirm against the caller.
+        n (int): Moving-average period; must be greater than 1.
+
+    Returns:
+        pd.DataFrame: The same ``df`` object with the column
+        'price_minus_deduction_price_{n}' added. If 'close' is missing the
+        frame is returned untouched; if ``n <= 1`` the column is all NaN.
+    """
+    if 'close' not in df.columns:
+        print("错误: DataFrame中没有'close'列，无法计算抵扣价相关因子。")
+        return df
+
+    factor_col = f'price_minus_deduction_price_{n}'
+
+    if n <= 1:
+        print("错误: 均线周期 n 必须大于 1。")
+        df[factor_col] = np.nan
+        return df
+
+    # For SMA(n) at time t the window is P_{t-n+1} .. P_t, so the value that
+    # leaves the window next is P_{t-(n-1)}: a per-stock shift of n - 1 rows.
+    lagged_close = df.groupby('ts_code', group_keys=False)['close'].shift(n - 1)
+
+    df[factor_col] = df['close'].sub(lagged_close)
+
+    print(f"因子 {factor_col} 计算完成。")
+    return df
+
+def price_deduction_price_diff_ratio_to_sma(df, n=10):
+    """
+    Factor 2 (quantitative): (close - deduction price) relative to the n-period SMA.
+
+    Expresses how far the current close sits above or below the deduction
+    price (the close ``n - 1`` rows earlier within the same ``ts_code``) as a
+    fraction of the n-period simple moving average of the close.
+
+    Args:
+        df (pd.DataFrame): Input data; must contain 'ts_code' and 'close'.
+            Modified in place. Assumes rows are in chronological order within
+            each 'ts_code' -- TODO confirm against the caller.
+        n (int): Moving-average period; must be greater than 1.
+
+    Returns:
+        pd.DataFrame: The same ``df`` object with the column
+        'price_deduction_price_diff_ratio_to_sma_{n}' added. The value is NaN
+        wherever the SMA or the deduction price is unavailable, or the SMA is
+        exactly zero. If 'close' is missing the frame is returned untouched;
+        if ``n <= 1`` the column is all NaN.
+    """
+    if 'close' not in df.columns:
+        print("错误: DataFrame中没有'close'列，无法计算抵扣价相关因子。")
+        return df
+
+    factor_col = f'price_deduction_price_diff_ratio_to_sma_{n}'
+
+    if n <= 1:
+        print("错误: 均线周期 n 必须大于 1。")
+        df[factor_col] = np.nan
+        return df
+
+    close_by_code = df.groupby('ts_code', group_keys=False)['close']
+
+    # transform() keeps the per-stock rolling mean aligned with df's index.
+    moving_avg = close_by_code.transform(lambda s: s.rolling(window=n).mean())
+
+    # Difference between the current close and the deduction price.
+    gap = df['close'] - close_by_code.shift(n - 1)
+
+    # Mask zero averages to NaN first so the division yields NaN there
+    # instead of +/-inf.
+    df[factor_col] = gap / moving_avg.where(moving_avg != 0)
+
+    print(f"因子 {factor_col} 计算完成。")
+    return df
+
+
+def cat_price_vs_sma_vs_deduction_price(df, n=10):
+    """
+    Factor 3 (categorical): position of close and deduction price versus the SMA.
+
+    Compares the current close and the deduction price (the close ``n - 1``
+    rows earlier within the same ``ts_code``) with the n-period simple moving
+    average of the close, and encodes the combination as an integer.
+
+    Categories (evaluated in this order, first match wins):
+        1: close >  SMA and deduction price >  SMA
+        2: close <  SMA and deduction price <  SMA
+        3: close >  SMA and deduction price <= SMA
+        4: close <= SMA and deduction price >  SMA
+        0: none of the above. This covers rows where the SMA or the deduction
+           price is NaN (not enough history). Note that rows where the close
+           equals the SMA while the deduction price is at or below it, or
+           where the close is below the SMA and the deduction price equals
+           it, also match no rule and therefore receive 0.
+
+    Args:
+        df (pd.DataFrame): Input data; must contain 'ts_code' and 'close'.
+            Modified in place. Assumes rows are in chronological order within
+            each 'ts_code' -- TODO confirm against the caller.
+        n (int): Moving-average period; must be greater than 1.
+
+    Returns:
+        pd.DataFrame: The same ``df`` object with the column
+        'cat_price_vs_sma_vs_deduction_price_{n}' added. If 'close' is missing
+        the frame is returned untouched; if ``n <= 1`` the column is all NaN
+        (not 0).
+    """
+    if 'close' not in df.columns:
+        print("错误: DataFrame中没有'close'列，无法计算抵扣价相关因子。")
+        return df
+
+    factor_col = f'cat_price_vs_sma_vs_deduction_price_{n}'
+
+    if n <= 1:
+        print("错误: 均线周期 n 必须大于 1。")
+        df[factor_col] = np.nan
+        return df
+
+    close = df['close']
+    close_by_code = df.groupby('ts_code', group_keys=False)['close']
+    moving_avg = close_by_code.transform(lambda s: s.rolling(window=n).mean())
+    deduction = close_by_code.shift(n - 1)
+
+    close_above = close > moving_avg
+    deduction_above = deduction > moving_avg
+
+    # (category code, row mask) pairs. Any comparison involving NaN is False,
+    # so rows without enough history match nothing and fall to the default 0.
+    category_rules = [
+        (1, close_above & deduction_above),
+        (2, (close < moving_avg) & (deduction < moving_avg)),
+        (3, close_above & (deduction <= moving_avg)),
+        (4, (close <= moving_avg) & deduction_above),
+    ]
+
+    df[factor_col] = np.select(
+        [mask for _, mask in category_rules],
+        [code for code, _ in category_rules],
+        default=0,
+    )
+
+    print(f"因子 {factor_col} 计算完成。")
+    return df
+
+def cat_is_on_top_list(df: pd.DataFrame, top_list: pd.DataFrame) -> pd.DataFrame:
+    """
+    Keep the top-list reason code only on rows with a positive daily change.
+
+    The new column 'cat_is_on_top_list' equals 'cat_reason' where
+    'pct_chg' > 0 and is 0 everywhere else.
+
+    Args:
+        df (pd.DataFrame): Input data; must contain 'cat_reason' and
+            'pct_chg'. Modified in place.
+        top_list (pd.DataFrame): Not used by this function; kept so the
+            signature stays compatible with existing callers.
+
+    Returns:
+        pd.DataFrame: The same ``df`` object with 'cat_is_on_top_list' added.
+        If a required column is missing, a message is printed and ``df`` is
+        returned without any modification.
+    """
+    # Validate every input column before touching df, so a missing 'pct_chg'
+    # can no longer leave a half-written factor column behind a KeyError.
+    for required in ('cat_reason', 'pct_chg'):
+        if required not in df.columns:
+            print(f'计算因子cat_is_on_top_list失败，缺少{required}列')
+            return df
+
+    is_up_day = (df['pct_chg'] > 0).astype(int)
+    df['cat_is_on_top_list'] = df['cat_reason'] * is_up_day
+
+    return df
+
+
+def cat_reason(df: pd.DataFrame, top_list: pd.DataFrame) -> pd.DataFrame:
+    """
+    Encode the top-list 'reason' text as an integer factor and attach it to df.
+
+    Each distinct reason in ``top_list`` gets a code starting at 1, in order
+    of first appearance. The code is left-joined onto ``df`` by
+    ('ts_code', 'trade_date'); rows with no top-list entry get 0.
+
+    When ``top_list`` holds several entries for the same
+    ('ts_code', 'trade_date'), only the first one is used, so the result
+    always has exactly one row per row of ``df``.
+
+    Args:
+        df (pd.DataFrame): Stock data; must contain 'ts_code' and
+            'trade_date'. Its 'trade_date' column is converted to datetime
+            in place (existing behaviour, kept for callers that rely on it).
+        top_list (pd.DataFrame): Top-list data; must contain 'ts_code',
+            'trade_date' and 'reason'. It is not modified.
+
+    Returns:
+        pd.DataFrame: A new frame with all columns of ``df`` plus an integer
+        'cat_reason' column. A 'cat_reason' column already present in ``df``
+        is replaced.
+    """
+    # Number the distinct reasons 1..k in order of first appearance.
+    unique_reasons = top_list['reason'].unique()
+    reason_mapping = {reason: i + 1 for i, reason in enumerate(unique_reasons)}
+
+    # Build the lookup table on a copy so the caller's top_list is untouched.
+    lookup = top_list[['ts_code', 'trade_date']].copy()
+    lookup['trade_date'] = pd.to_datetime(lookup['trade_date'])
+    lookup['cat_reason'] = top_list['reason'].map(reason_mapping).astype('Int64')
+
+    # One row per key: duplicate keys on the right side of a left merge
+    # would otherwise duplicate the matching rows of df.
+    lookup = lookup.drop_duplicates(subset=['ts_code', 'trade_date'], keep='first')
+
+    # Both sides need the same dtype on the join key.
+    df['trade_date'] = pd.to_datetime(df['trade_date'])
+
+    # Drop a stale factor column so the merge does not emit suffixed
+    # 'cat_reason_x' / 'cat_reason_y' columns.
+    base = df.drop(columns=['cat_reason'], errors='ignore')
+    merged_df = base.merge(lookup, on=['ts_code', 'trade_date'], how='left')
+
+    # Rows absent from the top list get 0; store as a plain int column.
+    merged_df['cat_reason'] = merged_df['cat_reason'].fillna(0).astype('int')
+
+    return merged_df