feat(probe-selection): 添加探针法因子筛选模块
This commit is contained in:
93
src/experiment/probe_selection/noise_generator.py
Normal file
93
src/experiment/probe_selection/noise_generator.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""噪音生成器
|
||||
|
||||
使用 Polars 零拷贝方式注入随机噪音特征。
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
|
||||
class NoiseGenerator:
    """Add, detect and strip random-noise columns on a Polars DataFrame.

    Noise columns are drawn from a standard normal distribution, stored as
    ``Float32`` and named ``NOISE_PREFIX`` followed by a running index
    (``__noise__0``, ``__noise__1``, ...). They are built directly as Polars
    Series and attached with a single ``with_columns`` call, so the data
    never goes through pandas.
    """

    # Marker prefix: every helper in this class recognises a noise column
    # purely by its name starting with this string.
    NOISE_PREFIX = "__noise__"

    def __init__(self, random_state: int = 42):
        """Create a generator.

        Args:
            random_state: Default seed used by ``generate_noise`` when no
                explicit ``seed`` is passed, so results are reproducible.
        """
        self.random_state = random_state

    def generate_noise(
        self,
        df: pl.DataFrame,
        n_noise: int,
        # Quoted so the union syntax does not need Python 3.10 at runtime.
        seed: "int | None" = None,
    ) -> pl.DataFrame:
        """Return ``df`` with ``n_noise`` standard-normal noise columns added.

        Args:
            df: Source data; it is not modified, a new frame is returned.
            n_noise: Number of noise columns to add. Zero or a negative
                value adds nothing.
            seed: Seed for this call. ``None`` (the default) falls back to
                ``self.random_state``.

        Returns:
            A DataFrame containing the columns of ``df`` plus the noise
            columns ``__noise__0`` .. ``__noise__{n_noise - 1}``. A column
            of ``df`` that already carries one of those names is replaced,
            as ``with_columns`` does for any existing name.
        """
        if seed is None:
            seed = self.random_state

        # A private RandomState yields the same value stream as
        # ``np.random.seed(seed)`` + ``np.random.randn`` but leaves NumPy's
        # process-wide RNG untouched for the rest of the program.
        rng = np.random.RandomState(seed)
        n_rows = df.height

        # Columns are drawn sequentially from the one stream, so column i
        # depends on the seed and on how many columns precede it.
        noise_series = [
            pl.Series(
                f"{self.NOISE_PREFIX}{i}",
                rng.randn(n_rows).astype(np.float32),
                dtype=pl.Float32,
            )
            for i in range(n_noise)
        ]

        # One with_columns call attaches every noise column at once.
        return df.with_columns(noise_series)

    def remove_noise(self, df: pl.DataFrame) -> pl.DataFrame:
        """Return ``df`` without its noise columns.

        Args:
            df: Data that may contain noise columns.

        Returns:
            A DataFrame with every column whose name starts with
            ``NOISE_PREFIX`` dropped.
        """
        return df.drop(self.get_noise_columns(df))

    def get_noise_columns(self, df: pl.DataFrame) -> list[str]:
        """List the names of all noise columns in ``df``.

        Args:
            df: Data to inspect.

        Returns:
            Column names starting with ``NOISE_PREFIX``, in frame order.
        """
        return [col for col in df.columns if self.is_noise_column(col)]

    def is_noise_column(self, col_name: str) -> bool:
        """Tell whether ``col_name`` names a noise column.

        Args:
            col_name: Column name to check.

        Returns:
            ``True`` if the name starts with ``NOISE_PREFIX``.
        """
        return col_name.startswith(self.NOISE_PREFIX)
|
||||
Reference in New Issue
Block a user