# 联合中性化处理 (joint industry + size neutralization)
# 假设 df 含列:['stock', 'date', 'factor', 'industry', 'mkt_cap']
def neutralize_factor(df, factor_col='factor', industry_col='industry', size_col='mkt_cap'):
    """
    Neutralize one cross-section of factor values against industry and size.

    Regresses ``df[factor_col]`` by ordinary least squares on an intercept,
    ln(``df[size_col]``) and industry dummy variables (first category
    dropped), then stores the residuals in a new column named
    ``f'{factor_col}_neutral'``.

    A row is excluded from the regression, and receives NaN in the output
    column, when any of the following holds:

    * the factor value is missing or non-finite;
    * the industry label is missing;
    * the size value is missing or not strictly positive (no real logarithm).

    Args:
        df: Single-period cross-section holding the three columns below.
        factor_col: Name of the raw factor column (regression target).
        industry_col: Name of the industry label column.
        size_col: Name of the market-capitalisation column.

    Returns:
        The same ``df`` object; the residual column is added to it in place.
    """
    n_rows = len(df)

    # Regression target as a plain float array (pd.NA / None become NaN).
    target = df[factor_col].to_numpy(dtype=float, na_value=np.nan)

    # ln(size) only where size > 0; everything else stays NaN so the row is
    # masked out below instead of feeding -inf / NaN into the fit.
    size = df[size_col].to_numpy(dtype=float, na_value=np.nan)
    log_size = np.full(n_rows, np.nan)
    positive = size > 0
    log_size[positive] = np.log(size[positive])

    # dtype=float is required: the default bool dummies would turn the
    # combined design matrix into object dtype.
    industry_dummies = pd.get_dummies(
        df[industry_col], prefix='ind', drop_first=True, dtype=float
    )

    # Design matrix: intercept, ln(size), industry dummies.
    design = np.column_stack(
        [np.ones(n_rows), log_size, industry_dummies.to_numpy(dtype=float)]
    )

    # A missing industry produces an all-zero dummy row, which would silently
    # assign the stock to the dropped reference industry; exclude it instead.
    usable = (
        np.isfinite(target)
        & np.isfinite(design).all(axis=1)
        & df[industry_col].notna().to_numpy()
    )

    residual = np.full(n_rows, np.nan)
    if usable.any():
        # lstsq returns the minimum-norm least-squares solution, so the
        # residuals are well defined even when the design is rank deficient
        # (e.g. an industry whose rows were all masked out).
        coef, *_ = np.linalg.lstsq(design[usable], target[usable], rcond=None)
        residual[usable] = target[usable] - design[usable] @ coef

    # Positional assignment: does not rely on index alignment, so duplicate
    # index labels are fine.
    df[f'{factor_col}_neutral'] = residual
    return df
# ----------------------------
# 多期中性化(按时间分组)
# ----------------------------
# Run the single-period neutralization once per date. group_keys=False is
# passed so the group key is not added to the result's index.
# NOTE(review): newer pandas releases deprecate groupby.apply operating on the
# grouping column ('date') -- confirm behaviour on the pandas version in use.
df_neutralized = df.groupby('date', group_keys=False).apply(
    lambda cross_section: neutralize_factor(
        cross_section,
        factor_col='factor',
        industry_col='industry',
        size_col='mkt_cap',
    )
)
# 2025-11-07