去极值有多种方式,比如均值-标准差法、分位数法、MAD 法等。我自己实现的代码如下:
```
# 去极值:按均值-方差方式迭代去极值(3倍标准差)
def winsorize_std(data, iter_cnt=5, inplace=False):
    """Iteratively clip values to mean +/- 3 standard deviations.

    Infinite values are replaced with NaN first so they do not distort the
    mean and standard deviation. On every pass the bounds are recomputed from
    the current (already clipped) values and anything outside them is set to
    the nearest bound.

    Args:
        data: pandas Series of numeric values.
        iter_cnt: number of clipping passes to run.
        inplace: when True, ``data`` itself is modified and returned;
            otherwise a copy is processed and ``data`` is left untouched.

    Returns:
        The winsorized Series (the same object as ``data`` when
        ``inplace=True``).
    """
    series = data if inplace else data.copy()
    # Replace in place: rebinding to the result of ``replace`` would create a
    # new object and silently break the ``inplace=True`` contract.
    # Done once, before the loop: clipping never re-introduces infinities.
    series.replace([-np.inf, np.inf], np.nan, inplace=True)
    for _ in range(iter_cnt):
        std = series.std()
        mean = series.mean()
        lower = mean - 3 * std
        upper = mean + 3 * std
        series[series < lower] = lower  # clip the low tail
        series[series > upper] = upper  # clip the high tail
    return series
```
```
# 去极值:按分位数进行去极值
def winsorize_quantile(data, pvalue=0.05, inplace=False):
    """Clip values to the central ``1 - pvalue`` quantile range.

    Infinite values are replaced with NaN first. The lower bound is the
    ``pvalue / 2`` quantile and the upper bound is the ``1 - pvalue / 2``
    quantile; values outside are set to the nearest bound.

    Args:
        data: pandas Series of numeric values.
        pvalue: total tail probability to trim, split evenly between the
            two tails.
        inplace: when True, ``data`` itself is modified and returned;
            otherwise a copy is processed and ``data`` is left untouched.

    Returns:
        The winsorized Series (the same object as ``data`` when
        ``inplace=True``).
    """
    series = data if inplace else data.copy()
    # Replace in place so that ``inplace=True`` really mutates ``data``.
    series.replace([-np.inf, np.inf], np.nan, inplace=True)
    lowerbound = pvalue / 2.0
    upperbound = 1 - pvalue / 2.0
    lower = series.quantile(lowerbound)
    upper = series.quantile(upperbound)
    series[series < lower] = lower  # clip the low tail
    series[series > upper] = upper  # clip the high tail
    return series
```
```
# 去极值:使用 MAD 方式去极值
def winsorize_mad(data, num=3, inplace=False):
    """Clip values to ``median +/- num * 1.483 * MAD``.

    MAD is the median of the absolute deviations from the median. Infinite
    values are replaced with NaN first; values outside the band are set to
    the nearest bound.

    Args:
        data: pandas Series of numeric values.
        num: half-width of the band, in units of scaled MAD.
        inplace: when True, ``data`` itself is modified and returned;
            otherwise a copy is processed and ``data`` is left untouched.

    Returns:
        The winsorized Series (the same object as ``data`` when
        ``inplace=True``).
    """
    series = data if inplace else data.copy()
    # Replace in place so that ``inplace=True`` really mutates ``data``.
    series.replace([-np.inf, np.inf], np.nan, inplace=True)
    median = series.median()
    mad = (series - median).abs().median()
    # 1.483 is kept from the original code; it appears to be the usual
    # normal-consistency factor (~1.4826) that makes MAD comparable to a
    # standard deviation.
    width = mad * num * 1.483
    # The band must be centred on the median, not on zero.
    lower = median - width
    upper = median + width
    series[series < lower] = lower  # clip the low tail
    series[series > upper] = upper  # clip the high tail
    return series
```
2016-11-16