Data Preprocessing, Part 3: Variable Selection via Stepwise Selection (Forward, Backward, and Bidirectional) in Python
Introduction
Stepwise selection is a refinement of best-subset selection. It comes in three variants: forward selection, backward selection, and bidirectional selection. Bidirectional selection is the most widely used, since it balances model complexity against model accuracy. Stepwise regression is computationally heavy and has little ready-made support in Python (though see the scikit-learn and mlxtend helpers noted below), so it is not used all that often. The usual scoring criteria are AIC, BIC, and $R^2$. statsmodels does expose AIC and BIC on fitted regression results, but they are awkward to wire into a hand-rolled selection loop, so this post scores candidate models with $R^2$.
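For reference, a fitted statsmodels OLS result does carry both criteria as attributes; a minimal sketch on toy data:

import numpy as np
import statsmodels.api as sm

# Toy data, only to show where AIC/BIC live on a fitted OLS result.
rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(100, 3)))
y = X @ np.array([1.0, 2.0, 0.5, -1.0]) + rng.normal(size=100)
results = sm.OLS(y, X).fit()
print(results.aic, results.bic)  # Akaike / Bayesian information criteria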
1. Best-Subset Selection
(i) Denote by $M_0$ the model containing no features, and compute its test error.
(ii) Add one variable to $M_0$: fit all p one-feature models, compute each model's RSS, record the one with the smallest RSS as $M_1$, and compute its test error.
(iii) Continue with two features: fit all $\binom{p}{2}$ two-feature models, record the one with the smallest RSS as $M_2$, and compute its test error.
(iv) Repeat until the fitted model contains all p features, then choose, among the p+1 models $\{M_0, M_1, \dots, M_p\}$, the one with the smallest test error as the optimal model.
Best-subset selection is intuitive, but as the feature dimension p grows, the number of subsets to examine is $2^p$ (a basic counting fact), so it becomes extremely inefficient in both computation time and memory, which makes it impractical on large data. We therefore need faster alternatives, which is what stepwise selection provides.
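For modest p the exhaustive search is still feasible. Below is a minimal sketch of the procedure above, assuming an ordinary least-squares fit and RSS as the within-size criterion (best_subset and its return format are illustrative choices, not a library API):

from itertools import combinations

import numpy as np
from sklearn.linear_model import LinearRegression


def best_subset(X, y):
    """Exhaustive best-subset search: fits all 2^p - 1 non-empty subsets."""
    n_samples, p = X.shape
    best = {}  # size k -> (smallest RSS, feature index tuple)
    for k in range(1, p + 1):
        for subset in combinations(range(p), k):
            cols = list(subset)
            model = LinearRegression().fit(X[:, cols], y)
            rss = np.sum((y - model.predict(X[:, cols])) ** 2)
            if k not in best or rss < best[k][0]:
                best[k] = (rss, subset)
    # The per-size winners M_1..M_p would then be compared on test error (or AIC/BIC).
    return best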
2. Forward Stepwise Selection
Forward stepwise selection can be seen as best-subset selection that settles for a local optimum: each round it only searches for the single best variable to add.
(i) Denote by $M_0$ the model containing no features, and compute its test error.
(ii) Add one variable to $M_0$: fit the p candidate models, compute each model's RSS, record the one with the smallest RSS as $M_1$, and compute its test error.
(iii) Starting from the current smallest-RSS model, add one more variable, record the smallest-RSS model as $M_2$, and compute its test error.
(iv) Proceed in the same way until the fitted model contains all p features, then choose, among the p+1 models $\{M_0, M_1, \dots, M_p\}$, the one with the smallest test error as the optimal model. The implementation below stops early instead, as soon as adding a variable no longer improves the test-set $R^2$.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: admin
@file: forward_selection.py
@time: 2021/03/15
@desc: forward stepwise selection scored by held-out R^2
"""
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.datasets import load_boston  # removed in scikit-learn >= 1.2
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


def forward_select(train_data, test_data, target):
    """
    Forward stepwise regression.
    :param train_data: training DataFrame (features plus target column)
    :param test_data: test DataFrame used to score candidate models
    :param target: name of the target column
    """
    variate = set(train_data.columns)
    variate.remove(target)
    selected = []  # variables chosen so far
    # Initialize the coefficient of determination R^2 (the closer to 1, the better)
    cur_score, best_score = 0.0, 0.0
    # Keep selecting until every variable has been considered
    while variate:
        variate_r2 = []
        # Find the locally optimal variable to add
        for var in variate:
            selected.append(var)
            if len(selected) == 1:
                # A single column must be reshaped into a 2-D array for fit()
                model = Lasso().fit(train_data[selected[0]].values.reshape(-1, 1), train_data[target])
                y_pred = model.predict(test_data[selected[0]].values.reshape(-1, 1))
            else:
                model = Lasso().fit(train_data[selected], train_data[target])
                y_pred = model.predict(test_data[selected])
            R2 = r2_score(test_data[target], y_pred)
            variate_r2.append((R2, var))
            selected.remove(var)
        variate_r2.sort()  # ascending, so the best candidate ends up last
        best_score, best_var = variate_r2.pop()  # pop removes and returns the last element
        if cur_score < best_score:  # adding this variable improved the score
            variate.remove(best_var)
            selected.append(best_var)
            cur_score = best_score
            print("R2={},continue!".format(cur_score))
        else:
            print('for selection over!')
            break
    selected_features = '+'.join([str(i) for i in selected])
    print(selected_features)


def main():
    boston = load_boston()
    X = boston.data
    y = boston.target
    features = boston.feature_names
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    boston_train_data = pd.DataFrame(x_train, columns=features)
    boston_train_data["Price"] = y_train
    boston_test_data = pd.DataFrame(x_test, columns=features)
    boston_test_data["Price"] = y_test
    forward_select(boston_train_data, boston_test_data, 'Price')


if __name__ == '__main__':
    main()
R2=0.61744910032392,continue!
R2=0.6908671406351847,continue!
R2=0.7317782212152852,continue!
R2=0.7395157511526225,continue!
R2=0.7433588119420051,continue!
R2=0.7454229322919887,continue!
R2=0.7462568212024802,continue!
R2=0.7462857832907019,continue!
for selection over!
LSTAT+PTRATIO+RM+DIS+B+CRIM+INDUS+TAX
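For comparison, since version 0.24 scikit-learn ships a ready-made helper, sklearn.feature_selection.SequentialFeatureSelector, which performs the same kind of forward search but scores candidates by cross-validation on the training data rather than by a held-out test set. A minimal sketch, reusing the DataFrames from main() above (the target size of 8 features is an arbitrary illustrative choice):

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Lasso

# Forward stepwise selection scored by 5-fold cross-validated R^2
# (rather than the held-out test set used above).
X_train = boston_train_data.drop(columns='Price')
sfs = SequentialFeatureSelector(Lasso(), n_features_to_select=8,
                                direction='forward', scoring='r2', cv=5)
sfs.fit(X_train, boston_train_data['Price'])
print(X_train.columns[sfs.get_support()])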
3. Backward Stepwise Selection
Backward stepwise selection in brief:
- Start with all features in the model
- At each step, drop the least useful variable, i.e. the one whose removal leaves the best-performing model, and re-evaluate the model's performance
- Repeat until dropping any further variable no longer improves the model
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: admin
@file: backward_selection.py
@time: 2021/03/16
@desc: backward stepwise elimination scored by held-out R^2
"""
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.datasets import load_boston  # removed in scikit-learn >= 1.2
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


def backward_select(train_data, test_data, target):
    """
    Backward stepwise regression.
    :param train_data: training DataFrame (features plus target column)
    :param test_data: test DataFrame used to score candidate models
    :param target: name of the target column
    """
    variate = list(set(train_data.columns))
    variate.remove(target)
    selected = []  # variables removed so far
    # Initialize the coefficient of determination R^2 (the closer to 1, the better)
    cur_score, best_score = 0.0, 0.0
    # Keep eliminating until no removal improves the score
    while variate:
        variate_r2 = []
        # Find the locally optimal variable to drop
        # (iterate over a copy: removing from a list while iterating over it skips elements)
        for var in list(variate):
            variate.remove(var)
            if len(variate) == 1:
                # A single remaining column must be reshaped into a 2-D array for fit()
                model = Lasso().fit(train_data[variate[0]].values.reshape(-1, 1), train_data[target])
                y_pred = model.predict(test_data[variate[0]].values.reshape(-1, 1))
            else:
                model = Lasso().fit(train_data[variate], train_data[target])
                y_pred = model.predict(test_data[variate])
            R2 = r2_score(test_data[target], y_pred)
            variate_r2.append((R2, var))
            variate.append(var)  # put the candidate back
        variate_r2.sort()  # ascending, so the best candidate ends up last
        best_score, best_var = variate_r2.pop()  # pop removes and returns the last element
        if cur_score < best_score:  # dropping this variable improved the score
            variate.remove(best_var)
            selected.append(best_var)
            cur_score = best_score
            print("R2={},continue!".format(cur_score))
        else:
            print('for selection over!')
            break
    print(selected)
    # The kept features are everything except the target and the removed variables
    selected = [var for var in set(train_data.columns) if var not in selected and var != target]
    selected_features = '+'.join([str(i) for i in selected])
    print(selected_features)


def main():
    boston = load_boston()
    X = boston.data
    y = boston.target
    features = boston.feature_names
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    boston_train_data = pd.DataFrame(x_train, columns=features)
    boston_train_data["Price"] = y_train
    boston_test_data = pd.DataFrame(x_test, columns=features)
    boston_test_data["Price"] = y_test
    backward_select(boston_train_data, boston_test_data, 'Price')


if __name__ == '__main__':
    main()
R2=0.6130365918500247,continue!
R2=0.6206140392385366,continue!
R2=0.6206319773780711,continue!
R2=0.6216812478858313,continue!
R2=0.6217076288117218,continue!
for selection over!
['CHAS', 'AGE', 'INDUS', 'ZN', 'NOX']
TAX+RAD+DIS+PTRATIO+RM+LSTAT+CRIM+B
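The same scikit-learn helper also runs in reverse; a minimal sketch of the built-in backward variant, again reusing the DataFrames from main() above, with an arbitrary target size of 8 features:

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Lasso

# direction='backward' starts from the full feature set and drops one variable per step.
X_train = boston_train_data.drop(columns='Price')
sfs = SequentialFeatureSelector(Lasso(), n_features_to_select=8,
                                direction='backward', scoring='r2', cv=5)
sfs.fit(X_train, boston_train_data['Price'])
print(X_train.columns[sfs.get_support()])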
4. Bidirectional Selection
Bidirectional selection in brief:
It combines forward and backward selection, and is the most commonly used variant because it balances model complexity against model accuracy.
The scheme implemented here: take two forward steps, then one backward step, and keep alternating.
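Aside from the hand-rolled version below, the third-party mlxtend package offers a ready-made approximation: its SequentialFeatureSelector with floating=True performs sequential floating forward selection, a forward search with a conditional exclusion step after each inclusion, which is close in spirit to bidirectional selection. A minimal sketch, assuming mlxtend is installed and reusing the Boston DataFrames from the earlier examples:

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import Lasso

# floating=True adds a conditional exclusion step after each inclusion
# (sequential floating forward selection), a close cousin of bidirectional search.
X_train = boston_train_data.drop(columns='Price')
sfs = SFS(Lasso(), k_features=8, forward=True, floating=True, scoring='r2', cv=5)
sfs = sfs.fit(X_train, boston_train_data['Price'])
print(sfs.k_feature_names_)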
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: admin
@file: bidirectional_selection.py
@time: 2021/03/16
@desc: bidirectional stepwise selection scored by held-out R^2
"""
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.datasets import load_boston  # removed in scikit-learn >= 1.2
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


def bidirectional_select(train_data, test_data, target):
    """
    Bidirectional stepwise regression: two forward steps, then one backward step.
    :param train_data: training DataFrame (features plus target column)
    :param test_data: test DataFrame used to score candidate models
    :param target: name of the target column
    """
    variate = list(set(train_data.columns))
    variate.remove(target)
    selected = []  # variables added by the forward steps
    selected_h = []  # variables dropped by the backward steps
    # Initialize the coefficients of determination R^2 (the closer to 1, the better)
    cur_score_f, best_score_f = 0.0, 0.0
    cur_score_h, best_score_h = 0.0, 0.0
    # Alternate: two forward steps, then one backward step
    while variate:
        # --- two forward steps: find the locally optimal variable to add ---
        for i in range(2):
            variate_r2_f = []  # reset candidate scores for each forward step
            for var in list(variate):
                selected.append(var)
                if len(selected) == 1:
                    # A single column must be reshaped into a 2-D array for fit()
                    model = Lasso().fit(train_data[selected[0]].values.reshape(-1, 1), train_data[target])
                    y_pred = model.predict(test_data[selected[0]].values.reshape(-1, 1))
                else:
                    model = Lasso().fit(train_data[selected], train_data[target])
                    y_pred = model.predict(test_data[selected])
                R2 = r2_score(test_data[target], y_pred)
                variate_r2_f.append((R2, var))
                selected.remove(var)
            variate_r2_f.sort()  # ascending, so the best candidate ends up last
            best_score_f, best_var_f = variate_r2_f.pop()  # pop removes and returns the last element
            if cur_score_f < best_score_f:  # adding this variable improved the score, keep it
                variate.remove(best_var_f)
                selected.append(best_var_f)
                cur_score_f = best_score_f
                print("R2_f={},continue!".format(cur_score_f))
            else:  # no improvement: discard the candidate and stop the forward phase
                variate.remove(best_var_f)
                break
        # --- one backward step: find the locally optimal variable to drop ---
        # (iterate over a copy: removing from a list while iterating over it skips elements)
        variate_r2_h = []
        for var in list(variate):
            variate.remove(var)
            if len(variate) == 1:
                model = Lasso().fit(train_data[variate[0]].values.reshape(-1, 1), train_data[target])
                y_pred = model.predict(test_data[variate[0]].values.reshape(-1, 1))
            else:
                model = Lasso().fit(train_data[variate], train_data[target])
                y_pred = model.predict(test_data[variate])
            R2 = r2_score(test_data[target], y_pred)
            variate_r2_h.append((R2, var))
            variate.append(var)  # put the candidate back
        variate_r2_h.sort()  # ascending, so the best candidate ends up last
        best_score_h, best_var_h = variate_r2_h.pop()  # pop removes and returns the last element
        if cur_score_h < best_score_h:  # dropping this variable improved the score
            variate.remove(best_var_h)
            selected_h.append(best_var_h)
            cur_score_h = best_score_h
            print("R2_h={},continue!".format(cur_score_h))
        else:
            print('for selection over!')
            # The kept features are everything except the target and the dropped variables
            selected = [var for var in set(train_data.columns)
                        if var not in selected_h and var != target]
            selected_features = '+'.join([str(i) for i in selected])
            print(selected_features)
            break


def main():
    boston = load_boston()
    X = boston.data
    y = boston.target
    features = boston.feature_names
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    boston_train_data = pd.DataFrame(x_train, columns=features)
    boston_train_data["Price"] = y_train
    boston_test_data = pd.DataFrame(x_test, columns=features)
    boston_test_data["Price"] = y_test
    bidirectional_select(boston_train_data, boston_test_data, 'Price')


if __name__ == '__main__':
    main()
R2_f=0.5290772958895777,continue!
R2_f=0.5992603091580796,continue!
R2_h=0.6392096900660633,continue!
R2_f=0.6328497309792275,continue!
R2_f=0.6424099014083555,continue!
R2_h=0.6446960403771425,continue!
R2_f=0.6529845736263218,continue!
R2_f=0.6555371387702666,continue!
R2_h=0.6524813775669193,continue!
R2_f=0.6577033230821112,continue!
R2_f=0.6577063213485781,continue!
R2_h=0.6525859983540159,continue!
R2_f=0.6577196381996436,continue!
for selection over!
RM+CHAS+AGE+PTRATIO+TAX+NOX+CRIM+B+DIS
If this post helped you, please give it a like and a follow, it really matters to me! If you would like to follow each other, leave a comment!