A Brief Introduction to Building the Model
First, you need to install the xgboost and DEAP libraries. Since pip can be slow with the default index, you can install them quickly from the Tsinghua mirror by running commands of the following form:
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple <package>
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple xgboost
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple deap
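After installing, an optional sanity check confirms that both packages import cleanly; this assumes both expose __version__, which recent releases do:
python -c "import xgboost, deap; print(xgboost.__version__, deap.__version__)"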
Then we can start writing the code. First, import the necessary libraries:
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
import numpy as np
from deap import base, creator, tools, algorithms
import math
Next, load the Boston housing dataset:
boston = load_boston()
X, y = boston.data, boston.target
Because scikit-learn 1.2.0 and later no longer support load_boston, use scikit-learn 1.1.1 instead; installing it will automatically uninstall the existing version:
pip install scikit-learn==1.1.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
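Alternatively, if you would rather not pin an older scikit-learn, the same data can be loaded straight from the original source, which is exactly what the wrapped-up version at the end of this post does; a minimal sketch:
import pandas as pd
import numpy as np
raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston", sep=r"\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # 13 features per house
y = raw_df.values[1::2, 2]  # median house value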
Then, define the objective function, i.e., the cross-validation error of the XGBoost model:
def eval_xgb(individual):
    # Keep colsample_bytree within [0, 1]
    individual[6] = math.fabs(individual[6])
    while individual[6] > 1:
        individual[6] = individual[6] - 1
    params = {
        'max_depth': math.ceil(individual[0]),      # round up to an integer
        'learning_rate': math.fabs(individual[1]),  # take the absolute value
        'n_estimators': individual[2],
        'gamma': individual[3],
        'min_child_weight': individual[4],
        'subsample': individual[5],
        'colsample_bytree': individual[6],
        'objective': 'reg:squarederror'
    }
    cv_results = xgb.cv(params=params, dtrain=dtrain, num_boost_round=100,
                        nfold=5, metrics='rmse', early_stopping_rounds=10, seed=0)
    # DEAP expects a tuple of fitness values, hence the trailing comma
    return cv_results['test-rmse-mean'][len(cv_results) - 1],
The objective function takes an individual (i.e., one set of hyperparameters) as input and returns that individual's cross-validation error.
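As a quick sanity check (my addition, not part of the original post), you can call eval_xgb on a hand-written individual; the values below are hypothetical, and dtrain must already exist (it is also created in the run section further down):
dtrain = xgb.DMatrix(X, label=y)
sample_ind = [5.3, 0.1, 120, 0.2, 3.0, 0.8, 0.7]  # hypothetical [max_depth, lr, n_estimators, gamma, min_child_weight, subsample, colsample_bytree]
print(eval_xgb(sample_ind))  # a one-element tuple: (mean test RMSE,) -- DEAP expects fitness values as sequences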
Next, define the parameters and operators of the genetic algorithm:
creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
creator.create('Individual', list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register('attr_max_depth', np.random.randint, 1, 10)
toolbox.register('attr_learning_rate', np.random.uniform, 0.01, 0.3)
toolbox.register('attr_n_estimators', np.random.randint, 50, 200)
toolbox.register('attr_gamma', np.random.uniform, 0, 1)
toolbox.register('attr_min_child_weight', np.random.uniform, 0.1, 10)
toolbox.register('attr_subsample', np.random.uniform, 0.5, 1)
toolbox.register('attr_colsample_bytree', np.random.uniform, 0.5, 1)
toolbox.register('individual', tools.initCycle, creator.Individual, (
    toolbox.attr_max_depth,
    toolbox.attr_learning_rate,
    toolbox.attr_n_estimators,
    toolbox.attr_gamma,
    toolbox.attr_min_child_weight,
    toolbox.attr_subsample,
    toolbox.attr_colsample_bytree), n=1)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)
toolbox.register('evaluate', eval_xgb)
toolbox.register('mate', tools.cxUniform, indpb=0.1)
toolbox.register('mutate', tools.mutGaussian, mu=0, sigma=0.1, indpb=0.1)
toolbox.register('select', tools.selTournament, tournsize=3)
Here we use random number generators to initialize each parameter, and we define the crossover and mutation operators.
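To see what the encoding looks like before evolution starts, you can draw one individual from the toolbox (a sketch; the exact numbers depend on the random state):
ind = toolbox.individual()
print(ind)                # a 7-gene list, one gene per hyperparameter, in the order registered above
print(ind.fitness.valid)  # False -- the individual has not been evaluated yet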
Finally, run the genetic algorithm:
np.random.seed(0)
dtrain = xgb.DMatrix(X, label=y)
pop = toolbox.population(n=50)
hof = tools.HallOfFame(1)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register('avg', np.mean)
stats.register('min', np.min)
pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, stats=stats, halloffame=hof, verbose=True)
best_ind = hof[0]
print('Best individual:', best_ind)
print('Best RMSE:', best_ind.fitness.values[0])
Here we use the eaSimple function to run the genetic algorithm, setting the crossover and mutation probabilities via cxpb and mutpb. When the run finishes, we get the best individual and its corresponding RMSE.
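Note that the raw best individual still needs the same transforms used inside eval_xgb before its values are valid XGBoost parameters. A minimal follow-up sketch (my addition, not from the original post) that trains a final booster on the full data:
best_params = {
    'max_depth': math.ceil(best_ind[0]),      # same rounding as in eval_xgb
    'learning_rate': math.fabs(best_ind[1]),  # same absolute value as in eval_xgb
    'gamma': best_ind[3],
    'min_child_weight': best_ind[4],
    'subsample': best_ind[5],
    'colsample_bytree': best_ind[6],
    'objective': 'reg:squarederror'
}
final_model = xgb.train(best_params, dtrain, num_boost_round=100)  # xgb.train takes num_boost_round rather than n_estimators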
Wrapping the Model into a Function
Personally tested and working; feel free to leave a comment if you run into problems.
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
import numpy as np
from deap import base, creator, tools, algorithms
import math
def optimize_xgb(X, y, n_pop=5, n_gen=5, cxpb=0.5, mutpb=0.2):
    # Objective function: takes an individual (one set of hyperparameters)
    # and returns its cross-validation error on the training data.
    def eval_xgb(individual):
        # Keep colsample_bytree within [0, 1]
        individual[6] = math.fabs(individual[6])
        while individual[6] > 1:
            individual[6] = individual[6] - 1
        params = {
            'max_depth': math.ceil(individual[0]),      # round up to an integer
            'learning_rate': math.fabs(individual[1]),  # take the absolute value
            'n_estimators': individual[2],
            'gamma': individual[3],
            'min_child_weight': individual[4],
            'subsample': individual[5],
            'colsample_bytree': individual[6],
            'objective': 'reg:squarederror'
        }
        cv_results = xgb.cv(params=params, dtrain=dtrain, num_boost_round=100,
                            nfold=5, metrics='rmse', early_stopping_rounds=10, seed=0)
        return cv_results['test-rmse-mean'][len(cv_results) - 1],

    # Genetic algorithm setup: random initializers for each hyperparameter,
    # plus the crossover, mutation, and selection operators.
    creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
    creator.create('Individual', list, fitness=creator.FitnessMin)
    toolbox = base.Toolbox()
    toolbox.register('attr_max_depth', np.random.randint, 1, 10)
    toolbox.register('attr_learning_rate', np.random.uniform, 0.01, 0.3)
    toolbox.register('attr_n_estimators', np.random.randint, 50, 200)
    toolbox.register('attr_gamma', np.random.uniform, 0, 1)
    toolbox.register('attr_min_child_weight', np.random.uniform, 0.1, 10)
    toolbox.register('attr_subsample', np.random.uniform, 0.5, 1)
    toolbox.register('attr_colsample_bytree', np.random.uniform, 0.5, 1)
    toolbox.register('individual', tools.initCycle, creator.Individual, (
        toolbox.attr_max_depth,
        toolbox.attr_learning_rate,
        toolbox.attr_n_estimators,
        toolbox.attr_gamma,
        toolbox.attr_min_child_weight,
        toolbox.attr_subsample,
        toolbox.attr_colsample_bytree), n=1)
    toolbox.register('population', tools.initRepeat, list, toolbox.individual)
    toolbox.register('evaluate', eval_xgb)
    toolbox.register('mate', tools.cxUniform, indpb=0.1)
    toolbox.register('mutate', tools.mutGaussian, mu=0, sigma=0.1, indpb=0.1)
    toolbox.register('select', tools.selTournament, tournsize=3)

    # Run the GA with eaSimple, using the given crossover and mutation
    # probabilities; afterwards the hall of fame holds the best individual
    # and its RMSE.
    np.random.seed(0)
    dtrain = xgb.DMatrix(X, label=y)
    pop = toolbox.population(n=n_pop)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register('avg', np.mean)
    stats.register('min', np.min)
    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=cxpb, mutpb=mutpb,
                                       ngen=n_gen, stats=stats, halloffame=hof, verbose=True)
    best_ind = hof[0]
    best_params = {
        'max_depth': math.ceil(best_ind[0]),      # apply the same transforms used in eval_xgb
        'learning_rate': math.fabs(best_ind[1]),
        'n_estimators': best_ind[2],
        'gamma': best_ind[3],
        'min_child_weight': best_ind[4],
        'subsample': best_ind[5],
        'colsample_bytree': best_ind[6],
        'objective': 'reg:squarederror'
    }
    best_rmse = best_ind.fitness.values[0]
    print('Best individual:', best_ind)
    print('Best RMSE:', best_rmse)
    return best_params, best_rmse
# Here the whole procedure is wrapped into a single function, optimize_xgb. It
# takes the training data X and labels y, plus the GA settings n_pop, n_gen,
# cxpb, and mutpb, and returns the best XGBoost parameters and their RMSE.
import pandas as pd
import numpy as np

# Best individual: [8.048710533322954, 0.0867211275103418, 153, 0.45615033221654855, 5.72749609379962, 0.5093949002181776, 0.8088177485379385]
# Best RMSE: 3.4154928196132395
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
# With older scikit-learn this used to be:
# boston = load_boston()
# X, y = boston.data, boston.target

optimize_xgb(data, target)
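As a possible extension (my addition, not in the original), you could tune on a training split and check generalization on held-out data, reusing the mean_squared_error import from above:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(data, target, test_size=0.2, random_state=0)
best_params, best_rmse = optimize_xgb(X_tr, y_tr)
best_params.pop('n_estimators', None)  # xgb.train takes num_boost_round instead
booster = xgb.train(best_params, xgb.DMatrix(X_tr, label=y_tr), num_boost_round=100)
preds = booster.predict(xgb.DMatrix(X_te))
print('Test RMSE:', mean_squared_error(y_te, preds, squared=False))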