This is the last check-in task of the summer camp. Over the past seven days I have learned the basics of using Python code to apply machine learning to electric power output forecasting, so for my part the task is essentially complete. My Day 1 baseline returned a score of 373; after that I kept experimenting with the sample code in Task03, blending LightGBM, XGBoost, and CatBoost to produce the output. Using the feature extraction, data splitting, and feature engineering from Task03, my three-model blend scored 257.7, better than the single-LightGBM score of 259.9. Below is my code for the three-model fusion:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
import tqdm
import sys
import os
import gc
import argparse
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)
# Lag (history shift) features
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)
# Lag + difference features
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)
# Rolling-window statistics
for win in [15, 30, 50, 70]:
    data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').std().values
# Lag + rolling-window statistics
for win in [7, 14, 28, 35, 50, 70]:
    data[f'target_shift10_win{win}_mean'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_shift10_win{win}_max'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_shift10_win{win}_min'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_shift10_win{win}_sum'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').sum().values
    data[f'target_shift10_win{win}_std'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').std().values
# Split the data back into train and test
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)
# Select the input features
train_cols = [f for f in data.columns if f not in ['id', 'target']]
def train_cb_model(train_df, valid_df, test_df, cols):
    # CatBoost parameters
    cb_params = {
        'iterations': 2000,
        'learning_rate': 0.05,
        'depth': 6,
        'l2_leaf_reg': 3,
        'subsample': 0.8,
        'colsample_bylevel': 0.8,
        'bootstrap_type': 'Bernoulli',
        'objective': 'RMSE',
        'thread_count': 16,
        'verbose': 200
    }
    train_dataset = cb.Pool(train_df[cols], label=train_df['target'])
    valid_dataset = cb.Pool(valid_df[cols], label=valid_df['target'])
    test_dataset = cb.Pool(test_df[cols])
    model = cb.CatBoostRegressor(**cb_params)
    model.fit(train_dataset, eval_set=valid_dataset, early_stopping_rounds=100, verbose=200)
    val_pred = model.predict(valid_dataset)
    test_pred = model.predict(test_dataset)
    return val_pred, test_pred
def train_lgb_model(train_df, valid_df, test_df, cols):
    # LightGBM parameters
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.05,
        'seed': 2024,
        'nthread': 16,
        'verbose': -1,
    }
    train_data = lgb.Dataset(train_df[cols], label=train_df['target'])
    valid_data = lgb.Dataset(valid_df[cols], label=valid_df['target'], reference=train_data)
    model = lgb.train(lgb_params, train_data, num_boost_round=1000, valid_sets=[train_data, valid_data],
                      verbose_eval=100, early_stopping_rounds=100)
    val_pred = model.predict(valid_df[cols], num_iteration=model.best_iteration)
    test_pred = model.predict(test_df[cols], num_iteration=model.best_iteration)
    return val_pred, test_pred
def train_xgb_model(train_df, valid_df, test_df, cols):
    # XGBoost parameters
    xgb_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'verbosity': 1
    }
    dtrain = xgb.DMatrix(train_df[cols], label=train_df['target'])
    dvalid = xgb.DMatrix(valid_df[cols], label=valid_df['target'])
    dtest = xgb.DMatrix(test_df[cols])
    model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dtrain, 'train'), (dvalid, 'valid')],
                      early_stopping_rounds=100, verbose_eval=100)
    # Predict on the validation and test sets (iteration_range is end-exclusive, hence +1)
    val_pred = model.predict(dvalid, iteration_range=(0, model.best_iteration + 1))
    test_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
    # Offline score on the validation set
    score = mean_squared_error(valid_df['target'], val_pred)
    print(score)
    return val_pred, test_pred
# Train the CatBoost model
cb_oof, cb_test = train_cb_model(train[train.dt >= 31], train[train.dt <= 30], test, train_cols)
# Train the LightGBM model
lgb_oof, lgb_test = train_lgb_model(train[train.dt >= 31], train[train.dt <= 30], test, train_cols)
# Train the XGBoost model
xgb_oof, xgb_test = train_xgb_model(train[train.dt >= 31], train[train.dt <= 30], test, train_cols)
# Blend the model predictions
final_test = (cb_test + lgb_test + xgb_test) / 3  # simple average blend
# Save the result files locally
test['target'] = final_test
test[['id', 'dt', 'target']].to_csv('submitfinal.csv', index=None)
# CatBoost
test['target'] = cb_test
test[['id', 'dt', 'target']].to_csv('submitcat.csv', index=None)
# LightGBM
test['target'] = lgb_test
test[['id', 'dt', 'target']].to_csv('submitlgb.csv', index=None)
# XGBoost
test['target'] = xgb_test
test[['id', 'dt', 'target']].to_csv('submitxgb.csv', index=None)
In this program I saved both the blended result and each model's individual predictions, which scored 260.3605, 262.43793, and 259.9 respectively. By comparison, blending still works somewhat better (at least, that is my current take).
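One further idea I have not submitted: instead of a simple average, the blend weights could be derived from each model's validation error, so better models get more say. The snippet below is a minimal sketch, reusing the cb_oof/lgb_oof/xgb_oof validation predictions and the dt <= 30 split from the code above; final_test_weighted is a hypothetical name of my own.

# Score-weighted blend (sketch, not my submitted code): weight each model by the
# inverse of its validation MSE, normalized so the weights sum to 1.
valid_y = train[train.dt <= 30]['target'].values
weights = np.array([
    1 / mean_squared_error(valid_y, cb_oof),
    1 / mean_squared_error(valid_y, lgb_oof),
    1 / mean_squared_error(valid_y, xgb_oof),
])
weights = weights / weights.sum()
# Weighted blend of the three test-set predictions
final_test_weighted = weights[0] * cb_test + weights[1] * lgb_test + weights[2] * xgb_test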
Later I noticed that Task03 also provides a stacking ensemble of the three models on top of the engineered features, so I tried that as well. The run took noticeably longer and the code is longer too, but the payoff was a solid score improvement after submission, from 257.7 to 233.1. Here is the code:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
import tqdm
import sys
import os
import gc
import argparse
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)
# Lag (history shift) features
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)
# Lag + difference features
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)
# Rolling-window statistics
for win in [15, 30, 50, 70]:
    data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').std().values
# Lag + rolling-window statistics
for win in [7, 14, 28, 35, 50, 70]:
    data[f'target_shift10_win{win}_mean'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_shift10_win{win}_max'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_shift10_win{win}_min'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_shift10_win{win}_sum'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').sum().values
    data[f'target_shift10_win{win}_std'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').std().values
# Split the data back into train and test
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)
# Select the input features
train_cols = [f for f in data.columns if f not in ['id', 'target']]
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    '''
    clf: the model module/class to use
    train_x: training data
    train_y: training labels
    test_x: test data
    clf_name: name of the chosen model
    seed: random seed
    '''
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], \
                                     train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mse',
                'min_child_weight': 10,
                'num_leaves': 2 ** 8,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.05,
                'seed': 2024,
                'nthread': 16,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 1000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[], verbose_eval=200, early_stopping_rounds=100)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        if clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist, verbose_eval=200,
                              early_stopping_rounds=100)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)
        if clf_name == "cat":
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11, 'allow_writing_files': False}
            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    return oof, test_predict
# Run the LightGBM model
lgb_oof, lgb_test = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
# Run the XGBoost model
xgb_oof, xgb_test = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
# Run the CatBoost model
cat_oof, cat_test = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')
def stack_model(lgb_oof, xgb_oof, cat_oof, lgb_test, xgb_test, cat_test, y):
    '''
    The inputs oof_1, oof_2, oof_3 correspond to lgb_oof, xgb_oof, cat_oof;
    predictions_1, predictions_2, predictions_3 correspond to lgb_test, xgb_test, cat_test.
    '''
    train_stack = pd.concat([lgb_oof, xgb_oof, cat_oof], axis=1)
    test_stack = pd.concat([lgb_test, xgb_test, cat_test], axis=1)
    oof = np.zeros((train_stack.shape[0],))
    predictions = np.zeros((test_stack.shape[0],))
    scores = []
    from sklearn.model_selection import RepeatedKFold
    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2021)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, train_stack)):
        print("fold n°{}".format(fold_ + 1))
        trn_data, trn_y = train_stack.loc[trn_idx], y[trn_idx]
        val_data, val_y = train_stack.loc[val_idx], y[val_idx]
        clf = Ridge(random_state=2021)
        clf.fit(trn_data, trn_y)
        oof[val_idx] = clf.predict(val_data)
        predictions += clf.predict(test_stack) / (5 * 2)
        score_single = mean_absolute_error(val_y, oof[val_idx])
        scores.append(score_single)
        print(f'{fold_ + 1}/{5}', score_single)
    print('mean: ', np.mean(scores))
    return oof, predictions
stack_oof, stack_pred = stack_model(pd.DataFrame(lgb_oof), pd.DataFrame(xgb_oof), pd.DataFrame(cat_oof),
                                    pd.DataFrame(lgb_test), pd.DataFrame(xgb_test), pd.DataFrame(cat_test),
                                    train['target'])
# Use the stacking output as the final predictions
final_test = stack_pred
# Save the result file locally
test['target'] = final_test
test[['id', 'dt', 'target']].to_csv('submit.csv', index=None)
All of the code above was run under Python 3.11. Note that the final results are still saved and exported the same way as before. This program gave the best result of all my runs so far. To push the score further, I went on to try a deep-learning approach, using the LSTM long short-term memory time-series forecasting method provided in Task03.
I ran three rounds of prediction tests with this code; the results are shown in the figure. With the official setting of 10 training epochs the score was around 1470, so I increased the training, running 50, 60, 300, and 500 epochs; the scores all landed in the 500s with no major improvement. I came to realize that this deep-learning approach probably lacks a proper data-preprocessing step, or simply has too little training data, so its predictions stay poor, and I stopped pursuing it. Still, although deep learning performed badly on this prediction task, one cannot conclude that it is generally worse than shallow machine-learning methods; it is just not a good fit here.
For example, in my undergraduate thesis on remaining-useful-life prediction for aero engines, a BP network fused with an SVM performed worse than a convolutional neural network, yet it would be rash to dismiss the method; each has its own place. Shallow learning trains quickly and wins on time efficiency. Every approach has its pros and cons.
One thing to note when running the deep-learning network code: the dimensions of its output must be kept consistent with the submission format. The sample code is shown below.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from keras.optimizers import Adam
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Data preprocessing
def preprocess_data(df, look_back=100):
    # Group the data by id
    grouped = df.groupby('id')
    datasets = {}
    for id, group in grouped:
        datasets[id] = group.values
    # Build the training set
    X, Y = [], []
    for id, data in datasets.items():
        for i in range(10, 15):  # build 5 sequences per id
            a = data[i:(i + look_back), 3]
            a = np.append(a, np.array([0] * (100 - len(a))))
            X.append(a[::-1])
            Y.append(data[i - 10:i, 3][::-1])
    # Build the test set
    OOT = []
    for id, data in datasets.items():
        a = data[:100, 3]
        a = np.append(a, np.array([0] * (100 - len(a))))
        OOT.append(a[::-1])
    return np.array(X, dtype=np.float64), np.array(Y, dtype=np.float64), np.array(OOT, dtype=np.float64)
# Define the model
def build_model(look_back, n_features, n_output):
    model = Sequential()
    model.add(LSTM(50, input_shape=(look_back, n_features)))
    model.add(RepeatVector(n_output))
    model.add(LSTM(50, return_sequences=True))
    model.add(TimeDistributed(Dense(1)))
    model.compile(loss='mean_squared_error', optimizer=Adam(0.001))
    return model
# Build and train the model
look_back = 100  # sequence length
n_features = 1   # assume a single feature per time step
n_output = 10    # predict the next 10 time steps
# Preprocess the data
X, Y, OOT = preprocess_data(train, look_back=look_back)
# Build the model
model = build_model(look_back, n_features, n_output)
# Train the model
model.fit(X, Y, epochs=10, batch_size=64, verbose=1)
# Predict
predicted_values = model.predict(OOT)
The output-saving code below must be appended to the end of the code above. Arriving at this fix took repeated rounds of questioning a large language model. With these two pieces of code combined, the deep-learning prediction runs end to end.
# Save the LSTM predictions, reshaped to match the format of the CatBoostRegressor output
result_df = pd.DataFrame({
    'id': test['id'],                      # id column from the original test data
    'dt': test['dt'],                      # dt column from the original test data
    'target': predicted_values.flatten()   # flatten the predictions into a 1-D array
})
result_df.to_csv('submit_lstm.csv', index=None)
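Because the dimension mismatch was the tricky part, a quick sanity check before building result_df can save a debugging round. This is only a sketch of what I would add, based on the shapes produced by the code above:

# Sanity check (sketch): the flattened LSTM output must line up with the test rows.
# predicted_values has shape (n_ids, n_output, 1), so flattening yields
# n_ids * n_output values, which must equal the row count of the original test data.
# (Length alone is not enough: the id grouping and dt ordering must match too.)
flat_pred = predicted_values.flatten()
assert flat_pred.shape[0] == len(test), \
    f'prediction length {flat_pred.shape[0]} != test rows {len(test)}'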
Next, while studying other participants' notes, I found that improving the feature engineering of the single-LightGBM pipeline in Task02 can raise the score, so I set out to try it. With the improved feature-extraction program I scored 240.8563, a clear improvement over the three-model average blend, though still slightly behind the stacking ensemble's 233.1.
I then tuned the training parameters for a further small gain, reaching 238, still short of the stacking ensemble. Below is my improved feature-engineering code for LightGBM. It selects the mean, maximum, minimum, standard deviation, and similar statistics as window features; learning from these representative features makes the model more efficient, which is where the score improvement comes from.
def add_shift_features(data, start, end, base_col='target'):
    """Add lag (history shift) features."""
    for i in range(start, end):
        data[f'{base_col}_shift{i}'] = data.groupby('id')[base_col].shift(i)
def add_diff_features(data, base_col, diff_ranges):
    """Add difference features."""
    for i in diff_ranges:
        data[f'{base_col}_diff{i}'] = data.groupby('id')[base_col].diff(i)
def add_rolling_features(data, windows, base_col, min_periods=3, stats=['mean', 'max', 'min', 'std', 'sum']):
    """Add rolling-window statistics."""
    for win in windows:
        group = data.groupby('id')[base_col].rolling(window=win, min_periods=min_periods, closed='left')
        for stat in stats:
            data[f'{base_col}_win{win}_{stat}'] = group.aggregate(stat).reset_index(level=0, drop=True)
# Apply the feature engineering
add_shift_features(data, 10, 36)
add_diff_features(data, 'target_shift10', range(1, 4))
add_rolling_features(data, [15, 30, 50, 70], 'target', stats=['mean', 'max', 'min', 'std'])
add_rolling_features(data, [7, 14, 28, 35, 50, 70], 'target_shift10', stats=['mean', 'max', 'min', 'std', 'sum'])
Tuning the model also took multiple rounds of validation to complete, as shown in the figure.
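For reference, those validation rounds looked roughly like the sketch below: a small grid over two LightGBM parameters, scored on the same dt-based split as before. The grid values are illustrative rather than my final settings, and base_params simply restates the earlier LightGBM configuration.

# A minimal parameter-search sketch (illustrative grid, not my submitted settings).
# Assumes train, train_cols, and the dt-based split from the first script.
base_params = {
    'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mse',
    'lambda_l2': 10, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
    'bagging_freq': 4, 'seed': 2024, 'nthread': 16, 'verbose': -1,
}
trn, val = train[train.dt >= 31], train[train.dt <= 30]
train_data = lgb.Dataset(trn[train_cols], label=trn['target'])
valid_data = lgb.Dataset(val[train_cols], label=val['target'], reference=train_data)
best_score, best_params = float('inf'), None
for num_leaves in [2 ** 5, 2 ** 6, 2 ** 7]:
    for learning_rate in [0.05, 0.1]:
        params = dict(base_params, num_leaves=num_leaves, learning_rate=learning_rate)
        model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[valid_data],
                          verbose_eval=False, early_stopping_rounds=100)
        val_pred = model.predict(val[train_cols], num_iteration=model.best_iteration)
        score = mean_squared_error(val['target'], val_pred)
        if score < best_score:
            best_score, best_params = score, params
print(best_score, best_params)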
Finally, by continuing to refine the parameters and the code and applying this feature engineering to the stacking pipeline, I achieved one more score improvement, to 229.3416, which was my last attempt before the summer camp ended. The camp may be over, but my experiments are not: I will keep optimizing and tuning this code.
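For completeness, plugging the improved features into the stacking pipeline amounts to calling the add_* helpers before the cross-validation. The sketch below shows the idea, assuming the raw data frame plus the cv_model and stack_model helpers defined in the scripts above:

# Rough sketch: improved feature engineering feeding the stacking pipeline.
add_shift_features(data, 10, 36)
add_diff_features(data, 'target_shift10', range(1, 4))
add_rolling_features(data, [15, 30, 50, 70], 'target', stats=['mean', 'max', 'min', 'std'])
add_rolling_features(data, [7, 14, 28, 35, 50, 70], 'target_shift10')
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)
train_cols = [f for f in data.columns if f not in ['id', 'target']]
lgb_oof, lgb_test = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
xgb_oof, xgb_test = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
cat_oof, cat_test = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')
stack_oof, stack_pred = stack_model(pd.DataFrame(lgb_oof), pd.DataFrame(xgb_oof), pd.DataFrame(cat_oof),
                                    pd.DataFrame(lgb_test), pd.DataFrame(xgb_test), pd.DataFrame(cat_test),
                                    train['target'])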
This summer camp has been very rewarding. I started out as a coding novice, and I will join more camps like this to keep building myself up. Thanks to Datawhale for providing the platform and a great learning opportunity; the teaching assistants were professional, attentive, and responsible, and the instructors were meticulous. Thank you all.
This is also my first CSDN post, written to record the journey.