

Datawhale AI Summer Camp: Task03

This was the final check-in task of the camp. Over the seven days of study I basically learned how to implement machine learning in Python to predict electric power output, so as far as I am concerned the mission is more or less accomplished. The Day 1 baseline scored 373; after that I kept working with the Task03 sample code, blending LightGBM, XGBoost and CatBoost to produce the output. Using the feature extraction, data splitting and feature engineering from Task03, my three-model blend scored 257.7, better than the 259.9 of the single LightGBM model. Below is my code for the three-model blend:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
import tqdm
import sys
import os
import gc
import argparse
import warnings

warnings.filterwarnings('ignore')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Lag (shift) features
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)
# Lag + difference features
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)
# Rolling-window statistics
for win in [15, 30, 50, 70]:
    data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3,
                                                                         closed='left').mean().values
    data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3,
                                                                        closed='left').max().values
    data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3,
                                                                        closed='left').min().values
    data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3,
                                                                        closed='left').std().values
# Lag + rolling-window statistics
for win in [7, 14, 28, 35, 50, 70]:
    data[f'target_shift10_win{win}_mean'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                         closed='left').mean().values
    data[f'target_shift10_win{win}_max'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                        closed='left').max().values
    data[f'target_shift10_win{win}_min'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                        closed='left').min().values
    data[f'target_shift10_win{win}_sum'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                        closed='left').sum().values
    data[f'target_shift10_win{win}_std'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                        closed='left').std().values
# Split the data back into train and test
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)
# Select the input features
train_cols = [f for f in data.columns if f not in ['id', 'target']]
def train_cb_model(train_df, valid_df, test_df, cols):
    cb_params = {
        'iterations': 2000,
        'learning_rate': 0.05,
        'depth': 6,
        'l2_leaf_reg': 3,
        'subsample': 0.8,
        'colsample_bylevel': 0.8,
        'bootstrap_type': 'Bernoulli',
        'objective': 'RMSE',
        'thread_count': 16,
        'verbose': 200
    }
    train_dataset = cb.Pool(train_df[cols], label=train_df['target'])
    valid_dataset = cb.Pool(valid_df[cols], label=valid_df['target'])
    test_dataset = cb.Pool(test_df[cols])
    model = cb.CatBoostRegressor(**cb_params)
    model.fit(train_dataset, eval_set=valid_dataset, early_stopping_rounds=100, verbose=200)
    val_pred = model.predict(valid_dataset)
    test_pred = model.predict(test_dataset)
    return val_pred, test_pred
def train_lgb_model(train_df, valid_df, test_df, cols):
    # LightGBM parameters
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.05,
        'seed': 2024,
        'nthread': 16,
        'verbose': -1,
    }
    train_data = lgb.Dataset(train_df[cols], label=train_df['target'])
    valid_data = lgb.Dataset(valid_df[cols], label=valid_df['target'], reference=train_data)

    model = lgb.train(lgb_params, train_data, num_boost_round=1000, valid_sets=[train_data, valid_data],
                      verbose_eval=100, early_stopping_rounds=100)
    val_pred = model.predict(valid_df[cols], num_iteration=model.best_iteration)
    test_pred = model.predict(test_df[cols], num_iteration=model.best_iteration)

    return val_pred, test_pred
def train_xgb_model(train_df, valid_df, test_df, cols):
    # XGBoost parameters
    xgb_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'verbosity': 1
    }
    dtrain = xgb.DMatrix(train_df[cols], label=train_df['target'])
    dvalid = xgb.DMatrix(valid_df[cols], label=valid_df['target'])
    dtest = xgb.DMatrix(test_df[cols])
    model = xgb.train(xgb_params, dtrain, num_boost_round=1000,
                      evals=[(dtrain, 'train'), (dvalid, 'valid')],
                      early_stopping_rounds=100, verbose_eval=100)
    # Predict on the validation and test sets
    val_pred = model.predict(dvalid, iteration_range=(0, model.best_iteration))
    test_pred = model.predict(dtest, iteration_range=(0, model.best_iteration))
    return val_pred, test_pred
# Train the CatBoost model
cb_oof, cb_test = train_cb_model(train[train.dt >= 31], train[train.dt <= 30], test, train_cols)
# Train the LightGBM model
lgb_oof, lgb_test = train_lgb_model(train[train.dt >= 31], train[train.dt <= 30], test, train_cols)
# Train the XGBoost model
xgb_oof, xgb_test = train_xgb_model(train[train.dt >= 31], train[train.dt <= 30], test, train_cols)

# Offline score evaluation: MSE of each model on the hold-out split (dt <= 30)
val_y = train[train.dt <= 30]['target']
for name, val_pred in [('cat', cb_oof), ('lgb', lgb_oof), ('xgb', xgb_oof)]:
    score = mean_squared_error(val_y, val_pred)
    print(name, score)

# Blend the three models' predictions
final_test = (cb_test + lgb_test + xgb_test) / 3  # simple averaging
# Save the result files locally
test['target'] = final_test
test[['id', 'dt', 'target']].to_csv('submitfinal.csv', index=None)
# CatBoost alone
test['target'] = cb_test
test[['id', 'dt', 'target']].to_csv('submitcat.csv', index=None)
# LightGBM alone
test['target'] = lgb_test
test[['id', 'dt', 'target']].to_csv('submitlgb.csv', index=None)
# XGBoost alone
test['target'] = xgb_test
test[['id', 'dt', 'target']].to_csv('submitxgb.csv', index=None)

This program writes out the blended result as well as each model's individual predictions, which scored 260.3605, 262.43793 and 259.9 respectively; blending the three still does a bit better (or so it seems for now).
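
Since the three models score similarly but not identically, a natural refinement of the simple average is an error-weighted blend. The sketch below is my own addition, not Task03 sample code; it reuses the hold-out predictions (cb_oof, lgb_oof, xgb_oof), val_y and the test predictions from the script above, weighting each model by its inverse validation MSE:

import numpy as np
from sklearn.metrics import mean_squared_error

# Hold-out MSE for each model (val_y is the dt <= 30 target slice from above)
errors = np.array([mean_squared_error(val_y, p) for p in (cb_oof, lgb_oof, xgb_oof)])

# Inverse-error weights: a smaller validation error earns a larger weight
weights = (1.0 / errors) / (1.0 / errors).sum()

final_test_weighted = weights[0] * cb_test + weights[1] * lgb_test + weights[2] * xgb_test

With three models of similar strength the weights stay close to 1/3, so whether this beats the plain average would need a submission to confirm.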

I then noticed that Task03 also shows how to stack the three models on top of the feature-engineered data, so I tried that next. The run takes noticeably longer and the code is a good deal longer too, but the reward was a solid improvement after submission: the score went from 257.7 to 233.1. Here is the code:

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
import tqdm
import sys
import os
import gc
import argparse
import warnings

warnings.filterwarnings('ignore')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Lag (shift) features
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# Rolling-window statistics
for win in [15, 30, 50, 70]:
    data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3,
                                                                         closed='left').mean().values
    data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3,
                                                                        closed='left').max().values
    data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3,
                                                                        closed='left').min().values
    data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3,
                                                                        closed='left').std().values

# Lag + rolling-window statistics
for win in [7, 14, 28, 35, 50, 70]:
    data[f'target_shift10_win{win}_mean'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                         closed='left').mean().values
    data[f'target_shift10_win{win}_max'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                        closed='left').max().values
    data[f'target_shift10_win{win}_min'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                        closed='left').min().values
    data[f'target_shift10_win{win}_sum'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                        closed='left').sum().values
    data[f'target_shift10_win{win}_std'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3,
                                                                                        closed='left').std().values
# Split the data back into train and test
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)

# Select the input features
train_cols = [f for f in data.columns if f not in ['id', 'target']]

from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge

def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    '''
    clf: the model module/class to use
    train_x: training features
    train_y: training labels
    test_x: test features
    clf_name: which model to run ('lgb', 'xgb' or 'cat')
    seed: random seed
    '''
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y, val_x, val_y = (train_x.iloc[train_index], train_y[train_index],
                                      train_x.iloc[valid_index], train_y[valid_index])

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mse',
                'min_child_weight': 10,
                'num_leaves': 2 ** 8,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.05,
                'seed': 2024,
                'nthread': 16,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 1000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[], verbose_eval=200, early_stopping_rounds=100)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        if clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist,
                              verbose_eval=200, early_stopping_rounds=100)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)

        if clf_name == "cat":
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11, 'allow_writing_files': False}

            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits

        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    return oof, test_predict

# Run the LightGBM model
lgb_oof, lgb_test = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
# Run the XGBoost model
xgb_oof, xgb_test = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
# Run the CatBoost model
cat_oof, cat_test = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')

def stack_model(lgb_oof, xgb_oof, cat_oof, lgb_test, xgb_test, cat_test, y):
    '''
    lgb_oof, xgb_oof, cat_oof are the out-of-fold predictions of the base models;
    lgb_test, xgb_test, cat_test are their corresponding test-set predictions.
    '''
    train_stack = pd.concat([lgb_oof, xgb_oof, cat_oof], axis=1)
    test_stack = pd.concat([lgb_test, xgb_test, cat_test], axis=1)

    oof = np.zeros((train_stack.shape[0],))
    predictions = np.zeros((test_stack.shape[0],))
    scores = []

    from sklearn.model_selection import RepeatedKFold
    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2021)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, train_stack)):
        print("fold n°{}".format(fold_ + 1))
        trn_data, trn_y = train_stack.loc[trn_idx], y[trn_idx]
        val_data, val_y = train_stack.loc[val_idx], y[val_idx]

        # A plain Ridge regression serves as the second-level (meta) model
        clf = Ridge(random_state=2021)
        clf.fit(trn_data, trn_y)

        oof[val_idx] = clf.predict(val_data)
        predictions += clf.predict(test_stack) / (5 * 2)

        score_single = mean_absolute_error(val_y, oof[val_idx])
        scores.append(score_single)
        print(f'{fold_ + 1}/{5}', score_single)
    print('mean: ', np.mean(scores))

    return oof, predictions


stack_oof, stack_pred = stack_model(pd.DataFrame(lgb_oof), pd.DataFrame(xgb_oof), pd.DataFrame(cat_oof),
                                    pd.DataFrame(lgb_test), pd.DataFrame(xgb_test), pd.DataFrame(cat_test),
                                    train['target'])

# Use the stacked predictions as the final result
final_test = stack_pred
# Save the result file locally
test['target'] = final_test
test[['id', 'dt', 'target']].to_csv('submit.csv', index=None)
All of the code above was run under Python 3.11. Note that the final result is still saved and submitted the same way as before. This program has produced the best result of all my runs so far. To push the score further, I moved on to a deep-learning approach, trying the LSTM (long short-term memory) time-series prediction method provided in Task03.

I ran three rounds of prediction tests with this code. With the officially suggested 10 training epochs the score was around 1470, so I increased the training, running 50, 60, 300 and 500 epochs; the results all landed around 500 with no major improvement. I realized that this deep-learning approach probably lacks a proper data-preprocessing step, or simply has too little training data, so its predictions are poor, and I stopped pursuing it. Still, even though deep learning did badly on this task, that does not mean it is generally worse than shallow machine-learning methods; it is simply not a good fit here.

For example, in my undergraduate thesis on remaining-useful-life prediction for aircraft engines, a BP network fused with an SVM performed worse than a convolutional neural network, yet it would be rash to dismiss the method outright; each approach has its place. Shallow learning trains quickly and wins on time efficiency. Every method has its own strengths and weaknesses.

One thing to note when running the deep-learning code: the dimensions of its output must be kept consistent. The sample code is shown below.
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from keras.optimizers import Adam

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Data preprocessing
def preprocess_data(df, look_back=100):
    # Group the data by id
    grouped = df.groupby('id')
    datasets = {}
    for id, group in grouped:
        datasets[id] = group.values

    # Build the training sequences (column 3 holds the target)
    X, Y = [], []
    for id, data in datasets.items():
        for i in range(10, 15):  # build 5 sequences per id
            a = data[i:(i + look_back), 3]
            a = np.append(a, np.array([0] * (look_back - len(a))))  # zero-pad to look_back
            X.append(a[::-1])
            Y.append(data[i - 10:i, 3][::-1])

    # Build the test sequences
    OOT = []
    for id, data in datasets.items():
        a = data[:look_back, 3]
        a = np.append(a, np.array([0] * (look_back - len(a))))
        OOT.append(a[::-1])

    return np.array(X, dtype=np.float64), np.array(Y, dtype=np.float64), np.array(OOT, dtype=np.float64)

# Define the model
def build_model(look_back, n_features, n_output):
    model = Sequential()
    model.add(LSTM(50, input_shape=(look_back, n_features)))
    model.add(RepeatVector(n_output))
    model.add(LSTM(50, return_sequences=True))
    model.add(TimeDistributed(Dense(1)))
    model.compile(loss='mean_squared_error', optimizer=Adam(0.001))
    return model

# Build and train the model
look_back = 100  # input sequence length
n_features = 1   # assume a single feature per time step
n_output = 10    # predict the next 10 time steps

# Preprocess the data
X, Y, OOT = preprocess_data(train, look_back=look_back)

# Keras LSTM layers expect 3-D input (samples, timesteps, features),
# so add an explicit feature axis to the 2-D sequence arrays
X = X.reshape((X.shape[0], look_back, n_features))
OOT = OOT.reshape((OOT.shape[0], look_back, n_features))

# Build the model
model = build_model(look_back, n_features, n_output)

# Train the model
model.fit(X, Y, epochs=10, batch_size=64, verbose=1)

# Predict
predicted_values = model.predict(OOT)
The result-output code must be appended to the end of the code above, as shown next. Solving this formatting problem took repeated rounds of questioning a large language model. With these two pieces of code together, the deep-learning network's predictions can be produced end to end.
# Save the LSTM predictions, formatted the same way as the earlier model submissions
result_df = pd.DataFrame({
    'id': test['id'],  # id column from the original test data
    'dt': test['dt'],  # dt column from the original test data
    'target': predicted_values.flatten()  # flatten the predictions to a 1-D array
})
result_df.to_csv('submit_lstm.csv', index=None)
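
Two quick follow-ups, both my own additions rather than Task03 sample code. First, the dimension-consistency requirement mentioned above can be checked explicitly before saving. Second, my suspicion that a preprocessing step is missing is easy to probe: the script imports MinMaxScaler but never uses it. A minimal sketch of both ideas, reusing the data and functions defined above:

from sklearn.preprocessing import MinMaxScaler

# Sanity check: the flattened predictions must match the test rows one-to-one
assert predicted_values.flatten().shape[0] == len(test), \
    'prediction count does not match the number of test rows'

# Hypothetical preprocessing: scale the target into [0, 1] before building the
# sequences, then map the predictions back to the original units afterwards
scaler = MinMaxScaler()
train['target'] = scaler.fit_transform(train[['target']]).ravel()

X, Y, OOT = preprocess_data(train, look_back=look_back)
X = X.reshape((X.shape[0], look_back, n_features))
OOT = OOT.reshape((OOT.shape[0], look_back, n_features))

model = build_model(look_back, n_features, n_output)
model.fit(X, Y, epochs=10, batch_size=64, verbose=1)

predicted_values = scaler.inverse_transform(model.predict(OOT).reshape(-1, 1)).reshape(-1)

Whether scaling alone would close the gap to the tree models is untested here.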
Next, from reading other participants' notes I learned that improving the feature engineering of the single LightGBM model from Task02 can raise the score, so I set out to try it. With an improved feature-extraction routine I scored 240.8563, a good step up from the three-model averaging, though still slightly behind the 233.1 of the three-model stacking.

Tuning the run parameters then brought a further small gain, to 238, still short of the stacking ensemble. Below is my improved feature-engineering code for LightGBM. It builds lag, difference and rolling-window features, using the mean, maximum, minimum, standard deviation and so on as the window statistics; giving the model these representative features to learn from makes training more effective, hence the score improvement.
def add_shift_features(data, start, end, base_col='target'):
    """Add lag (shift) features."""
    for i in range(start, end):
        data[f'{base_col}_shift{i}'] = data.groupby('id')[base_col].shift(i)


def add_diff_features(data, base_col, diff_ranges):
    """Add difference features."""
    for i in diff_ranges:
        data[f'{base_col}_diff{i}'] = data.groupby('id')[base_col].diff(i)


def add_rolling_features(data, windows, base_col, min_periods=3, stats=['mean', 'max', 'min', 'std', 'sum']):
    """Add rolling-window statistical features."""
    for win in windows:
        group = data.groupby('id')[base_col].rolling(window=win, min_periods=min_periods, closed='left')
        for stat in stats:
            data[f'{base_col}_win{win}_{stat}'] = group.aggregate(stat).reset_index(level=0, drop=True)


# Apply the feature engineering
add_shift_features(data, 10, 36)
add_diff_features(data, 'target_shift10', range(1, 4))
add_rolling_features(data, [15, 30, 50, 70], 'target', stats=['mean', 'max', 'min', 'std'])
add_rolling_features(data, [7, 14, 28, 35, 50, 70], 'target_shift10', stats=['mean', 'max', 'min', 'std', 'sum'])
Settling on the model's parameters likewise took several rounds of validation; a sketch of the kind of search loop involved follows.
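
For reference, the search was nothing exotic. Below is a minimal sketch of the kind of loop I mean, assuming the dt-based hold-out split and the LightGBM setup from the first script; the candidate values are illustrative, not the exact grid I ran:

from sklearn.metrics import mean_squared_error

trn, val = train[train.dt >= 31], train[train.dt <= 30]
best_score, best_cfg = float('inf'), None

# Try a few candidate settings and keep the one with the lowest hold-out MSE
for num_leaves in [2 ** 5, 2 ** 6, 2 ** 7]:
    for learning_rate in [0.03, 0.05, 0.1]:
        params = {'objective': 'regression', 'metric': 'mse', 'num_leaves': num_leaves,
                  'learning_rate': learning_rate, 'feature_fraction': 0.8,
                  'bagging_fraction': 0.8, 'bagging_freq': 4, 'seed': 2024, 'verbose': -1}
        train_data = lgb.Dataset(trn[train_cols], label=trn['target'])
        valid_data = lgb.Dataset(val[train_cols], label=val['target'], reference=train_data)
        model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[valid_data],
                          verbose_eval=False, early_stopping_rounds=100)
        pred = model.predict(val[train_cols], num_iteration=model.best_iteration)
        score = mean_squared_error(val['target'], pred)
        if score < best_score:
            best_score, best_cfg = score, (num_leaves, learning_rate)

print(best_score, best_cfg)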

Finally, by continuing to refine the parameters and the code and applying the improved feature engineering to the stacking pipeline, I lifted the score once more, to 229.3416. That was my last chance to submit before the camp ended. The camp may be over, but my experiments are not: I will keep optimizing this code and tuning its parameters.
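
The wiring for that final run is straightforward given the pieces above. Roughly, and assuming the add_* helpers just shown together with the cv_model and stack_model functions from the stacking script (the output filename here is my own choice):

# Rebuild the feature set with the improved helpers, then rerun the
# cross-validated base models and the Ridge stacker on their outputs
add_shift_features(data, 10, 36)
add_diff_features(data, 'target_shift10', range(1, 4))
add_rolling_features(data, [15, 30, 50, 70], 'target', stats=['mean', 'max', 'min', 'std'])
add_rolling_features(data, [7, 14, 28, 35, 50, 70], 'target_shift10', stats=['mean', 'max', 'min', 'std', 'sum'])

train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)
train_cols = [f for f in data.columns if f not in ['id', 'target']]

lgb_oof, lgb_test = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
xgb_oof, xgb_test = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
cat_oof, cat_test = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')

stack_oof, stack_pred = stack_model(pd.DataFrame(lgb_oof), pd.DataFrame(xgb_oof), pd.DataFrame(cat_oof),
                                    pd.DataFrame(lgb_test), pd.DataFrame(xgb_test), pd.DataFrame(cat_test),
                                    train['target'])

test['target'] = stack_pred
test[['id', 'dt', 'target']].to_csv('submit_stack_fe.csv', index=None)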

This camp has been very rewarding. I started out as a complete beginner at coding, and afterwards I will join more camps like this to keep improving. Thanks to Datawhale for providing the platform; it has been an excellent learning opportunity. The teaching assistants were professional, conscientious and responsible, and the instructors were very attentive. Thank you all.

This is also my first CSDN post, written to record the experience.

