

Alibaba Cloud Tianchi Competition (Machine Learning): Alibaba Cloud Security Malicious Program Detection (Complete Code)


Competition Background

As the largest cloud service provider in China, Alibaba Cloud faces massive volumes of malicious attacks from the internet every day.
The competition provides a large set of malicious file data, covering infectious viruses, trojans, mining programs, DDoS trojans, ransomware and so on, roughly 600 million records in total. Each file's records contain its API call sequence, thread information and related fields. Our task is to train a model that correctly classifies the test files (predicting which kind of malware each one is), so this is a typical multiclass classification problem.
Common classification algorithms: Naive Bayes, decision trees, support vector machines (SVM), KNN, logistic regression, and so on;
Ensemble learning: random forests, GBDT (gradient boosted decision trees), AdaBoost, XGBoost, LightGBM, CatBoost, and so on;
Neural networks: MLP (multilayer perceptron), DL (deep learning), and so on. A minimal multiclass sketch follows this list.
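As a small, hedged illustration of the multiclass setup itself (synthetic data only, nothing from the competition pipeline; names like clf are placeholders), any of the classifiers listed above can be plugged into a scikit-learn workflow like this:

# Minimal multiclass sketch on synthetic data (8 classes, matching the eight
# probability columns submitted later). Illustration only; not the competition solution.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=40, n_informative=20,
                           n_classes=8, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_tr, y_tr)
print(log_loss(y_te, clf.predict_proba(X_te)))   # logloss-style scoring, as in the pipeline below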

Full Code (ML and DL)

A typical applied machine-learning solution basically consists of 1) data processing, 2) feature selection and optimisation, and 3) model selection, validation and optimisation. Because "data and features determine the upper bound of machine learning, while models and algorithms merely approach that upper bound", most of the time spent on a machine-learning problem goes into data processing and feature optimisation.
It is best to run the code below cell by cell in a Jupyter notebook to deepen your understanding.
For machine-learning basics, feel free to take a look at my other articles; there is good stuff in there.

Advanced Feature Engineering and Solution Optimisation: Code

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Memory management: downcast numeric columns to the smallest dtype that can hold their value range
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

class _Data_Preprocess:
    def __init__(self):
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min
        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min
        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min
        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min
        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min
        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min
        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        # Return the narrowest dtype that can hold [min_val, max_val], or None
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            elif max_val <= self.int16_max and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None
        elif types == 'float':
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None

    def _memory_process(self, df):
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        df_cols = df.columns
        for col in tqdm_notebook(df_cols):
            try:
                if 'float' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'float')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
                elif 'int' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'int')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
            except:
                print(' Can not do any process for column, {}.'.format(col))
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df
memory_process = _Data_Preprocess()
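A quick sanity check of the memory helper on a hypothetical toy DataFrame (assuming the cells above have been run); the columns should be downcast to narrower dtypes:

# Toy DataFrame, not part of the original code: both columns fit in narrower dtypes.
demo = pd.DataFrame({
    'small_int': np.arange(1000, dtype=np.int64),            # values 0..999 fit in int16
    'small_float': np.random.rand(1000).astype(np.float64),  # values in [0, 1) fit in float16
})
demo = memory_process._memory_process(demo)
print(demo.dtypes)   # expect int16 and float16 after downcasting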
### Load the data
path = '../security_data/'
train = pd.read_csv(path + 'security_train.csv')
test = pd.read_csv(path + 'security_test.csv')
train.head()
def simple_sts_features(df):
    simple_fea = pd.DataFrame()
    simple_fea['file_id'] = df['file_id'].unique()
    simple_fea = simple_fea.sort_values('file_id')
    df_grp = df.groupby('file_id')
    simple_fea['file_id_api_count'] = df_grp['api'].count().values
    simple_fea['file_id_api_nunique'] = df_grp['api'].nunique().values
    simple_fea['file_id_tid_count'] = df_grp['tid'].count().values
    simple_fea['file_id_tid_nunique'] = df_grp['tid'].nunique().values
    simple_fea['file_id_index_count'] = df_grp['index'].count().values
    simple_fea['file_id_index_nunique'] = df_grp['index'].nunique().values
    return simple_fea
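A toy run of simple_sts_features on a few hypothetical rows (assuming the cells above have been executed) shows what the per-file count/nunique aggregations look like:

# Hypothetical rows with the same columns the real logs use (file_id, api, tid, index).
toy = pd.DataFrame({
    'file_id': [1, 1, 2],
    'api':     ['NtClose', 'NtClose', 'LdrLoadDll'],
    'tid':     [100, 101, 200],
    'index':   [0, 1, 0],
})
print(simple_sts_features(toy))
# file_id 1 -> api_count=2, api_nunique=1, tid_count=2, tid_nunique=2, index_count=2, index_nunique=2
# file_id 2 -> every count and nunique equals 1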
%%time
simple_train_fea1 = simple_sts_features(train)
%%time
simple_test_fea1 = simple_sts_features(test)
def simple_numerical_sts_features(df):
    simple_numerical_fea = pd.DataFrame()
    simple_numerical_fea['file_id'] = df['file_id'].unique()
    simple_numerical_fea = simple_numerical_fea.sort_values('file_id')
    df_grp = df.groupby('file_id')
    simple_numerical_fea['file_id_tid_mean'] = df_grp['tid'].mean().values
    simple_numerical_fea['file_id_tid_min'] = df_grp['tid'].min().values
    simple_numerical_fea['file_id_tid_std'] = df_grp['tid'].std().values
    simple_numerical_fea['file_id_tid_max'] = df_grp['tid'].max().values
    simple_numerical_fea['file_id_index_mean'] = df_grp['index'].mean().values
    simple_numerical_fea['file_id_index_min'] = df_grp['index'].min().values
    simple_numerical_fea['file_id_index_std'] = df_grp['index'].std().values
    simple_numerical_fea['file_id_index_max'] = df_grp['index'].max().values
    return simple_numerical_fea
%%time
simple_train_fea2 = simple_numerical_sts_features(train)
%%time
simple_test_fea2 = simple_numerical_sts_features(test)

Advanced Feature Engineering

def api_pivot_count_features(df):
    tmp = df.groupby(['file_id', 'api'])['tid'].count().to_frame('api_tid_count').reset_index()
    tmp_pivot = pd.pivot_table(data=tmp, index='file_id', columns='api', values='api_tid_count', fill_value=0)
    tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_pivot_' + str(col) for col in tmp_pivot.columns]
    tmp_pivot.reset_index(inplace=True)
    tmp_pivot = memory_process._memory_process(tmp_pivot)
    return tmp_pivot
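To make the pivot step concrete, here is a small illustration on hypothetical data of what this cell builds: one row per file_id and one column per API, holding how many times that file called that API (the real function additionally prefixes the columns with api_pivot_ and runs the memory helper):

# Toy illustration of the file_id x api count pivot (hypothetical rows).
toy = pd.DataFrame({
    'file_id': [1, 1, 1, 2, 2],
    'api':     ['LdrLoadDll', 'LdrLoadDll', 'NtClose', 'NtClose', 'NtClose'],
    'tid':     [100, 100, 101, 200, 200],
})
tmp = toy.groupby(['file_id', 'api'])['tid'].count().to_frame('api_tid_count').reset_index()
print(pd.pivot_table(tmp, index='file_id', columns='api', values='api_tid_count', fill_value=0))
# api      LdrLoadDll  NtClose
# file_id
# 1                 2        1
# 2                 0        2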
%%time
simple_train_fea3 = api_pivot_count_features(train)
%%time
simple_test_fea3 = api_pivot_count_features(test)
def api_pivot_nunique_features(df):
    tmp = df.groupby(['file_id', 'api'])['tid'].nunique().to_frame('api_tid_nunique').reset_index()
    tmp_pivot = pd.pivot_table(data=tmp, index='file_id', columns='api', values='api_tid_nunique', fill_value=0)
    tmp_pivot.columns = [tmp_pivot.columns.names[0] + '_pivot_' + str(col) for col in tmp_pivot.columns]
    tmp_pivot.reset_index(inplace=True)
    tmp_pivot = memory_process._memory_process(tmp_pivot)
    return tmp_pivot
%%time
simple_train_fea4 = api_pivot_nunique_features(train)
%%time
simple_test_fea4 = api_pivot_nunique_features(test)
train_label = train[['file_id', 'label']].drop_duplicates(subset=['file_id', 'label'], keep='first')
test_submit = test[['file_id']].drop_duplicates(subset=['file_id'], keep='first')

train_data = train_label.merge(simple_train_fea1, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea2, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea3, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea4, on='file_id', how='left')

test_submit = test_submit.merge(simple_test_fea1, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea2, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea3, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea4, on='file_id', how='left')
### Build the evaluation metric (a custom feval for LightGBM)
def lgb_logloss(preds, data):
    labels_ = data.get_label()
    classes_ = np.unique(labels_)
    preds_prob = []
    for i in range(len(classes_)):
        preds_prob.append(preds[i * len(labels_):(i + 1) * len(labels_)])
    preds_prob_ = np.vstack(preds_prob)   # shape: (num_class, num_samples)
    loss = []
    for i in range(preds_prob_.shape[1]):        # loop over samples
        sum_ = 0
        for j in range(preds_prob_.shape[0]):    # loop over classes
            pred = preds_prob_[j, i]
            if j == labels_[i]:
                sum_ += np.log(pred)
            else:
                sum_ += np.log(1 - pred)
        loss.append(sum_)
    return 'loss is: ', -1 * (np.sum(loss) / preds_prob_.shape[1]), False
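A note on what this feval assumes: the flat preds vector is treated as class-major, so class i's scores for all samples occupy preds[i*n:(i+1)*n] (worth verifying against the installed LightGBM version). Under that same assumption, a vectorised sketch of the identical quantity (the name ovr_logloss is mine, not from the original code) would be:

# Hedged sketch: vectorised equivalent of lgb_logloss, assuming the class-major layout above.
def ovr_logloss(preds, labels, n_classes):
    n = len(labels)
    p = np.asarray(preds).reshape(n_classes, n)                         # rows = classes, cols = samples
    onehot = np.arange(n_classes)[:, None] == np.asarray(labels)[None, :]
    per_sample = np.where(onehot, np.log(p), np.log(1.0 - p)).sum(axis=0)
    return -per_sample.mean()                                           # matches -sum(loss)/n_samples above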

Model Validation Based on LightGBM

train_features = [col for col in train_data.columns if col not in ['label', 'file_id']]
train_label = 'label'
%%time
from sklearn.model_selection import StratifiedKFold, KFold
params = {'task': 'train', 'num_leaves': 255, 'objective': 'multiclass', 'num_class': 8, 'min_data_in_leaf': 50, 'learning_rate': 0.05, 'feature_fraction': 0.85, 'bagging_fraction': 0.85, 'bagging_freq': 5, 'max_bin': 128, 'random_state': 100}
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))
predict_res = 0
models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][train_label].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][train_label].values)
    clf = lgb.train(params, trn_data, num_boost_round=2000, valid_sets=[trn_data, val_data], verbose_eval=50, early_stopping_rounds=100, feval=lgb_logloss)
    models.append(clf)
plt.figure(figsize=[10, 8])
sns.heatmap(train_data.iloc[:10000, 1:21].corr())
### Feature importance analysis
feature_importance = pd.DataFrame()
feature_importance['fea_name'] = train_features
feature_importance['fea_imp'] = clf.feature_importance()
feature_importance = feature_importance.sort_values('fea_imp', ascending=False)
feature_importance.sort_values('fea_imp', ascending=False)
plt.figure(figsize=[20, 10])
sns.barplot(x=feature_importance.iloc[:10]['fea_name'], y=feature_importance.iloc[:10]['fea_imp'])
plt.figure(figsize=[20, 10])
sns.barplot(x=feature_importance['fea_name'], y=feature_importance['fea_imp'])

Model Testing

pred_res = 0
fold = 5
for model in models:
    pred_res += model.predict(test_submit[train_features]) * 1.0 / fold
test_submit['prob0'] = 0
test_submit['prob1'] = 0
test_submit['prob2'] = 0
test_submit['prob3'] = 0
test_submit['prob4'] = 0
test_submit['prob5'] = 0
test_submit['prob6'] = 0
test_submit['prob7'] = 0
test_submit[['prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']] = pred_res
test_submit[['file_id', 'prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']].to_csv('baseline2.csv', index=None)

Deep Learning Solution: TextCNN Modelling Code

Data Loading

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

path = '../security_data/'
train = pd.read_csv(path + 'security_train.csv')
test = pd.read_csv(path + 'security_test.csv')

# Same memory-management helper as in the ML section: downcast numeric columns
# to the smallest dtype that can hold their value range.
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

class _Data_Preprocess:
    def __init__(self):
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min
        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min
        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min
        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min
        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min
        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min
        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            elif max_val <= self.int16_max and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None
        elif types == 'float':
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None

    def _memory_process(self, df):
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        df_cols = df.columns
        for col in tqdm_notebook(df_cols):
            try:
                if 'float' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'float')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
                elif 'int' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'int')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
            except:
                print(' Can not do any process for column, {}.'.format(col))
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df

memory_process = _Data_Preprocess()
train.head()

Data Preprocessing

# Convert the API name strings to integer indices
unique_api = train['api'].unique()
api2index = {item: (i + 1) for i, item in enumerate(unique_api)}
index2api = {(i + 1): item for i, item in enumerate(unique_api)}
train['api_idx'] = train['api'].map(api2index)
test['api_idx'] = test['api'].map(api2index)

# Build the API-index sequence for each file
def get_sequence(df, period_idx):
    seq_list = []
    for _id, begin in enumerate(period_idx[:-1]):
        seq_list.append(df.iloc[begin:period_idx[_id + 1]]['api_idx'].values)
    seq_list.append(df.iloc[period_idx[-1]:]['api_idx'].values)
    return seq_list

train_period_idx = train.file_id.drop_duplicates(keep='first').index.values
test_period_idx = test.file_id.drop_duplicates(keep='first').index.values
train_df = train[['file_id', 'label']].drop_duplicates(keep='first')
test_df = test[['file_id']].drop_duplicates(keep='first')
train_df['seq'] = get_sequence(train, train_period_idx)
test_df['seq'] = get_sequence(test, test_period_idx)
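A toy check of the mapping and sequence construction on a few hypothetical rows (assuming the cell above has run); note that index 0 is left unused by the (i + 1) mapping, so it can serve as the padding value later:

# Hypothetical rows; get_sequence splits api_idx at the first occurrence of each file_id.
toy = pd.DataFrame({
    'file_id': [1, 1, 2, 2, 2],
    'api':     ['LdrLoadDll', 'NtClose', 'NtClose', 'NtOpenFile', 'NtClose'],
})
toy_api2index = {item: (i + 1) for i, item in enumerate(toy['api'].unique())}
toy['api_idx'] = toy['api'].map(toy_api2index)
toy_period_idx = toy.file_id.drop_duplicates(keep='first').index.values   # array([0, 2])
print(get_sequence(toy, toy_period_idx))   # [array([1, 2]), array([2, 3, 2])]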

TextCNN Network Architecture

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation, GRU, Bidirectional
from keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalAveragePooling1D, GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D
from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average
from keras.models import Model
from keras.optimizers import RMSprop, Adam
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD
from keras import backend as K
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from keras.layers import SpatialDropout1D
from keras.layers.wrappers import Bidirectional
def TextCNN(max_len, max_cnt, embed_size, num_filters, kernel_size, conv_action, mask_zero):
    _input = Input(shape=(max_len,), dtype='int32')
    _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(_input)
    _embed = SpatialDropout1D(0.15)(_embed)
    warppers = []
    for _kernel_size in kernel_size:
        conv1d = Conv1D(filters=num_filters, kernel_size=_kernel_size, activation=conv_action)(_embed)
        warppers.append(GlobalMaxPooling1D()(conv1d))
    fc = concatenate(warppers)
    fc = Dropout(0.5)(fc)
    # fc = BatchNormalization()(fc)
    fc = Dense(256, activation='relu')(fc)
    fc = Dropout(0.25)(fc)
    # fc = BatchNormalization()(fc)
    preds = Dense(8, activation='softmax')(fc)
    model = Model(inputs=_input, outputs=preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

train_labels = pd.get_dummies(train_df.label).values
train_seq = pad_sequences(train_df.seq.values, maxlen=6000)
test_seq = pad_sequences(test_df.seq.values, maxlen=6000)
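One detail about pad_sequences worth keeping in mind (its Keras defaults; confirm for your installed version): padding and truncation both happen at the front, so a file with more than 6000 API calls keeps only its last 6000 calls. A tiny check:

# pad_sequences defaults to padding='pre' and truncating='pre'.
print(pad_sequences([[1, 2, 3]], maxlen=5))           # [[0 0 1 2 3]]
print(pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5))  # [[2 3 4 5 6]]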

TextCNN Training and Prediction

from sklearn.model_selection import StratifiedKFold, KFold
skf = KFold(n_splits=5, shuffle=True)
max_len = 6000
max_cnt = 295
embed_size = 256
num_filters = 64
kernel_size = [2, 4, 6, 8, 10, 12, 14]
conv_action = 'relu'
mask_zero = False
TRAIN = True

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

meta_train = np.zeros(shape=(len(train_seq), 8))
meta_test = np.zeros(shape=(len(test_seq), 8))
FLAG = True
i = 0
for tr_ind, te_ind in skf.split(train_labels):
    i += 1
    print('FOLD: {}'.format(i))
    print(len(te_ind), len(tr_ind))
    model_name = 'benchmark_textcnn_fold_' + str(i)
    X_train, X_train_label = train_seq[tr_ind], train_labels[tr_ind]
    X_val, X_val_label = train_seq[te_ind], train_labels[te_ind]
    model = TextCNN(max_len, max_cnt, embed_size, num_filters, kernel_size, conv_action, mask_zero)
    model_save_path = './NN/%s_%s.hdf5' % (model_name, embed_size)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)
    if TRAIN and FLAG:
        model.fit(X_train, X_train_label, validation_data=(X_val, X_val_label), epochs=100, batch_size=64, shuffle=True, callbacks=[early_stopping, model_checkpoint])
    model.load_weights(model_save_path)
    pred_val = model.predict(X_val, batch_size=128, verbose=1)
    pred_test = model.predict(test_seq, batch_size=128, verbose=1)
    meta_train[te_ind] = pred_val
    meta_test += pred_test
    K.clear_session()

meta_test /= 5.0

Submitting the Results

test_df['prob0'] = 0
test_df['prob1'] = 0
test_df['prob2'] = 0
test_df['prob3'] = 0
test_df['prob4'] = 0
test_df['prob5'] = 0
test_df['prob6'] = 0
test_df['prob7'] = 0
test_df[['prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']] = meta_test
test_df[['file_id', 'prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']].to_csv('nn_baseline_5fold.csv', index=None)

All of the content and code above comes from the book 《阿里云天池大赛赛题解析(机器学习篇)》, which I highly recommend reading in the original!


This article is reposted from: https://blog.csdn.net/weixin_45116099/article/details/126201895
Copyright belongs to the original author, 全栈O-Jay. If there is any infringement, please contact us and we will remove it.
