I. Introduction
- Task: given a person's entry in the Titanic passenger manifest, predict whether they survived.
- Dataset: the Titanic passenger manifest, available at https://download.csdn.net/download/weixin_43721000/87740848
- Column descriptions (a quick inspection sketch follows this list):
  - age: passenger age (numeric)
  - cabin: cabin number (categorical: string)
  - embarked: port of embarkation; S = Southampton, C = Cherbourg (France), Q = Queenstown (Ireland) (categorical)
  - fare: ticket price (numeric)
  - name: passenger name (categorical: string)
  - parch: number of parents/children aboard, i.e. direct relatives of a different generation; someone sailing with their daughter and their father has parents (1) + children (1) = 2
  - passengerId: passenger ID
  - pclass: cabin class; 1 = first, 2 = second, 3 = third class (categorical)
  - sex: male or female (categorical)
  - sibsp: number of siblings/spouses aboard, i.e. direct relatives of the same generation; someone sailing with their younger brother and their wife has siblings (1) + spouse (1) = 2
  - survived: whether the passenger survived; 1 = survived, 0 = died (categorical; this is the label to predict)
  - ticket: ticket number (categorical: string, e.g. "A/5 21171")
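As a quick way to verify the column types listed above, one can inspect the raw CSV with pandas (a minimal sketch; it assumes the same /kaggle/input path used in the code below):

import pandas as pd

df = pd.read_csv('/kaggle/input/titanic/train.csv')  # same file as in section II
print(df.dtypes)                # shows which columns are numeric vs. object (string)
print(df['Embarked'].unique())  # 'S', 'C', 'Q' (plus NaN where missing)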
II. Implementation
1. Load the dataset
import numpy as np
import pandas as pd
dataset = pd.read_csv('/kaggle/input/titanic/train.csv')
X_test = pd.read_csv('/kaggle/input/titanic/test.csv')

print(dataset)
#      PassengerId  Survived  Pclass  \
# 0              1         0       3
# 1              2         1       1
# 2              3         1       3
# 3              4         1       1
# 4              5         0       3
# ..           ...       ...     ...
# 886          887         0       2
# 887          888         1       1
# 888          889         0       3
# 889          890         1       1
# 890          891         0       3
#
#                                                   Name     Sex   Age  SibSp  \
# 0                              Braund, Mr. Owen Harris    male  22.0      1
# 1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
# 2                               Heikkinen, Miss. Laina  female  26.0      0
# 3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1
# 4                             Allen, Mr. William Henry    male  35.0      0
# ..                                                 ...     ...   ...    ...
# 886                              Montvila, Rev. Juozas    male  27.0      0
# 887                       Graham, Miss. Margaret Edith  female  19.0      0
# 888           Johnston, Miss. Catherine Helen "Carrie"  female   NaN      1
# 889                              Behr, Mr. Karl Howell    male  26.0      0
# 890                                Dooley, Mr. Patrick    male  32.0      0
#
#      Parch            Ticket     Fare Cabin Embarked
# 0        0         A/5 21171   7.2500   NaN        S
# 1        0          PC 17599  71.2833   C85        C
# 2        0  STON/O2. 3101282   7.9250   NaN        S
# 3        0            113803  53.1000  C123        S
# 4        0            373450   8.0500   NaN        S
# ..     ...               ...      ...   ...      ...
# 886      0            211536  13.0000   NaN        S
# 887      0            112053  30.0000   B42        S
# 888      2        W./C. 6607  23.4500   NaN        S
# 889      0            111369  30.0000  C148        C
# 890      0            370376   7.7500   NaN        Q
#
# [891 rows x 12 columns]

print(X_test)
#      PassengerId  Pclass                                          Name  \
# 0            892       3                              Kelly, Mr. James
# 1            893       3              Wilkes, Mrs. James (Ellen Needs)
# 2            894       2                     Myles, Mr. Thomas Francis
# 3            895       3                              Wirz, Mr. Albert
# 4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)
# ..           ...     ...                                           ...
# 413         1305       3                            Spector, Mr. Woolf
# 414         1306       1                  Oliva y Ocana, Dona. Fermina
# 415         1307       3                  Saether, Mr. Simon Sivertsen
# 416         1308       3                           Ware, Mr. Frederick
# 417         1309       3                      Peter, Master. Michael J
#
#         Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked
# 0      male  34.5      0      0              330911    7.8292   NaN        Q
# 1    female  47.0      1      0              363272    7.0000   NaN        S
# 2      male  62.0      0      0              240276    9.6875   NaN        Q
# 3      male  27.0      0      0              315154    8.6625   NaN        S
# 4    female  22.0      1      1             3101298   12.2875   NaN        S
# ..      ...   ...    ...    ...                 ...       ...   ...      ...
# 413    male   NaN      0      0           A.5. 3236    8.0500   NaN        S
# 414  female  39.0      0      0            PC 17758  108.9000  C105        C
# 415    male  38.5      0      0  SOTON/O.Q. 3101262    7.2500   NaN        S
# 416    male   NaN      0      0              359309    8.0500   NaN        S
# 417    male   NaN      1      1                2668   22.3583   NaN        C
#
# [418 rows x 11 columns]
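Before cleaning, it helps to know which columns actually contain missing values. A short sketch using the DataFrames just loaded (the steps below fill Embarked, Age, and Fare and drop Cabin entirely):

print(dataset.isnull().sum())  # in train.csv, Age, Cabin and Embarked contain NaNs
print(X_test.isnull().sum())   # in test.csv, Age, Fare and Cabin contain NaNs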
2. Data cleaning
- Extract and clean up titles
# Extract the title from the Name field, and merge the infrequent titles into 'Rare' to reduce noise
dataset_title = [i.split(',')[1].split('.')[0].strip() for i in dataset['Name']]  # extract the title
dataset['Title'] = pd.Series(dataset_title)  # add it as a new column

print(dataset['Title'].value_counts())  # inspect the title distribution
# Mr              517
# Miss            182
# Mrs             125
# Master           40
# Dr                7
# Rev               6
# Mlle              2
# Major             2
# Col               2
# the Countess      1
# Capt              1
# Ms                1
# Sir               1
# Lady              1
# Mme               1
# Don               1
# Jonkheer          1
# Name: Title, dtype: int64

# Merge the infrequent titles into 'Rare'
dataset['Title'] = dataset['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')

print(dataset['Title'].value_counts())
# Mr        517
# Miss      182
# Mrs       125
# Master     40
# Rare       27
# Name: Title, dtype: int64
# Same treatment for the test set
dataset_title = [i.split(',')[1].split('.')[0].strip() for i in X_test['Name']]
X_test['Title'] = pd.Series(dataset_title)

print(X_test['Title'].value_counts())
# Mr        240
# Miss       78
# Mrs        72
# Master     21
# Col         2
# Rev         2
# Ms          1
# Dr          1
# Dona        1
# Name: Title, dtype: int64

X_test['Title'] = X_test['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')

print(X_test['Title'].value_counts())
# Mr        240
# Miss       78
# Mrs        72
# Master     21
# Rare        7
# Name: Title, dtype: int64
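The extraction and replacement above run twice, once per DataFrame. A small helper (the name add_title is hypothetical, not from the original post) would keep the two code paths identical:

RARE_TITLES = ['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle']

def add_title(df):
    # "Braund, Mr. Owen Harris" -> "Mr"
    df['Title'] = [name.split(',')[1].split('.')[0].strip() for name in df['Name']]
    df['Title'] = df['Title'].replace(RARE_TITLES, 'Rare')
    return df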
- Derive a family-size feature
# From SibSp (same-generation relatives: siblings/spouse) and Parch (cross-generation relatives: parents/children),
# derive the number of family members aboard = SibSp + Parch + 1, then bucket it into labels
dataset['FamilyS'] = dataset['SibSp'] + dataset['Parch'] + 1
X_test['FamilyS'] = X_test['SibSp'] + X_test['Parch'] + 1

def family(x):
    if x < 2:
        return 'Single'
    elif x == 2:
        return 'Couple'
    elif x <= 4:
        return 'InterM'
    else:
        return 'Large'

dataset['FamilyS'] = dataset['FamilyS'].apply(family)
X_test['FamilyS'] = X_test['FamilyS'].apply(family)

print(dataset['FamilyS'].value_counts())
# Single    537
# Couple    161
# InterM    131
# Large      62
# Name: FamilyS, dtype: int64
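As a quick check of the bucketing: a passenger traveling with a younger brother and a wife has SibSp = 2 and Parch = 0, so FamilyS = 2 + 0 + 1 = 3, which family() maps to 'InterM':

print(family(1), family(2), family(3), family(5))
# Single Couple InterM Large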
- Fill missing values
# Fill missing Embarked values with the column's mode (take the first if there are several)
dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace=True)
X_test['Embarked'].fillna(X_test['Embarked'].mode()[0], inplace=True)

# Fill missing Age values with the column's median
dataset['Age'].fillna(dataset['Age'].median(), inplace=True)
X_test['Age'].fillna(X_test['Age'].median(), inplace=True)

# Fill missing Fare values with the column's median
dataset['Fare'].fillna(dataset['Fare'].median(), inplace=True)
X_test['Fare'].fillna(X_test['Fare'].median(), inplace=True)
- Drop unused columns
# Drop the columns the model will not use
dataset = dataset.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1)
X_test_passengers = X_test['PassengerId']  # keep the test-set IDs for the final submission
X_test = X_test.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1)

print(dataset)
#      Survived  Pclass     Sex   Age     Fare Embarked Title FamilyS
# 0           0       3    male  22.0   7.2500        S    Mr  Couple
# 1           1       1  female  38.0  71.2833        C   Mrs  Couple
# 2           1       3  female  26.0   7.9250        S  Miss  Single
# 3           1       1  female  35.0  53.1000        S   Mrs  Couple
# 4           0       3    male  35.0   8.0500        S    Mr  Single
# ..        ...     ...     ...   ...      ...      ...   ...     ...
# 886         0       2    male  27.0  13.0000        S  Rare  Single
# 887         1       1  female  19.0  30.0000        S  Miss  Single
# 888         0       3  female  28.0  23.4500        S  Miss  InterM
# 889         1       1    male  26.0  30.0000        C    Mr  Single
# 890         0       3    male  32.0   7.7500        Q    Mr  Single
3. Split into training, validation, and test sets
- Separate features and labels
# Split the training data into features and labels
X_train = dataset.iloc[:, 1:9].values  # everything after column 0: the seven feature columns
Y_train = dataset.iloc[:, 0].values    # column 0 is Survived, the label

# The test set has features only, no labels
X_test = X_test.values
- One-hot encode the categorical features
# One-hot encode the text/categorical columns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

print(X_train[0], X_train[0].shape)
# [3 'male' 22.0 7.25 'S' 'Mr' 'Couple'] (7,)

one_hot_encoder = ColumnTransformer(
    [('one_hot_encoder',                  # transformer name (arbitrary)
      OneHotEncoder(categories='auto'),   # the encoder
      [0, 1, 4, 5, 6])],                  # columns to encode: Pclass, Sex, Embarked, Title, FamilyS
    remainder='passthrough'               # pass the remaining columns (Age, Fare) through unchanged
)
X_train = one_hot_encoder.fit_transform(X_train).tolist()
# Use transform (not fit_transform) on the test set so its columns match the encoding fitted on the training set
X_test = one_hot_encoder.transform(X_test).tolist()

print(X_train[0], len(X_train[0]))
# [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 22.0, 7.25] 19
# The 7 feature columns expand to 19: Pclass(3) + Sex(2) + Embarked(3) + Title(5) + FamilyS(4) = 17 one-hot columns,
# plus the passthrough Age and Fare
- Hold out 1/10 of the training data as a validation set
# Split off a validation set
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.1)

print(len(x_train))  # 801
print(len(x_val))    # 90
print(len(y_train))  # 801
print(len(y_val))    # 90
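Note that train_test_split shuffles randomly, so the validation accuracy reported in step 8 will vary from run to run. Passing a fixed random_state (the value 42 is arbitrary) makes the split reproducible:

x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)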
4. Build the model
# Build the network
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(19, 270)  # expand the 19 input features to 270 hidden units
        self.fc2 = nn.Linear(270, 2)   # 2-class output

    def forward(self, x):
        x = self.fc1(x)
        # pass training=self.training so dropout is active only in training mode
        x = F.dropout(x, p=0.1, training=self.training)
        x = F.elu(x)
        x = self.fc2(x)
        # x = torch.sigmoid(x)  # not needed: CrossEntropyLoss below works on raw logits
        return x
net = Net()
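A quick shape check on a dummy batch confirms the 19-in / 2-out wiring (a sketch only; the zeros carry no meaning):

dummy = torch.zeros(4, 19)  # a fake batch: 4 samples x 19 features
print(net(dummy).shape)     # torch.Size([4, 2])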
5. Set the training parameters
# Training parameters
batch_size = 50
num_epochs = 50
learning_rate = 0.01
batch_no = len(x_train) // batch_size  # 801 // 50 = 16 mini-batches; the 1 leftover sample is skipped each epoch (the per-epoch reshuffle below means a different sample is skipped each time)
6. Define the loss function and optimizer
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()  # expects raw logits, which is why forward() applies no sigmoid/softmax
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
7. Training
# Training loop
from sklearn.utils import shuffle
from torch.autograd import Variable

net.train()  # make sure dropout is active during training
for epoch in range(num_epochs):
    if epoch % 5 == 0:
        print('Epoch {}'.format(epoch + 1))
    x_train, y_train = shuffle(x_train, y_train)  # reshuffle every epoch
    # Mini-batch learning
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        x_var = Variable(torch.FloatTensor(x_train[start:end]))
        y_var = Variable(torch.LongTensor(y_train[start:end]))
        # Forward + backward + optimize
        optimizer.zero_grad()
        ypred_var = net(x_var)
        loss = criterion(ypred_var, y_var)
        loss.backward()
        optimizer.step()
8. Validate accuracy
# Validation accuracy
net.eval()  # disable dropout for evaluation
test_var = torch.FloatTensor(x_val)  # no Variable/requires_grad needed for inference
with torch.no_grad():
    result = net(test_var)
# values, labels = torch.max(result, 1)
# print(values, labels)
labels = torch.argmax(result, dim=1)
print(labels)
# tensor([1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
#         0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
#         0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1])

num_right = np.sum(labels.data.numpy() == y_val)
print('Accuracy {:.2f}'.format(num_right / len(y_val)))
# Accuracy 0.81  (varies with the random split and initialization)
9. Prediction
# Prediction (net is still in eval mode)
X_test_var = torch.FloatTensor(X_test)
with torch.no_grad():
    test_result = net(X_test_var)
# print(test_result)
values, labels = torch.max(test_result, 1)
survived = labels.data.numpy()
print(f"Predictions: {survived}")
# Predictions:
# [0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
#  1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
#  1 1 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
#  1 1 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0
#  1 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
#  0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
#  1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1
#  0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0
#  1 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
#  1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
#  0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
#  1 1 1 1 1 1 0 1 0 0 1]
Reposted from: https://blog.csdn.net/weixin_43721000/article/details/130429132
Copyright belongs to the original author, 什么都干的派森. In case of infringement, please contact us for removal.