

Python Experiment: Hand-Written 5-Fold Cross-Validation + Library-Based SVM/RFC/KNN Handwritten Digit Recognition

1. Data Loading

First, a quick note on the dataset used here:

Dataset download:

Link: https://pan.baidu.com/s/1Vd2ADHEalSNnuOEcPJD8gQ
Access code: 3hk6

Dataset composition:

Ten digits (0-9), 1934 samples in total, named in the form digit_n. Each sample is a 32*32 txt file (the original images were binarized beforehand).

Data reading code:

    def img2vector(filename):
        # Create a 1 x 1024 vector
        returnVect = np.zeros((1, 1024))
        # Open the data file and read it line by line
        fr = open(filename)
        for i in range(32):
            # Read one line
            lineStr = fr.readline()
            # Convert the first 32 characters of the line to int and store them in the vector
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
        fr.close()
        return returnVect
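
As a quick sanity check, the function can be called on a single sample file. The file name below is only a hypothetical example and depends on how the downloaded samples are actually named:

    # Minimal usage sketch -- 'data1/0_0.txt' is a hypothetical sample file name
    vec = img2vector('data1/0_0.txt')
    print(vec.shape)     # expected: (1, 1024)
    print(vec[0, :32])   # the first row of the 32*32 binary image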

The resulting dataset has the following form:

X: a 1934*1024 matrix; each row is one sample, and the 1024 columns are that sample's 1024 pixel values (binary).

Y: a 1934*1 matrix; each row holds the label of the sample with the same index in X.

Dataset construction code:

    def trainData(trainPath):
        trainfile = os.listdir(trainPath)  # list all file names in the training directory
        Y = np.zeros((len(trainfile), 1))
        # Build a zero matrix with one row per training sample and 1024 columns,
        # 1024 being the total number of pixels per image (32 * 32)
        X = np.zeros((len(trainfile), 1024))
        # The leading digit of each file name is used as the label
        for i in range(len(trainfile)):
            thislabel = trainfile[i].split(".")[0].split("_")[0]
            if len(thislabel) != 0:
                Y[i][0] = int(thislabel)  # store the label
            X[i, :] = img2vector(trainPath + "/" + trainfile[i])  # write the sample into the zero matrix
        return X, Y
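
A minimal usage sketch (the directory name 'data1' is the same one used in the cross-validation code below):

    # Load the full dataset and check its shape
    X, Y = trainData('data1')
    print(X.shape, Y.shape)   # expected: (1934, 1024) (1934, 1)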

2. K-Fold Cross-Validation

The principle of K-fold cross-validation is explained in detail in many other posts, so I won't repeat it here and will go straight to the code.

(My implementation is somewhat clumsy, but it works.)

Outline of the approach:

  1. Shuffle the indices of the samples belonging to each digit.

  2. Since the digits have different sample counts, balance each digit to 200 samples: pad by randomly re-sampling when there are too few, and randomly drop the surplus when there are too many.

  3. Build the shuffled dataset from the shuffled indices.

  4. Extract five folds, each containing 40 samples of every digit.

Full code:

    # import dataset
    X, Y = trainData('data1')
    size = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  # number of samples for each digit
    for i in range(1934):
        size[int(Y[i][0])] += 1
    # shuffle and balance
    position = []
    for i in range(10):
        left = 0
        right = 0
        for k in range(i + 1):
            right += size[k]
        left = right - size[i]  # index boundaries of digit i in X
        ran = list(range(left, right))
        random.shuffle(ran)  # shuffled index list for this digit
        if len(ran) < 200:
            for j in range(200 - len(ran)):
                ran.append(np.random.randint(left, right))  # pad by randomly re-sampling
        if len(ran) > 200:
            for j in range(len(ran) - 200):
                del ran[-1]  # drop the surplus (the list is already shuffled, so this is random)
        position.append(ran)
    X_shuffled = np.zeros((2000, 1024))
    for i in range(10):
        for j in range(200):
            x = X[position[i][j]]
            X_shuffled[j + 200 * i] = x  # rebuild the dataset from the shuffled indices
    # split into 5 parts
    X_part = []
    for i in range(5):
        X_split = np.zeros((400, 1024))
        for j in range(10):
            for k in range(40):
                X_split[k + 40 * j, :] = X_shuffled[k + 200 * j + 40 * i, :]
        X_part.append(X_split)
    Y_part = []
    for i in range(10):
        for j in range(40):
            Y_part.append(i)  # matching label list (identical for every fold)
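
For comparison, scikit-learn's StratifiedKFold can produce a stratified 5-fold split directly. The sketch below is only an alternative to the manual bookkeeping above, not the method used in the rest of this post; it works on the original 1934 samples and does not pad each digit to 200:

    from sklearn.model_selection import StratifiedKFold

    # Alternative sketch: stratified 5-fold indices straight from scikit-learn
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, Y.ravel())):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y.ravel()[train_idx], Y.ravel()[test_idx]
        print("fold %d: %d train / %d test samples" % (fold + 1, len(train_idx), len(test_idx)))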

3. Digit Recognition with Library Classifiers

3.1 RFC

    # K-fold loop
    score = []
    for i in range(5):
        X_test = X_part[i]
        Y_test = Y_part
        X_train = np.concatenate((X_part[(i + 1) % 5], X_part[(i + 2) % 5],
                                  X_part[(i + 3) % 5], X_part[(i + 4) % 5]), axis=0)
        Y_train = Y_test * 4  # every fold has the same label layout, so repeating the labels 4 times matches X_train
        clf = RandomForestClassifier(n_estimators=200, criterion='gini', max_depth=None,
                                     min_samples_split=3, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0,
                                     max_features='sqrt',  # 'auto' (and min_impurity_split) were removed in recent scikit-learn
                                     max_leaf_nodes=None, min_impurity_decrease=0.0,
                                     bootstrap=True, oob_score=False, n_jobs=1,
                                     random_state=None, verbose=0, warm_start=False,
                                     class_weight=None)
        # train
        clf.fit(X_train, Y_train)
        score.append(clf.score(X_test, Y_test))
        Y_pred = clf.predict(X_test)
        cm = confusion_matrix(Y_test, Y_pred)
        plt.matshow(cm)
        plt.title('fold %d' % (i + 1))
        plt.show()
    acc_sum = 0
    for i in range(5):
        acc_sum += score[i]
    print("Average Acc: %f" % (acc_sum / 5))
3.2 SVM

    score = []
    for i in range(5):
        X_test = X_part[i]
        Y_test = Y_part
        X_train = np.concatenate((X_part[(i + 1) % 5], X_part[(i + 2) % 5],
                                  X_part[(i + 3) % 5], X_part[(i + 4) % 5]), axis=0)
        Y_train = Y_test * 4
        clf = svm.SVC(C=200.0, kernel='rbf', degree=3, gamma='auto',
                      coef0=0.0, shrinking=True, probability=False, tol=0.001,
                      cache_size=200, class_weight=None, verbose=False,
                      max_iter=-1, decision_function_shape='ovr',
                      random_state=None)
        # train
        clf.fit(X_train, Y_train)
        score.append(clf.score(X_test, Y_test))
        Y_pred = clf.predict(X_test)
        cm = confusion_matrix(Y_test, Y_pred)
        plt.matshow(cm)
        plt.title('fold %d' % (i + 1))
        plt.show()
    acc_sum = 0
    for i in range(5):
        acc_sum += score[i]
    print("Average Acc: %f" % (acc_sum / 5))
3.3 KNN

    score = []
    plt.figure()
    for i in range(5):
        X_test = X_part[i]
        Y_test = Y_part
        X_train = np.concatenate((X_part[(i + 1) % 5], X_part[(i + 2) % 5],
                                  X_part[(i + 3) % 5], X_part[(i + 4) % 5]), axis=0)
        Y_train = Y_test * 4
        clf = KNeighborsClassifier(n_neighbors=3, weights='uniform',
                                   algorithm='auto', leaf_size=30,
                                   p=2, metric='minkowski', metric_params=None,
                                   n_jobs=None)
        # train
        clf.fit(X_train, Y_train)
        score.append(clf.score(X_test, Y_test))
        Y_pred = clf.predict(X_test)
        cm = confusion_matrix(Y_test, Y_pred)
        plt.matshow(cm)
        plt.title('fold %d' % (i + 1))
        plt.show()
    acc_sum = 0
    for i in range(5):
        acc_sum += score[i]
    print("Average Acc: %f" % (acc_sum / 5))
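
The three loops above can also be reproduced in a few lines with scikit-learn's cross_val_score. This is only an alternative sketch, not the code used in this post: it assumes all three classifiers are imported in one session (see section 4) and runs on the original X, Y from trainData instead of the balanced X_part/Y_part folds:

    from sklearn.model_selection import StratifiedKFold, cross_val_score

    # Alternative sketch: 5-fold average accuracy for each of the three classifiers
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for name, clf in [("RFC", RandomForestClassifier(n_estimators=200, min_samples_split=3)),
                      ("SVM", svm.SVC(C=200.0, kernel='rbf', gamma='auto')),
                      ("KNN", KNeighborsClassifier(n_neighbors=3))]:
        scores = cross_val_score(clf, X, Y.ravel(), cv=cv)
        print("%s Average Acc: %f" % (name, scores.mean()))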

4. Libraries Used

    # KNN
    import random
    import numpy as np
    import os
    from sklearn.neighbors import KNeighborsClassifier
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix
    # SVM
    import random
    import numpy as np
    import os
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix
    from sklearn import svm
    # RFC
    import random
    import numpy as np
    import os
    from sklearn.ensemble import RandomForestClassifier
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix
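
If everything is run in a single script, the three lists above collapse into one import block:

    # Combined imports covering all three models
    import os
    import random
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import svm
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import confusion_matrix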

Reposted from: https://blog.csdn.net/weixin_43909400/article/details/122201049
Copyright belongs to the original author 云龙弓手; please contact us for removal in case of infringement.
