0


【无线感知】【P7】WIFI 感知实战2- 数据集处理

前言:

  1. 这里面重点介绍一下如何提取训练的数据集(input, label)

这个项目是斯坦福大学和多伦多大学的合作项目,完整的项目地址

https://github.com/ermongroup/Wifi_Activity_Recognition

**论文方案:1小时以上**

  1. 每次训练的时候,通过csv_import()方法加载数据集,

大概需要1个小时才能加载完毕。

**优化方案:只需要2分钟**

  1. 先通过 csv_import 提取训练用的 input、label,并保存为 txt(只运行一次,大概18分钟)
  2. **每次训练的时候,只通过 txt_import** 加载训练的数据集以及标签(2分钟)
  3. 这样可以专注模型优化


目录:

  1. 数据集分帧,标签
  2. 训练数据集加载

**一** 数据集分帧,标签

**1.1: 作用**

  1. 输入 CSI 信号进行分帧,每帧1s(1000行)
  2. 标签进行 one-hot 编码
  3. 运行 cross_vali_data_convert_merge.py
  4. 这个脚本提取 input features & label
  5. 输出保存在 input_files 目录下面,耗时: 15分钟
  6. ![](https://i-blog.csdnimg.cn/direct/fbb9a399fc5442dba4ab647f6805493a.png)

**1.2: 输入 input**

  1. 采用了分帧的思想:
  2. window_size = 1000(窗口大小,大概1s)
  3. slide_size = 200(滑动步长,帧与帧之间存在 overlap,必须小于 window_size)

xx (90, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (180, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (270, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (360, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (450, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (540, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (630, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (720, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (810, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (900, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (990, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (1080, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

xx (1170, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)

.......

**1.3 label**

  1. 采用了 one-hot 编码
  2. threshold = 60
  3. 以 run 为例: 当前帧内标注为 run 的行数 / window_size > threshold/100 时,才认为这一帧是一个 run action

1.4 cross_vali_data_convert_merge.py

  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Mon Jul 22 10:10:48 2024
  4. @author: chengxf2
  5. """
  6. import numpy as np
  7. import csv
  8. import glob
  9. import os
  10. from datetime import datetime
  11. #1s 间隔
  12. window_size = 1000
  13. threshold = 60
  14. #滑动窗口,帧与帧之间存在overlap,less than window_size!!
  15. slide_size = 200
  16. def dataimport(path1, path2):
  17. xx = np.empty([0,window_size,90],float)
  18. yy = np.empty([0,8],float)
  19. ###Input data###
  20. #data import from csv
  21. input_csv_files = sorted(glob.glob(path1))
  22. #'''
  23. print("\n input files", len(input_csv_files))
  24. starttime = datetime.now()
  25. for f in input_csv_files:
  26. #print("input_file_name=",f)
  27. data = [[ float(elm) for elm in v] for v in csv.reader(open(f, "r"))]
  28. tmp1 = np.array(data)
  29. x2 =np.empty([0,window_size,90],float)
  30. #data import by slide window
  31. k = 0
  32. while k <= (len(tmp1) + 1 - 2 * window_size):
  33. x = np.dstack(np.array(tmp1[k:k+window_size, 1:91]).T)
  34. x2 = np.concatenate((x2, x),axis=0)
  35. k += slide_size
  36. xx = np.concatenate((xx,x2),axis=0)
  37. print("\n xx ",xx.shape, "\t x2 ",x2.shape, "\t x",x.shape)
  38. xx = xx.reshape(len(xx),-1)
  39. time_interval = datetime.now()-starttime
  40. print("\n 读取input 时间 ",time_interval.seconds)
  41. ###Annotation data###
  42. #data import from csv
  43. starttime = datetime.now()
  44. #'''
  45. annotation_csv_files = sorted(glob.glob(path2))
  46. for ff in annotation_csv_files:
  47. #print("annotation_file_name=",ff)
  48. ano_data = [[ str(elm) for elm in v] for v in csv.reader(open(ff,"r"))]
  49. tmp2 = np.array(ano_data)
  50. #data import by slide window
  51. y = np.zeros(((len(tmp2) + 1 - 2 * window_size)//slide_size+1,8))
  52. k = 0
  53. while k <= (len(tmp2) + 1 - 2 * window_size):
  54. y_pre = np.stack(np.array(tmp2[k:k+window_size]))
  55. bed = 0
  56. fall = 0
  57. walk = 0
  58. pickup = 0
  59. run = 0
  60. sitdown = 0
  61. standup = 0
  62. noactivity = 0
  63. for j in range(window_size):
  64. if y_pre[j] == "bed":
  65. bed += 1
  66. elif y_pre[j] == "fall":
  67. fall += 1
  68. elif y_pre[j] == "walk":
  69. walk += 1
  70. elif y_pre[j] == "pickup":
  71. pickup += 1
  72. elif y_pre[j] == "run":
  73. run += 1
  74. elif y_pre[j] == "sitdown":
  75. sitdown += 1
  76. elif y_pre[j] == "standup":
  77. standup += 1
  78. else:
  79. noactivity += 1
  80. idx = int(k/slide_size)
  81. if bed > window_size * threshold / 100:
  82. y[idx,:] = np.array([0,1,0,0,0,0,0,0])
  83. elif fall > window_size * threshold / 100:
  84. y[idx,:] = np.array([0,0,1,0,0,0,0,0])
  85. elif walk > window_size * threshold / 100:
  86. y[idx,:] = np.array([0,0,0,1,0,0,0,0])
  87. elif pickup > window_size * threshold / 100:
  88. y[idx,:] = np.array([0,0,0,0,1,0,0,0])
  89. elif run > window_size * threshold / 100:
  90. y[idx,:] = np.array([0,0,0,0,0,1,0,0])
  91. elif sitdown > window_size * threshold / 100:
  92. y[idx,:] = np.array([0,0,0,0,0,0,1,0])
  93. elif standup > window_size * threshold / 100:
  94. y[idx,:] = np.array([0,0,0,0,0,0,0,1])
  95. else:
  96. y[idx,:] = np.array([2,0,0,0,0,0,0,0])
  97. k += slide_size
  98. yy = np.concatenate((yy, y),axis=0)
  99. print(xx.shape,yy.shape)
  100. time_interval = datetime.now()-starttime
  101. print("\n 读取 label 时间 ",time_interval.seconds)
  102. #xx (7111, 90000) yy (7111, 8)
  103. return (xx, yy)
  104. if __name__ == "__main__":
  105. train_dir = "input_files/"
  106. if not os.path.exists(train_dir):
  107. os.mkdir(train_dir)
  108. #“床”、“摔倒”、“上车”、“跑步”、“坐下”、“站起来”、“走路”
  109. labels = ["bed", "fall","pickup","run","sitdown","standup","walk"]
  110. for i, label in enumerate(labels):
  111. print("\n 读取数据集 ",label)
  112. filepath_input = "./Dataset/Data/input_*" + str(label) + "*.csv"
  113. filepath_label = "./Dataset/Data/annotation_*" + str(label) + "*.csv"
  114. outputfilename_input = "./input_files/xx_" + str(window_size) + "_" + str(threshold) + "_" + label + ".csv"
  115. outputfilename_label = "./input_files/yy_" + str(window_size) + "_" + str(threshold) + "_" + label + ".csv"
  116. x,y =dataimport(filepath_input, filepath_label)
  117. print("\n 保存训练数据集",label)
  118. with open(outputfilename_input, "w") as f:
  119. writer = csv.writer(f, lineterminator="\n")
  120. writer.writerows(x)
  121. with open(outputfilename_label, "w") as f:
  122. writer = csv.writer(f, lineterminator="\n")
  123. writer.writerows(y)
  124. print(label + "\t finish!")

**二** 训练数据集加载

  1. 总共大概18分钟(多了保存时间)

2.1 论文里面原始方案,耗时较长,至少需要1个小时

**通过 csv_import 函数提取**

  1. from __future__ import print_function
  2. import gzip
  3. import os
  4. import numpy as np,numpy
  5. import csv
  6. import glob
  7. import pandas as pd
  8. from datetime import datetime
  9. class DataSet(object):
  10. def __init__(self, images, labels, fake_data=False):
  11. assert images.shape[0] == labels.shape[0], (
  12. "images.shape: %s labels.shape: %s" % (images.shape,
  13. labels.shape))
  14. self._num_examples = images.shape[0]
  15. images = images.reshape(images.shape[0],
  16. images.shape[1] * images.shape[2])
  17. self._images = images
  18. self._labels = labels
  19. self._epochs_completed = 0
  20. self._index_in_epoch = 0
  21. @property
  22. def images(self):
  23. return self._images
  24. @property
  25. def labels(self):
  26. return self._labels
  27. @property
  28. def num_examples(self):
  29. return self._num_examples
  30. @property
  31. def epochs_completed(self):
  32. return self._epochs_completed
  33. def next_batch(self, batch_size, fake_data=False):
  34. start = self._index_in_epoch
  35. self._index_in_epoch += batch_size
  36. if self._index_in_epoch > self._num_examples:
  37. # Finished epoch
  38. self._epochs_completed += 1
  39. # Shuffle the data
  40. perm = numpy.arange(self._num_examples)
  41. numpy.random.shuffle(perm)
  42. self._images = self._images[perm]
  43. self._labels = self._labels[perm]
  44. # Start next epoch
  45. start = 0
  46. self._index_in_epoch = batch_size
  47. assert batch_size <= self._num_examples
  48. end = self._index_in_epoch
  49. return self._images[start:end], self._labels[start:end]
  50. def csv_import():
  51. x_dic = {}
  52. y_dic = {}
  53. print("csv file importing...")
  54. for i in ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]:
  55. # xx = np.array([[ float(elm) for elm in v] for v in csv.reader(open("./input_files/xx_1000_60_" + str(i) + ".csv","r"))])
  56. # yy = np.array([[ float(elm) for elm in v] for v in csv.reader(open("./input_files/yy_1000_60_" + str(i) + ".csv","r"))])
  57. # xx = xx[::2,:]
  58. # yy = yy[::2,:]
  59. start_time = datetime.now()
  60. SKIPROW = 2 #Skip every 2 rows -> overlap 800ms to 600ms (To avoid memory error)
  61. num_lines = sum(1 for l in open("./input_files/xx_1000_60_" + str(i) + ".csv"))
  62. skip_idx = [x for x in range(1, num_lines) if x % SKIPROW !=0]
  63. xx = np.array(pd.read_csv("./input_files/xx_1000_60_" + str(i) + ".csv", header=None, skiprows = skip_idx))
  64. yy = np.array(pd.read_csv("./input_files/yy_1000_60_" + str(i) + ".csv", header=None, skiprows = skip_idx))
  65. # eliminate the NoActivity Data
  66. rows, cols = np.where(yy>0)
  67. xx = np.delete(xx, rows[ np.where(cols==0)],0)
  68. yy = np.delete(yy, rows[ np.where(cols==0)],0)
  69. xx = xx.reshape(len(xx),1000,90)
  70. # 1000 Hz to 500 Hz (To avoid memory error)
  71. xx = xx[:,::2,:90]
  72. x_dic[str(i)] = xx
  73. y_dic[str(i)] = yy
  74. time_interval = datetime.now()-start_time
  75. print(str(i), "finished...", "xx=", xx.shape, "yy=", yy.shape,str(i),"耗时s ",time_interval.seconds)
  76. return x_dic["bed"], x_dic["fall"], x_dic["pickup"], x_dic["run"], x_dic["sitdown"], x_dic["standup"], x_dic["walk"], \
  77. y_dic["bed"], y_dic["fall"], y_dic["pickup"], y_dic["run"], y_dic["sitdown"], y_dic["standup"], y_dic["walk"]
  78. csv_import()

2.2 优化方案,2分钟左右(增加了保存 txt 的步骤)

预处理:

  1. 先通过 csv_import 提取 input、label 并保存为 txt(只运行一次,约18分钟)

1: 每次训练的时候,只通过 txt_import 加载数据集

**这个只需要2分钟**

  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Wed Jul 24 13:47:36 2024
  4. @author: chengxf2
  5. """
  6. import csv
  7. from datetime import datetime
  8. import numpy as np
  9. from ast import literal_eval
  10. def csv_import():
  11. #只运行一次,数据预处理
  12. print("csv file importing...")
  13. SKIPROW = 2 #Skip every 2 rows -> overlap 800ms to 600ms (To avoid memory error)
  14. for i in ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]:
  15. start_time = datetime.now()
  16. label =str(i)
  17. xx_fileName = "./input_files/xx_1000_60_" + label + ".csv"
  18. yy_fileName = "./input_files/yy_1000_60_" + label + ".csv"
  19. xx_txt = "./input_files/xx_1000_60_txt" + label + ".csv"
  20. yy_txt = "./input_files/yy_1000_60_txt" + label + ".csv"
  21. xx_file = open(xx_fileName, 'r')
  22. yy_file = open(yy_fileName, 'r')
  23. lineNum = 0
  24. xx_lines = xx_file.readlines()
  25. yy_lines = yy_file.readlines()
  26. totalrows = 0
  27. rowsXX =[]
  28. rowsYY =[]
  29. #数据集种只有七种分类
  30. for line in yy_lines:
  31. NoActivity= int(line[0])
  32. #只保留偶数行,且非NoActivity
  33. if lineNum%SKIPROW == 0 and NoActivity==0:
  34. xx = xx_lines[lineNum]
  35. yy = yy_lines[lineNum]
  36. arrxx = literal_eval(xx)
  37. arryy = literal_eval(yy)
  38. rowsXX.append(arrxx)
  39. rowsYY.append(arryy)
  40. totalrows +=1
  41. lineNum+=1
  42. rowsXX = np.array(rowsXX)
  43. rowsYY = np.array(rowsYY)
  44. # 1000 Hz to 500 Hz (To avoid memory error)
  45. rowsXX = rowsXX.reshape(len(rowsXX), 1000,90)
  46. rowsXX = rowsXX[:,::2,:90]
  47. a = rowsXX.reshape(-1, rowsXX.shape[1]*rowsXX.shape[2]) # 第一个参数为-1,表示自动计算该维度的大小
  48. print("save txt")
  49. #保存
  50. np.savetxt(xx_txt, a,delimiter=',')
  51. np.savetxt(yy_txt, rowsYY,delimiter=',')
  52. # np.savetxt('a.csv', a, fmt='%d', delimiter=',') dtype=np.int
  53. time_interval = datetime.now()-start_time
  54. print("\n label:",label, "\t totalrows ",totalrows, "\t time_interval",time_interval.seconds,np.shape(rowsXX),np.shape(rowsYY))
  55. xx_file.close()
  56. yy_file.close()
  57. print(lineNum)
  58. def txt_import():
  59. #每次训练的时候只执行该函数
  60. x_dic = {}
  61. y_dic = {}
  62. print("txt file importing...")
  63. beg_time = datetime.now()
  64. for i in ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]:
  65. label =str(i)
  66. start_time = datetime.now()
  67. xx_txt = "./input_files/xx_1000_60_txt" + label + ".csv"
  68. yy_txt = "./input_files/yy_1000_60_txt" + label + ".csv"
  69. arrXX = np.loadtxt(xx_txt, delimiter=',',dtype=np.float32)
  70. arrYY = np.loadtxt(yy_txt, delimiter=',',dtype=np.int32)
  71. arrXX = arrXX.reshape(-1, 500,90)
  72. time_interval = datetime.now()-start_time
  73. print(label, "\t 耗时(秒):",time_interval.seconds,"\t xx.shape:",np.shape(arrXX),"\t yy.shape",np.shape(arrYY))
  74. x_dic[label]=arrXX
  75. y_dic[label]=arrYY
  76. total_time = datetime.now()-beg_time
  77. print("\n 总共耗时(分钟): ",total_time.seconds/60)
  78. return x_dic["bed"], x_dic["fall"], x_dic["pickup"], x_dic["run"], x_dic["sitdown"], x_dic["standup"], x_dic["walk"], \
  79. y_dic["bed"], y_dic["fall"], y_dic["pickup"], y_dic["run"], y_dic["sitdown"], y_dic["standup"], y_dic["walk"]
  80. txt_import()

本文转载自: https://blog.csdn.net/chengxf2/article/details/140627052
版权归原作者 明朝百晓生 所有, 如有侵权,请联系我们删除。

“【无线感知】【P7】WIFI 感知实战2- 数据集处理”的评论:

还没有评论