前言:
这里面重点介绍一下如何提取训练的数据集(input, label)
这个项目是斯坦福大学和多伦多大学的合作项目,完整的项目地址
https://github.com/ermongroup/Wifi_Activity_Recognition
**论文方案:1小时以上**
每次训练的时候,通过csv_import()方法加载数据集,
大概需要1个小时才能加载完毕。
**优化方案:只需要 2 分钟**
1: 先通过 csv_import 提取训练input,label 到txt(只运行一次,大概18分钟)
2: **每次训练的时候,只通过 txt_import** 加载训练的数据集以及标签(2分钟),
这样可以专注模型优化
目录:
- 数据集分帧,标签
- 训练数据集加载
**一、数据集分帧与标签**
**1.1 作用**
1. 对输入的 CSI 信号进行分帧,每帧 1s(1000 行)
2. 对标签进行 one-hot 编码
运行 cross_vali_data_convert_merge.py .
这个脚本提取 input features & label 到
input_files 目录下面. 耗时: 15分钟.
![](https://i-blog.csdnimg.cn/direct/fbb9a399fc5442dba4ab647f6805493a.png)
**1.2 输入 input**
采用了分帧的思想:
window_size = 1000 (窗口大小,大概 1s)
slide_size = 200 (滑动步长,帧与帧之间存在 overlap,必须小于 window_size)
xx (90, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (180, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (270, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (360, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (450, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (540, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (630, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (720, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (810, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (900, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (990, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (1080, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
xx (1170, 1000, 90) x2 (90, 1000, 90) x (1, 1000, 90)
.......
**1.3 label**
采用了 one-hot 编码
threshold = 60
以 run 为例:一个窗口内标注为 run 的行数 / window_size > threshold/100(即超过 60%)时,才认为该窗口是一个 run action
1.4 cross_vali_data_convert_merge.py
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 22 10:10:48 2024
@author: chengxf2
"""
import numpy as np
import csv
import glob
import os
from datetime import datetime
# Length of one frame in rows (roughly 1 s of samples).
window_size = 1000
# Percentage of a frame that must carry one activity for the frame to get
# that activity's label.
threshold = 60
# Step between consecutive frames; keep it below window_size so that
# neighbouring frames overlap.
slide_size = 200

# Number of feature columns taken from every input row (columns 1..90;
# column 0 is skipped).
_NUM_FEATURES = 90
# Width of a label row: column 0 flags "no activity", columns 1..7 are the
# activities.
_NUM_CLASSES = 8
# (annotation text, label column) pairs in the order the activities are tested.
_ACTIVITY_COLUMNS = (
    ("bed", 1),
    ("fall", 2),
    ("walk", 3),
    ("pickup", 4),
    ("run", 5),
    ("sitdown", 6),
    ("standup", 7),
)


def _frame_starts(num_rows):
    """Return the first row index of every frame cut from num_rows rows."""
    # Same bound as `k <= num_rows + 1 - 2 * window_size`; the range is empty
    # when the recording is too short to yield a single frame.
    return range(0, num_rows + 2 - 2 * window_size, slide_size)


def _load_input_frames(path):
    """Cut one input CSV into an array of shape (frames, window_size, 90)."""
    with open(path, "r", newline="") as handle:
        samples = np.array(
            [[float(elm) for elm in row] for row in csv.reader(handle)]
        )
    frames = [
        samples[k:k + window_size, 1:1 + _NUM_FEATURES]
        for k in _frame_starts(len(samples))
    ]
    if not frames:
        return np.empty([0, window_size, _NUM_FEATURES], float)
    return np.stack(frames)


def _load_label_frames(path):
    """Turn one annotation CSV into one label row of width 8 per frame."""
    with open(path, "r", newline="") as handle:
        # Only the first field of every row is used as the annotation text.
        labels = np.array(
            [row[0] if row else "" for row in csv.reader(handle)], dtype=str
        )
    starts = _frame_starts(len(labels))
    y = np.zeros((len(starts), _NUM_CLASSES))
    min_rows = window_size * threshold / 100
    for idx, k in enumerate(starts):
        frame = labels[k:k + window_size]
        for name, column in _ACTIVITY_COLUMNS:
            if np.count_nonzero(frame == name) > min_rows:
                y[idx, column] = 1
                break
        else:
            # No activity dominates the frame: column 0 is set to 2.
            y[idx, 0] = 2
    return y


def dataimport(path1, path2):
    """Frame every input/annotation CSV matched by two glob patterns.

    Each file is cut into frames of ``window_size`` rows taken every
    ``slide_size`` rows. An input frame keeps columns 1..90 of its rows and
    is flattened to one row of ``window_size * 90`` values. An annotation
    frame becomes a row of 8 values: a 1 in the column of the activity that
    fills more than ``threshold`` percent of the frame, or a 2 in column 0
    when no activity does.

    Files are processed in sorted name order. Nothing checks that input and
    annotation files pair up; presumably both patterns are expected to expand
    to the same recordings in the same order.

    Args:
        path1: glob pattern of the input CSV files.
        path2: glob pattern of the annotation CSV files.

    Returns:
        Tuple ``(xx, yy)``: ``xx`` has shape ``(frames, window_size * 90)``
        and ``yy`` has shape ``(frames, 8)``. Files too short to hold a frame
        contribute no rows.
    """
    ### Input data ###
    input_csv_files = sorted(glob.glob(path1))
    print("\n input files", len(input_csv_files))
    starttime = datetime.now()
    input_parts = [np.empty([0, window_size, _NUM_FEATURES], float)]
    total_frames = 0
    for f in input_csv_files:
        x2 = _load_input_frames(f)
        input_parts.append(x2)
        total_frames += len(x2)
        # Progress line: running total, frames of this file, single frame.
        print("\n xx ", (total_frames,) + x2.shape[1:],
              "\t x2 ", x2.shape, "\t x", (1,) + x2.shape[1:])
    # Concatenate once instead of growing the array inside the loop.
    xx = np.concatenate(input_parts, axis=0)
    # An explicit width keeps the reshape valid when there are no frames.
    xx = xx.reshape(len(xx), window_size * _NUM_FEATURES)
    time_interval = datetime.now() - starttime
    print("\n 读取input 时间 ", time_interval.seconds)

    ### Annotation data ###
    starttime = datetime.now()
    annotation_csv_files = sorted(glob.glob(path2))
    label_parts = [np.empty([0, _NUM_CLASSES], float)]
    for ff in annotation_csv_files:
        label_parts.append(_load_label_frames(ff))
    yy = np.concatenate(label_parts, axis=0)
    print(xx.shape, yy.shape)
    time_interval = datetime.now() - starttime
    print("\n 读取 label 时间 ", time_interval.seconds)
    return (xx, yy)
if __name__ == "__main__":
    # Directory that receives the per-activity feature and label CSV files.
    train_dir = "input_files/"
    # exist_ok makes this safe when the directory already exists.
    os.makedirs(train_dir, exist_ok=True)

    # Activities: bed, fall, pick up, run, sit down, stand up, walk.
    labels = ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]
    for label in labels:
        print("\n 读取数据集 ", label)
        filepath_input = "./Dataset/Data/input_*" + str(label) + "*.csv"
        filepath_label = "./Dataset/Data/annotation_*" + str(label) + "*.csv"
        # File names carry the framing parameters: <xx|yy>_<window>_<threshold>_<label>.csv
        suffix = str(window_size) + "_" + str(threshold) + "_" + label + ".csv"
        outputfilename_input = os.path.join(train_dir, "xx_" + suffix)
        outputfilename_label = os.path.join(train_dir, "yy_" + suffix)

        x, y = dataimport(filepath_input, filepath_label)

        print("\n 保存训练数据集", label)
        # newline="" lets the csv writer control line endings itself.
        with open(outputfilename_input, "w", newline="") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerows(x)
        with open(outputfilename_label, "w", newline="") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerows(y)
        print(label + "\t finish!")
**二、训练数据集加载**
总共大概18分钟(多了保存时间)
2.1 论文里面原始方案,耗时较长,至少需要1个小时
**通过 csv_import 函数提取**
from __future__ import print_function
import gzip
import os
import numpy as np,numpy
import csv
import glob
import pandas as pd
from datetime import datetime
class DataSet(object):
    """In-memory collection of samples and labels served in mini-batches."""

    def __init__(self, images, labels, fake_data=False):
        """Store the samples flattened to two dimensions.

        Args:
            images: array whose first axis indexes the samples; axes 1 and 2
                are merged into one.
            labels: array with as many rows as ``images``.
            fake_data: accepted but not used.
        """
        sample_count = images.shape[0]
        assert sample_count == labels.shape[0], (
            "images.shape: %s labels.shape: %s" % (images.shape,
                                                   labels.shape))
        self._num_examples = sample_count
        flat_width = images.shape[1] * images.shape[2]
        self._images = images.reshape(sample_count, flat_width)
        self._labels = labels
        self._epochs_completed = 0
        self._index_in_epoch = 0

    @property
    def images(self):
        """Flattened samples in their current order."""
        return self._images

    @property
    def labels(self):
        """Labels in the same order as ``images``."""
        return self._labels

    @property
    def num_examples(self):
        """Number of samples held."""
        return self._num_examples

    @property
    def epochs_completed(self):
        """How many times the data has been reshuffled by ``next_batch``."""
        return self._epochs_completed

    def next_batch(self, batch_size, fake_data=False):
        """Return the next ``batch_size`` samples and their labels.

        When the request runs past the end of the data, the epoch counter is
        incremented, samples and labels are shuffled with one shared
        permutation, and the batch is taken from the start of the new order.
        ``fake_data`` is accepted but not used.
        """
        first = self._index_in_epoch
        last = first + batch_size
        self._index_in_epoch = last
        if last > self._num_examples:
            # Epoch finished: reshuffle and restart from the beginning.
            self._epochs_completed += 1
            order = numpy.arange(self._num_examples)
            numpy.random.shuffle(order)
            self._images = self._images[order]
            self._labels = self._labels[order]
            first, last = 0, batch_size
            self._index_in_epoch = last
            assert batch_size <= self._num_examples
        return self._images[first:last], self._labels[first:last]
def csv_import(input_dir="./input_files", window_size=1000, threshold=60,
               n_channels=90):
    """Load the framed features and labels of all seven activities.

    For every activity the files ``xx_<window_size>_<threshold>_<name>.csv``
    and ``yy_<window_size>_<threshold>_<name>.csv`` are read from
    ``input_dir``. Only every second row (frame) is loaded, rows whose first
    label column is greater than 0 ("no activity") are dropped, and every
    remaining frame keeps every second sample.

    Args:
        input_dir: directory holding the xx/yy CSV files.
        window_size: number of samples per frame stored in one xx row.
        threshold: threshold value that is part of the file names.
        n_channels: number of values per sample.

    Returns:
        A 14-tuple: the feature arrays for bed, fall, pickup, run, sitdown,
        standup and walk, followed by the label arrays in the same order.
        Feature arrays have three axes (frame, kept sample, channel).
    """
    activities = ("bed", "fall", "pickup", "run", "sitdown", "standup", "walk")
    # Skip every 2 rows -> overlap 800ms to 600ms (to avoid memory error).
    SKIPROW = 2

    def skip_row(row_index):
        # Evaluated by pandas per row index; True means "do not load this row".
        # This replaces a pre-computed index list, so the file no longer has
        # to be read once just to count its lines.
        return row_index % SKIPROW != 0

    x_dic = {}
    y_dic = {}
    print("csv file importing...")
    for name in activities:
        start_time = datetime.now()
        suffix = "%s_%s_%s.csv" % (window_size, threshold, name)
        xx_path = os.path.join(input_dir, "xx_" + suffix)
        yy_path = os.path.join(input_dir, "yy_" + suffix)

        xx = np.array(pd.read_csv(xx_path, header=None, skiprows=skip_row))
        yy = np.array(pd.read_csv(yy_path, header=None, skiprows=skip_row))

        # Eliminate the NoActivity data: rows with a positive first label column.
        keep = ~(yy[:, 0] > 0)
        xx = xx[keep]
        yy = yy[keep]

        xx = xx.reshape(len(xx), window_size, n_channels)
        # Keep every second sample (1000 Hz to 500 Hz, to avoid memory error).
        xx = xx[:, ::2, :]

        x_dic[name] = xx
        y_dic[name] = yy
        time_interval = datetime.now() - start_time
        print(name, "finished...", "xx=", xx.shape, "yy=", yy.shape, name,
              "耗时s ", time_interval.seconds)

    return tuple([x_dic[name] for name in activities]
                 + [y_dic[name] for name in activities])
# Runs the loader as soon as this script executes; the returned arrays are
# not kept here.
csv_import()
2.2 优化方案,2分钟左右(增加了保存 txt 的步骤)
1: 预处理:先通过 csv_import 提取 input、label 到 txt(只运行一次,约18分钟)
2: 每次训练的时候,只通过 txt_import 加载数据集
**这一步只需要2分钟**
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 24 13:47:36 2024
@author: chengxf2
"""
import csv
from datetime import datetime
import numpy as np
from ast import literal_eval
def csv_import(input_dir="./input_files", window_size=1000, threshold=60,
               n_channels=90):
    """One-off preprocessing: reduce the framed CSVs to compact text files.

    For each of the seven activities the files
    ``xx_<window_size>_<threshold>_<label>.csv`` and
    ``yy_<window_size>_<threshold>_<label>.csv`` in ``input_dir`` are read in
    lockstep, one line at a time. A row is kept when its line number is a
    multiple of 2 and the first label value is 0 (not "no activity"). Every
    kept frame keeps every second sample. The results are written next to
    the inputs as ``xx_<window_size>_<threshold>_txt<label>.csv`` and
    ``yy_<window_size>_<threshold>_txt<label>.csv``.

    Labels are written as integers so they can later be loaded with an
    integer dtype.

    Args:
        input_dir: directory holding the input files and receiving the output.
        window_size: number of samples per frame stored in one xx row.
        threshold: threshold value that is part of the file names.
        n_channels: number of values per sample.

    Returns:
        None. The results are the files written to ``input_dir``.
    """
    print("csv file importing...")
    # Skip every 2 rows -> overlap 800ms to 600ms (to avoid memory error).
    SKIPROW = 2
    # Keeping every second sample shrinks a frame to this many samples.
    half_rows = len(range(0, window_size, 2))
    for i in ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]:
        start_time = datetime.now()
        label = str(i)
        stem = "%s_%s" % (window_size, threshold)
        xx_fileName = "%s/xx_%s_%s.csv" % (input_dir, stem, label)
        yy_fileName = "%s/yy_%s_%s.csv" % (input_dir, stem, label)
        xx_txt = "%s/xx_%s_txt%s.csv" % (input_dir, stem, label)
        yy_txt = "%s/yy_%s_txt%s.csv" % (input_dir, stem, label)

        lineNum = 0
        rowsXX = []
        rowsYY = []
        # Stream both files together instead of loading them whole; the
        # context manager closes them even if a row fails to parse.
        with open(xx_fileName, 'r') as xx_file, open(yy_fileName, 'r') as yy_file:
            for xx_line, yy_line in zip(xx_file, yy_file):
                # Only even rows are candidates.
                if lineNum % SKIPROW == 0:
                    arryy = [float(v) for v in yy_line.split(",")]
                    # The data set has seven classes; a non-zero first column
                    # marks a frame without a dominant activity.
                    if arryy[0] == 0:
                        arrxx = np.array([float(v) for v in xx_line.split(",")])
                        # Downsample immediately (1000 Hz to 500 Hz) so only
                        # half of each frame is kept in memory.
                        frame = arrxx.reshape(window_size, n_channels)[::2]
                        rowsXX.append(frame.ravel())
                        rowsYY.append(arryy)
                lineNum += 1

        totalrows = len(rowsYY)
        rowsXX = np.array(rowsXX).reshape(totalrows, half_rows, n_channels)
        rowsYY = np.array(rowsYY)
        # One flattened frame per output line.
        a = rowsXX.reshape(totalrows, half_rows * n_channels)
        print("save txt")
        np.savetxt(xx_txt, a, delimiter=',')
        np.savetxt(yy_txt, rowsYY.astype(int), fmt='%d', delimiter=',')
        time_interval = datetime.now() - start_time
        print("\n label:", label, "\t totalrows ", totalrows, "\t time_interval",
              time_interval.seconds, np.shape(rowsXX), np.shape(rowsYY))
        print(lineNum)
def txt_import(input_dir="./input_files", window_size=1000, threshold=60,
               n_channels=90):
    """Load the preprocessed text files; the only loader needed per training run.

    For each of the seven activities the files
    ``xx_<window_size>_<threshold>_txt<label>.csv`` and
    ``yy_<window_size>_<threshold>_txt<label>.csv`` are read from
    ``input_dir``.

    Args:
        input_dir: directory holding the preprocessed files.
        window_size: frame length before every second sample was dropped.
        threshold: threshold value that is part of the file names.
        n_channels: number of values per sample.

    Returns:
        A 14-tuple: the float32 feature arrays for bed, fall, pickup, run,
        sitdown, standup and walk, each shaped
        ``(frames, ceil(window_size / 2), n_channels)``, followed by the
        int32 label arrays in the same order, each with two axes.
    """
    activities = ["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]
    x_dic = {}
    y_dic = {}
    print("txt file importing...")
    beg_time = datetime.now()
    # Number of samples left per frame after keeping every second one.
    half_rows = len(range(0, window_size, 2))
    for i in activities:
        label = str(i)
        start_time = datetime.now()
        stem = "%s_%s" % (window_size, threshold)
        xx_txt = "%s/xx_%s_txt%s.csv" % (input_dir, stem, label)
        yy_txt = "%s/yy_%s_txt%s.csv" % (input_dir, stem, label)

        arrXX = np.loadtxt(xx_txt, delimiter=',', dtype=np.float32)
        # Parse labels as floats first, then cast: label files written in
        # float notation ("1.000000e+00") cannot be parsed reliably with an
        # integer dtype. ndmin=2 keeps a one-row file two-dimensional.
        arrYY = np.loadtxt(yy_txt, delimiter=',', dtype=np.float32,
                           ndmin=2).astype(np.int32)
        arrXX = arrXX.reshape(-1, half_rows, n_channels)

        time_interval = datetime.now() - start_time
        print(label, "\t 耗时(秒):", time_interval.seconds, "\t xx.shape:",
              np.shape(arrXX), "\t yy.shape", np.shape(arrYY))
        x_dic[label] = arrXX
        y_dic[label] = arrYY

    total_time = datetime.now() - beg_time
    print("\n 总共耗时(分钟): ", total_time.seconds / 60)
    return tuple([x_dic[name] for name in activities]
                 + [y_dic[name] for name in activities])
# Runs the fast loader as soon as this script executes; the returned arrays
# are not kept here.
txt_import()
版权归原作者 明朝百晓生 所有, 如有侵权,请联系我们删除。