-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDataPreprocessing.py
More file actions
173 lines (145 loc) · 8.93 KB
/
DataPreprocessing.py
File metadata and controls
173 lines (145 loc) · 8.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
import glob
import os
def _is_selected(path, switchAlgo):
    """Return True when the file at *path* belongs to the set chosen by *switchAlgo*."""
    # Match on the file name only, so a directory called e.g. 'Finals'
    # cannot accidentally exclude every file beneath it.
    name = os.path.basename(path)
    if 'Final' in name:
        # Outputs of earlier runs live in the same directory as the inputs.
        return False
    isNewPerson = 'newPerson' in name
    isSingleTest = 'singleTest' in name
    if switchAlgo == 0:
        return not (isNewPerson or isSingleTest)
    if switchAlgo == 2:
        return isNewPerson or isSingleTest
    if switchAlgo == 3:
        return isSingleTest
    if switchAlgo == 4:
        return not isSingleTest
    # switchAlgo == 1 (and any unrecognised value): take every raw file.
    return True


def _leading_run_length(lines):
    """Return how many leading lines share the first comma-separated field of lines[0]."""
    firstWord = lines[0].split(', ')[0]
    count = 0
    for line in lines:
        if line.split(', ')[0] != firstWord:
            break
        count += 1
    return count


def _drop_evenly(lines, targetLength):
    """Return *lines* shortened to *targetLength* by dropping evenly spaced entries."""
    surplus = len(lines) - targetLength
    if surplus <= 0:
        return lines
    # Floor division keeps the stride an integer (as it was under Python 2),
    # so the drop positions are valid, distinct, in-range indices.
    step = len(lines) // surplus
    dropped = set(range(0, surplus * step, step))
    return [line for index, line in enumerate(lines) if index not in dropped]


def _align_sensor_lines(firstLines, secondLines, trimSize, windowSize):
    """Equalise, trim and window two sensor streams; return the two resulting lists."""
    common = min(len(firstLines), len(secondLines))
    # Only the longer stream is actually shortened; the other is returned as is.
    firstLines = _drop_evenly(firstLines, common)
    secondLines = _drop_evenly(secondLines, common)
    # Trim both ends only when something would be left afterwards.
    if common > 2 * trimSize:
        firstLines = firstLines[trimSize:common - trimSize]
        secondLines = secondLines[trimSize:common - trimSize]
    # Keep a whole number of windows.
    keep = len(firstLines) - len(firstLines) % windowSize
    return firstLines[:keep], secondLines[:keep]


def data_preprocessing(sports=('Badminton', 'Basketball', 'Foosball', 'Running', 'Skating', 'Walking'),
                       secondsToKeep=30,
                       trimLength=15,
                       switchAlgo=0,
                       samplingRate=50,
                       dataDir='../Data/'):
    """
    Pre-processes the raw data files to make sure the data from both sensors is of the same size.
    Discards some amount of raw data at the start and end of the files to remove miscellaneous activity
    that gets recorded while starting and ending data collection.
    Then divides the data into integral samples of given size.

    Each raw file is expected to hold all lines of one sensor followed by all lines of the other;
    the sensor is identified by the first ', '-separated field of a line (a field containing
    'gyro' means gyroscope, anything else is treated as accelerometer). For every sport the
    output file receives the accelerometer lines of all processed files followed by the
    gyroscope lines of all processed files. Raw files are processed in sorted name order so
    the output is reproducible, and empty files are skipped.

    :param sports: Iterable of sport names involved (one sub-directory of dataDir per sport)
    :param secondsToKeep: Number of seconds to keep in each sample (must be positive)
    :param trimLength: Number of seconds to trim at the start and end of a raw data file
    :param switchAlgo: testing scenario...
    -- 0: [FINAL/TRAINING] use only original data for training and testing (no newPerson, no singleTest)
    -- 1: [FINAL/TRAINING] use everything as one block (no separate newPerson, singleTest; they go into training)
    -- 2: [NEWPERSON/TEST] create for newPerson and singleTest for separate testing (use with 0)
    -- 3: [NEWPERSON/TEST] create only for singleTest for separate testing (use with 4)
    -- 4: [FINAL/TRAINING] create original + newPerson for training (use with 3)
    :param samplingRate: Number of lines per second and per sensor in the raw files
    :param dataDir: Directory that contains one sub-directory per sport
    :return: Nothing; writes 'Final.csv' (or 'newPersonFinal.csv' for switchAlgo 2 and 3)
             into each sport directory
    """
    windowSize = samplingRate * secondsToKeep
    trimSize = samplingRate * trimLength

    for sport in sports:
        source_dir = os.path.join(dataDir, sport)
        outputName = 'newPersonFinal.csv' if switchAlgo in (2, 3) else 'Final.csv'
        finalOutputFile = os.path.join(source_dir, outputName)

        accLines = []
        gyroLines = []
        for path in sorted(glob.glob(os.path.join(source_dir, '*.csv'))):
            if not _is_selected(path, switchAlgo):
                continue

            print('\nFile: %s' % path)
            with open(path, 'r') as fileStream:
                lines = fileStream.readlines()
            if not lines:
                # Nothing to split; an empty file has no sensor header to detect.
                print('Skipping empty file')
                continue

            # The first sensor's block is the leading run of lines with the same first field.
            counter = _leading_run_length(lines)
            secondCounter = len(lines) - counter
            print('1st sensor samples = %d' % counter)
            print('2nd sensor samples = %d' % secondCounter)

            finalCounterLines, finalSecondCounterLines = _align_sensor_lines(
                lines[:counter], lines[counter:], trimSize, windowSize)
            print('Adjusted 1st sensor samples = %d' % len(finalCounterLines))
            print('Adjusted 2nd sensor samples = %d' % len(finalSecondCounterLines))

            if "gyro" in lines[0].split(', ')[0].lower():
                gyroLines.extend(finalCounterLines)
                accLines.extend(finalSecondCounterLines)
            else:
                accLines.extend(finalCounterLines)
                gyroLines.extend(finalSecondCounterLines)

        # Written only after every file was processed, so a failure part-way
        # through leaves neither a truncated output nor scratch files behind.
        with open(finalOutputFile, 'w') as outFile:
            outFile.writelines(accLines)
            outFile.writelines(gyroLines)