SportSort/DataPreprocessing.py at master · anujkumar93/SportSort · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import warnings
# Silence every DeprecationWarning issued after this point, process-wide.
warnings.filterwarnings("ignore", category=DeprecationWarning)

import numpy as np
import glob
import os

def _should_skip(path, switchAlgo):
    """Return True when the file at ``path`` must be ignored for ``switchAlgo``."""
    # Outputs of earlier runs (Final.csv, newPersonFinal.csv) sit next to the
    # raw recordings and match the same glob, so they are always excluded.
    if 'Final' in path:
        return True
    is_new_person = 'newPerson' in path
    is_single_test = 'singleTest' in path
    if switchAlgo == 0:
        return is_new_person or is_single_test
    if switchAlgo == 2:
        return not (is_new_person or is_single_test)
    if switchAlgo == 3:
        return not is_single_test
    if switchAlgo == 4:
        return is_single_test
    # switchAlgo == 1 (or any other value): every raw file is used.
    return False


def _subsample(lines, targetLen):
    """Drop evenly spaced entries of ``lines`` until ``targetLen`` remain."""
    surplus = len(lines) - targetLen
    if surplus <= 0:
        return lines
    # Floor division keeps the indices integral and distinct (step >= 1),
    # so exactly ``surplus`` entries are removed.
    step = len(lines) // surplus
    dropped = set(j * step for j in range(surplus))
    return [line for idx, line in enumerate(lines) if idx not in dropped]


def _align_trim_chunk(firstLines, secondLines, secondsToKeep, trimLength, samplingRate):
    """Equalise two sensor blocks, trim both ends, and cut to whole samples."""
    common = min(len(firstLines), len(secondLines))
    firstLines = _subsample(firstLines, common)
    secondLines = _subsample(secondLines, common)

    # Recordings too short to survive trimming at both ends are left untrimmed.
    trim = samplingRate * trimLength
    if common > 2 * trim:
        # Explicit end index (not ``-trim``) so that trim == 0 keeps everything.
        firstLines = firstLines[trim:common - trim]
        secondLines = secondLines[trim:common - trim]

    # Keep only an integral number of ``secondsToKeep``-long samples.
    kept = len(firstLines)
    kept -= kept % (samplingRate * secondsToKeep)
    return firstLines[:kept], secondLines[:kept]


def data_preprocessing(sports=('Badminton','Basketball','Foosball','Running','Skating','Walking'),
                       secondsToKeep=30,
                       trimLength=15,
                       switchAlgo=0,
                       samplingRate=50):
    """
    Pre-processes the raw data files to make sure the data from both sensors is of the same size.
    Discards some amount of raw data at the start and end of the files to remove miscellaneous activity
    that gets recorded while starting and ending data collection.
    Then divides the data into integral samples of given size.

    Every ``*.csv`` file in ``../Data/<sport>`` is expected to hold the lines of one sensor
    followed by the lines of the other one; the first ``', '``-separated field of a line
    identifies the sensor. A file whose first field contains "gyro" (case-insensitive) is
    treated as gyroscope-first, any other file as accelerometer-first. For each sport the
    accelerometer lines of all selected files, followed by the gyroscope lines, are written to
    ``../Data/<sport>/Final.csv`` (``newPersonFinal.csv`` when ``switchAlgo`` is 2 or 3).
    Files are processed in the order returned by ``glob.glob``. Empty files are skipped.
    Progress information is printed to standard output.

    :param sports: Iterable of sport names involved
    :param secondsToKeep: Number of seconds to keep in each sample
    :param trimLength: Number of seconds to trim at the start and end of a raw data file
    :param switchAlgo: testing scenario...
        -- 0: [FINAL/TRAINING] use only original data for training and testing (no newPerson, no singleTest)
        -- 1: [FINAL/TRAINING] use everything as one block (no separate newPerson, singleTest; they go into training)
        -- 2: [NEWPERSON/TEST] create for newPerson and singleTest for separate testing (use with 0)
        -- 3: [NEWPERSON/TEST] create only for singleTest for separate testing (use with 4)
        -- 4: [FINAL/TRAINING] create original + newPerson for training (use with 3)
    :param samplingRate: Number of lines per second and sensor used to convert seconds into line counts
    :return: Nothing
    """
    for sport in sports:
        source_dir = '../Data/' + sport
        if switchAlgo == 2 or switchAlgo == 3:
            finalOutputFile = source_dir + '/newPersonFinal.csv'
        else:
            finalOutputFile = source_dir + '/Final.csv'

        # Collected in memory and written once at the end, so a failure while
        # processing never leaves intermediate files behind.
        accLines = []
        gyroLines = []

        for path in glob.glob(source_dir + '/*.csv'):
            if _should_skip(path, switchAlgo):
                continue

            print('\nFile: %s' % path)

            with open(path, 'r') as fileStream:
                lines = fileStream.readlines()
            if not lines:
                # Nothing to split; also avoids looping forever looking for
                # the end of the first sensor block.
                print('Empty file - skipped')
                continue

            # The first sensor block is the leading run of lines that share
            # the first field of the very first line.
            firstWord = lines[0].split(', ')[0]
            counter = 0
            for line in lines:
                if line.split(', ')[0] != firstWord:
                    break
                counter += 1
            secondCounter = len(lines) - counter

            print('1st sensor samples          = %d' % counter)
            print('2nd sensor samples          = %d' % secondCounter)

            finalCounterLines, finalSecondCounterLines = _align_trim_chunk(
                lines[:counter], lines[counter:],
                secondsToKeep, trimLength, samplingRate)

            print('Adjusted 1st sensor samples = %d' % len(finalCounterLines))
            print('Adjusted 2nd sensor samples = %d' % len(finalSecondCounterLines))

            if "gyro" in firstWord.lower():
                gyroLines.extend(finalCounterLines)
                accLines.extend(finalSecondCounterLines)
            else:
                accLines.extend(finalCounterLines)
                gyroLines.extend(finalSecondCounterLines)

        # Accelerometer block first, gyroscope block second.
        with open(finalOutputFile, 'w') as outFile:
            outFile.writelines(accLines)
            outFile.writelines(gyroLines)