-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.hpp
More file actions
408 lines (352 loc) · 13 KB
/
data.hpp
File metadata and controls
408 lines (352 loc) · 13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
/**
* @file data.hpp
* @brief Contains formats for example data
*
*/
#ifndef __DATA_H
#define __DATA_H
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdexcept>
#include <utility>
#include "mnist.hpp"
/**
 * \brief Rearrange an array in place so that f(item) mod cycle runs 0,1,..,cycle-1,0,1,..
 *
 * Walks the array from the front. Whenever the item at index i does not satisfy
 * f(arr[i]) % cycle == i % cycle, the first later item which does is swapped into
 * place. For example, with a cycle length of four the values of f mod 4 are made
 * to run 0,1,2,3,0,1,2,3.
 *
 * The pass stops at the first position for which no suitable later item exists,
 * so if the residues are not equally represented the tail of the array is left
 * without the cycling property.
 *
 * \param arr    array to rearrange (modified in place)
 * \param nitems number of items in arr; nothing happens if this is <= 0
 * \param cycle  length of the cycle; nothing happens if this is <= 0
 *               (a zero cycle would otherwise be a modulo by zero)
 * \param f      callable taking a T and returning an integer. A negative return
 *               value gives a negative remainder, which never matches a slot.
 */
template <class T,class TestFunc> void alternate(T *arr,int nitems,int cycle,TestFunc f){
    // a non-positive cycle length has no meaningful residues, and cycle==0
    // would divide by zero below.
    if(cycle<=0)
        return;

    for(int i=0;i<nitems;i++){
        const int wanted = i%cycle; // residue required at this slot
        if(f(arr[i])%cycle==wanted)
            continue; // already in the right place

        // scan forward for the first item with the required residue; we already
        // know item i itself does not match, so start just after it.
        int j=i+1;
        while(j<nitems && f(arr[j])%cycle!=wanted)
            j++;
        if(j>=nitems)
            return; // can't find a match: give up, leaving the tail as it is

        // swap the match into place
        T displaced=arr[i];
        arr[i]=arr[j];
        arr[j]=displaced;
    }
}
/**
 * \brief
 * A set of example data. Each datum consists of
 * hormone (i.e. modulator value), inputs and outputs.
 * The data is stored as a single double array, with each example made up
 * of inputs, followed by outputs, followed by modulator value (h).
 *
 * Examples are accessed through an array of pointers into that block; shuffling
 * only permutes the pointers and never moves the example data itself.
 *
 * Ownership: a set built with the (n,nin,nout,levels) or MNIST constructors owns
 * its data block. A set built with the subset constructor borrows the parent's
 * block, so the parent must outlive it. Copying an owning set makes a deep copy
 * of the data; copying a subset makes another subset borrowing the same block.
 */
class ExampleSet {
    double **examples; //!< pointers to each example, stored as inputs, then outputs, then h.
    double *data; //!< pointer to block of doubles containing all example data
    int ninputs; //!< number of inputs
    int noutputs; //!< number of outputs
    int ct; //!< number of examples
    uint32_t outputOffset; //!< offset of outputs in example data
    uint32_t hOffset; //!< offset of h in example data

    /**
     * \brief Does this set own its data?
     * A little bit of a hack. An example set can be constructed as part of another
     * set, in which case it shouldn't delete its memory. This is used in constructing
     * cross-validation sets. If an example set is created in such a way, this
     * should be false. The pointer array is always owned, whatever this says.
     */
    bool ownsData;

    /**
     * \brief If there are discrete modulator levels, this is how many there
     * are - if not, it should be 1.
     */
    int numHLevels;

    /**
     * \brief minimum H level, 0 by default, set with setHRange()
     */
    double minH;

    /**
     * \brief maximum H level, 1 by default, set with setHRange()
     */
    double maxH;

    /**
     * \brief Exchange the entire state of two sets (used by assignment).
     */
    void swap(ExampleSet &other) noexcept {
        std::swap(examples,other.examples);
        std::swap(data,other.data);
        std::swap(ninputs,other.ninputs);
        std::swap(noutputs,other.noutputs);
        std::swap(ct,other.ct);
        std::swap(outputOffset,other.outputOffset);
        std::swap(hOffset,other.hOffset);
        std::swap(ownsData,other.ownsData);
        std::swap(numHLevels,other.numHLevels);
        std::swap(minH,other.minH);
        std::swap(maxH,other.maxH);
    }

public:
    /**
     * \brief
     * Constructor - creates but doesn't fill in the data
     * \param n number of examples
     * \param nin number of inputs to each example
     * \param nout number of outputs from each example
     * \param levels number of modulator levels (see numHLevels)
     */
    ExampleSet(int n,int nin,int nout,int levels){
        ninputs=nin;
        noutputs=nout;
        ct=n;
        numHLevels = levels;
        minH=0;
        maxH=1;
        // size of a single example: number of inputs plus number of outputs
        // plus one for the modulator. Held as size_t so the total block size
        // below is not computed in 32 bits.
        const size_t exampleSize = static_cast<size_t>(ninputs+noutputs+1);

        // calculate the offsets
        outputOffset = ninputs;
        hOffset = ninputs+noutputs;

        data = new double[exampleSize*static_cast<size_t>(ct)]; // allocate data
        examples = new double*[ct]; // allocate example pointers
        for(int i=0;i<ct;i++){
            // work out and store the example pointer
            examples[i] = data+static_cast<size_t>(i)*exampleSize;
        }
        ownsData = true;
    }

    /**
     * \brief Constructor for making a subset of another set.
     * This uses the actual data in the parent, but creates a fresh
     * set of offset structures which can be independently shuffled.
     * The subset takes the parent's example pointers in their current
     * (possibly shuffled) order. The parent must outlive the subset.
     * \param parent the set which holds our data.
     * \param start the start index of the data in the parent.
     * \param length the length of the subset.
     * \throws std::out_of_range if start<0, length<1 or the range runs off
     * the end of the parent.
     */
    ExampleSet(const ExampleSet &parent,int start,int length){
        if(length > parent.ct - start || start<0 || length<1)
            throw std::out_of_range("subset out of range");
        ownsData = false;
        ninputs = parent.ninputs;
        noutputs = parent.noutputs;
        outputOffset = ninputs;
        hOffset = ninputs+noutputs;
        data = parent.data;
        examples = new double*[length];
        ct = length;
        numHLevels = parent.numHLevels;
        minH = parent.minH;
        maxH = parent.maxH;
        for(int i=0;i<ct;i++){
            examples[i] = parent.examples[start+i];
        }
    }

    /**
     * \brief Special constructor for generating a data set
     * from an MNIST database with a single labelling (i.e.
     * for use in non-modulatory training). We copy the data
     * from the MNIST object. The outputs will use a one-hot encoding.
     * This example set will have no modulation.
     */
    ExampleSet(const MNIST& mnist) : ExampleSet(
        mnist.getCount(), // number of examples
        mnist.r()*mnist.c(), // input count
        mnist.getMaxLabel()+1, // output count
        1 // single modulation level
    ){
        // fill in the data (the delegated constructor has already allocated
        // it and marked it as owned)
        for(int ex=0;ex<ct;ex++){
            // convert each pixel into a 0-1 double and store
            const uint8_t *imgpix = mnist.getImg(ex);
            double *inpix = getInputs(ex);
            for(int p=0;p<ninputs;p++){
                inpix[p] = imgpix[p]/255.0;
            }
            // fill in the one-hot encoded output
            const auto label = mnist.getLabel(ex);
            double *out = getOutputs(ex);
            for(int outIdx=0;outIdx<noutputs;outIdx++){
                out[outIdx] = label==outIdx?1:0;
            }
            setH(ex,0); // set nominal modulator value
        }
    }

    /**
     * \brief Copy constructor.
     * An owning set is deep-copied (new data block, pointers rebased so the
     * copy keeps the source's current example order). A subset is copied as
     * another subset borrowing the same data block.
     */
    ExampleSet(const ExampleSet &other){
        ninputs = other.ninputs;
        noutputs = other.noutputs;
        ct = other.ct;
        outputOffset = other.outputOffset;
        hOffset = other.hOffset;
        ownsData = other.ownsData;
        numHLevels = other.numHLevels;
        minH = other.minH;
        maxH = other.maxH;
        examples = new double*[ct];
        if(ownsData){
            // h is the last element of each example, so the size is hOffset+1
            const size_t exampleSize = static_cast<size_t>(hOffset)+1;
            const size_t total = exampleSize*static_cast<size_t>(ct);
            data = new double[total];
            if(total>0)
                memcpy(data,other.data,total*sizeof(double));
            for(int i=0;i<ct;i++){
                examples[i] = data+(other.examples[i]-other.data);
            }
        } else {
            data = other.data;
            for(int i=0;i<ct;i++){
                examples[i] = other.examples[i];
            }
        }
    }

    /**
     * \brief Move constructor - takes over the source's arrays, leaving the
     * source as an empty set which owns nothing.
     */
    ExampleSet(ExampleSet &&other) noexcept :
        examples(other.examples),
        data(other.data),
        ninputs(other.ninputs),
        noutputs(other.noutputs),
        ct(other.ct),
        outputOffset(other.outputOffset),
        hOffset(other.hOffset),
        ownsData(other.ownsData),
        numHLevels(other.numHLevels),
        minH(other.minH),
        maxH(other.maxH)
    {
        other.examples = nullptr;
        other.data = nullptr;
        other.ct = 0;
        other.ownsData = false;
    }

    /**
     * \brief Assignment (copy or move, via copy-and-swap). The previous
     * contents of this set are released when the temporary is destroyed.
     */
    ExampleSet &operator=(ExampleSet other) noexcept {
        swap(other);
        return *this;
    }

    /**
     * \brief
     * Destructor - deletes the pointer array, and the data if we own it
     */
    ~ExampleSet(){
        delete [] examples; // every set allocates its own pointer array
        if(ownsData){ // only delete the data if we aren't a subset
            delete [] data;
        }
    }

public:
    /**
     * \brief Shuffling mode for shuffle()
     */
    enum ShuffleMode {
        /**
         * \brief Shuffle blocks of numHLevels examples, rather than single examples.
         * This is intended for cases where examples with the same inputs are added contiguously
         * at different modulator levels.
         * For this to work correctly, the modulator levels must be distributed evenly
         * across their range. For example, for four modulator levels from 2-3:
         *
         * * ensure that numHLevels is 4
         * * ensure that the values 2, 2.333, 2.667 and 3 are equally represented in the data.
         * * ensure that the data is provided in equally sized groups cycling through the
         * modulator (similar to the output of the ALTERNATE mode)
         *
         * It is possible to run a shuffle(rd,ALTERNATE) on the data after input, followed
         * by training with this mode.
         */
        STRIDE,
        /**
         * \brief Shuffle single examples, but follow up by running a pass over the examples
         * to ensure that they alternate by modulator level. This is useful where there are
         * discrete modulator levels but the examples are mixed
         * up (as happens in the robot experiments). This doesn't require equal distribution
         * of modulator levels, but the levels should be evenly spaced across the range.
         * If the distribution is unequal, a portion at the end of the set will not alternate
         * correctly.
         */
        ALTERNATE,
        /**
         * \brief Shuffle single examples, no matter the value of numHLevels.
         */
        SINGLE,
        /**
         * \brief Don't shuffle examples at all
         */
        NONE
    };

    /**
     * \brief
     * Shuffle the examples using a PRNG and a Fisher-Yates shuffle. Only the
     * example pointers are permuted; the data block is untouched.
     * \param rd pointer to a PRNG data block
     * \param mode ShuffleMode::STRIDE to keep blocks of size numHLevels together, ShuffleMode::ALTERNATE to
     * shuffle all examples but ensure that h-levels alternate after shuffling, ShuffleMode::SINGLE to just
     * shuffle single examples, or ShuffleMode::NONE to do nothing.
     * \param nExamples how many examples (from the start of the set) to shuffle; if 0, or more
     * than the set holds, do all of them
     */
    void shuffle(drand48_data *rd,ShuffleMode mode,int nExamples=0){
        if(mode == NONE) // this means we don't shuffle
            return;
        // 0 means "everything"; also clamp so we never touch pointers past the end
        if(nExamples==0 || nExamples>ct)
            nExamples=ct;

        // size of the blocks we are shuffling, in examples. A non-positive
        // level count is treated as a single level to avoid dividing by zero.
        int blockSize = (mode == STRIDE) ? numHLevels : 1;
        if(blockSize<1)
            blockSize = 1;

        for(int i=(nExamples/blockSize)-1;i>=1;i--){
            long lr;
            lrand48_r(rd,&lr);
            const int j = static_cast<int>(lr%(i+1));
            if(j==i)
                continue; // block stays where it is
            // swap block i with block j, one pointer at a time
            double **blockI = examples+i*blockSize;
            double **blockJ = examples+j*blockSize;
            for(int k=0;k<blockSize;k++){
                std::swap(blockI[k],blockJ[k]);
            }
        }

        // if this mode is set, rearrange the shuffled data so that the h-levels cycle.
        // With a single level there is nothing to alternate, and an empty h range
        // would make the level calculation divide by zero.
        if(mode == ALTERNATE && numHLevels>1 && maxH!=minH){
            alternate<double*>(examples, nExamples, numHLevels,
                // map the example's h value onto a level index 0..numHLevels-1
                [this](double *e){
                    const double d = (e[hOffset]-minH)/(maxH-minH);
                    const double scaled = d*(numHLevels-1);
                    // round to nearest rather than truncating: floating point
                    // error can leave an exact level just below its integer
                    // (e.g. 2.999..), which truncation would put in the level below.
                    return static_cast<int>(scaled<0 ? scaled-0.5 : scaled+0.5);
                });
        }
    }

    /**
     * Modify the min/max h range, which is 0<=h<=1 by default.
     * \param mn minimum H value in set domain
     * \param mx maximum H value in set domain
     * \return this set, so that calls can be chained
     */
    ExampleSet& setHRange(double mn,double mx){
        minH = mn;
        maxH = mx;
        return *this;
    }

    /**
     * \brief get the number of inputs in all examples
     * \return number of inputs into each example
     */
    int getInputCount() const {
        return ninputs;
    }

    /**
     * \brief get the number of outputs in all examples
     * \return number of outputs from each example
     */
    int getOutputCount() const {
        return noutputs;
    }

    /**
     * \brief get the number of examples
     * \return number of examples
     */
    int getCount() const {
        return ct;
    }

    /**
     * \brief
     * Get a pointer to the inputs for a given example, for reading or writing
     * \param example index of the example
     */
    double *getInputs(int example) {
        assert(example>=0 && example<ct);
        return examples[example]; // inputs are first in each block
    }

    /**
     * \brief
     * Get a pointer to the inputs for a given example, for reading only
     * \param example index of the example
     */
    const double *getInputs(int example) const {
        assert(example>=0 && example<ct);
        return examples[example]; // inputs are first in each block
    }

    /**
     * \brief
     * Get a pointer to the outputs for a given example, for reading or writing
     * \param example index of the example
     */
    double *getOutputs(int example) {
        assert(example>=0 && example<ct);
        return examples[example] + outputOffset;
    }

    /**
     * \brief
     * Get a pointer to the outputs for a given example, for reading only
     * \param example index of the example
     */
    const double *getOutputs(int example) const {
        assert(example>=0 && example<ct);
        return examples[example] + outputOffset;
    }

    /**
     * \brief
     * Get the h (modulator) for a given example
     * \param example index of the example
     */
    double getH(int example) const {
        assert(example>=0 && example<ct);
        return *(examples[example] + hOffset);
    }

    /**
     * \brief return the number of different H-levels
     */
    int getNumHLevels() const {
        return numHLevels;
    }

    /**
     * \brief
     * Set the h (modulator) for a given example
     * \param example index of the example
     * \param h modulator to use
     */
    void setH(int example, double h){
        assert(example>=0 && example<ct);
        *(examples[example] + hOffset) = h;
    }

    /**
     * \brief dump to stdout, one example per line: inputs, modulator, outputs
     * \param start index to start dump (negative values are treated as 0)
     * \param end index to end dump (exclusive); if negative or past the end,
     * dump to the end of the set
     */
    void dump(int start=0,int end=-1) const {
        if(end<0 || end>ct)end=ct;
        if(start<0)start=0;
        for(int i=start;i<end;i++){
            const double *ins = getInputs(i);
            const double *outs = getOutputs(i);
            for(int j=0;j<ninputs;j++){
                printf("%f ",ins[j]);
            }
            printf(" modulator %f --> ",getH(i));
            for(int j=0;j<noutputs;j++){
                printf("%f ",outs[j]);
            }
            printf("\n");
        }
    }
};
#endif /* __DATA_H */