utils.py
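"""Prepare tokenized training data.

Three steps over the pickled dataset in ./dataset:
  1. build the vocabulary of unique tokens and save it to ./dataset/tokens;
  2. map every token to its vocabulary index and cut the stream into
     fixed-length sequences of 100, saved to ./dataset/prep_data;
  3. split the sequences 80/20 into ./dataset/train and ./dataset/valid.
"""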
import pickle
from sklearn.model_selection import train_test_split
## make tokens file
# load the preprocessed dataset (a list of songs, each a list of tokens)
with open('./dataset/dataset2', 'rb') as fp:
    data = pickle.load(fp)

# collect the unique tokens in order of first appearance, so a token's
# position in this list can serve as its vocabulary index
tokens = []
for song in data:
    for token in song:
        # # count the number of times tokens appear
        # idx = tokens_ref.index(token)
        # cnt_tokens[idx] += 1
        if token not in tokens:
            tokens.append(token)
print('tokens:', tokens)
print('len(tokens):', len(tokens))

# save the vocabulary
with open('./dataset/tokens', 'wb') as fp:
    pickle.dump(tokens, fp, protocol=2)
print('token file saved!')
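
# The commented-out lines in the loop above suggest token frequencies were
# also of interest; a minimal sketch of that count using collections.Counter
# (an assumption -- the original block was left disabled):
from collections import Counter
cnt_tokens = Counter(token for song in data for token in song)
print('10 most common tokens:', cnt_tokens.most_common(10))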
## tokenize
# reload the preprocessed dataset
with open('./dataset/dataset2', 'rb') as fp:
    data = pickle.load(fp)
# load the list of unique tokens
with open('./dataset/tokens', 'rb') as fp:
    tokens_ref = pickle.load(fp)

# map each token to its vocabulary index once, rather than calling
# tokens_ref.index() (a linear scan) for every token
token_to_idx = {token: i for i, token in enumerate(tokens_ref)}

# cut the full token stream into non-overlapping sequences of 100 indices;
# a trailing partial sequence is discarded
batch = []
sequence = []
for song in data:
    for token in song:
        sequence.append(token_to_idx[token])
        if len(sequence) == 100:
            batch.append(sequence)
            sequence = []

# save file
with open('./dataset/prep_data', 'wb') as fp:
    pickle.dump(batch, fp, protocol=2)
print('data prepared')
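
# optional sanity check: every saved sequence should hold exactly 100
# vocabulary indices
assert all(len(seq) == 100 for seq in batch), 'unexpected sequence length'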
## split train and validation data
# load the prepared sequences
with open('./dataset/prep_data', 'rb') as fp:
    data = pickle.load(fp)

# hold out 20% of the sequences for validation
train, valid = train_test_split(data, test_size=0.2)

# save the train dataset and validation dataset
with open('./dataset/train', 'wb') as fp:
    pickle.dump(train, fp, protocol=2)
with open('./dataset/valid', 'wb') as fp:
    pickle.dump(valid, fp, protocol=2)
print('train and valid datasets saved')
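
# illustrative round-trip: reload the saved splits the way a downstream
# training script might, to confirm they deserialize
with open('./dataset/train', 'rb') as fp:
    train_check = pickle.load(fp)
with open('./dataset/valid', 'rb') as fp:
    valid_check = pickle.load(fp)
print('reloaded %d train / %d valid sequences' % (len(train_check), len(valid_check)))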