Commit ccee4e5 (Initial commit, 0 parents)

27 files changed: +2126 -0 lines

LICENSE

Lines changed: 674 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 51 additions & 0 deletions
CRNN for Live Music Genre Recognition
=====================================

Convolutional-Recurrent Neural Networks for Live Music Genre Recognition is a project aimed at creating a neural network that recognizes the genre of a song and provides a user-friendly visualization of the network's current belief about that genre. The project was created for the 24-hour Braincode Hackathon in Warsaw by Piotr Kozakowski, Jakub Królak, Łukasz Margas and Bartosz Michalak.

This project uses Keras for the neural network and Tornado for serving requests.


Demo
----

You can see a demo for a few selected songs here: [Demo](http://deepsound.io/genres/).


Usage
-----

In a fresh virtualenv, install all the prerequisites:

```shell
pip install -r requirements.txt
```

Then run the server at http://0.0.0.0:8080/:

```shell
THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python server.py
```

You can then upload a song using the big (and only) button and see the results for yourself. All mp3 files should work fine.

Running server.py without additional parameters launches the server with the default model provided in the package. You can provide your own model, as long as it matches the input and output architecture of the provided one. To train a model yourself, download the [GTZAN dataset](http://opihi.cs.uvic.ca/sound/genres.tar.gz) (or provide an analogous one) to the data/ directory, extract it, run create\_data\_pickle.py to preprocess the data, and then run train\_model.py to train the model:

```shell
cd data
wget http://opihi.cs.uvic.ca/sound/genres.tar.gz
tar zxvf genres.tar.gz
cd ..
python create_data_pickle.py
THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python train_model.py
```

You can "visualize" the filters learned by the convolutional layers using extract\_filters.py. For each convolutional neuron, this script extracts a few chunks of the dataset's tracks that maximally activate that neuron and concatenates them. By default it puts the visualizations in the filters/ directory. It requires the GTZAN dataset and its pickled version in the data/ directory; run the commands above to obtain them. You can control the number of extracted chunks with the --count0 argument; extracting more chunks is slower.
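The chunk selection in extract\_filters.py is essentially a per-filter top-k over tracks: activations are max-pooled over time, then `np.argpartition` picks the tracks with the highest maxima. A minimal NumPy sketch of just that step, with hypothetical toy activations:

```python
import numpy as np

# Toy activations: 6 tracks x 4 filters, already max-pooled over time.
max_over_time = np.array([
    [0.1, 0.9, 0.3, 0.2],
    [0.8, 0.2, 0.1, 0.7],
    [0.4, 0.5, 0.9, 0.1],
    [0.9, 0.1, 0.2, 0.3],
    [0.2, 0.8, 0.4, 0.9],
    [0.3, 0.3, 0.8, 0.4],
])

count = 2  # chunks to extract per filter (the script's --count0)
# For each filter (column), indices of the `count` tracks with the
# highest activation; argpartition avoids a full sort.
top_tracks = np.argpartition(max_over_time, -count, axis=0)[-count:, :]
```

For filter 0, for example, this selects tracks 1 and 3 (activations 0.8 and 0.9), in no guaranteed order.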


Background
----------

The rationale for this particular model is based on several works, primarily [Grzegorz Gwardys and Daniel Grzywczak, Deep Image Features in Music Information Retrieval](http://ijet.pl/index.php/ijet/article/view/10.2478-eletel-2014-0042/53) and [Recommending music on Spotify with Deep Learning](http://benanne.github.io/2014/08/05/spotify-cnns.html). The whole idea is described extensively in our blog post [Convolutional-Recurrent Neural Network for Live Music Genre Recognition](http://deepsound.io/music_genre_recognition.html).

common.py

Lines changed: 39 additions & 0 deletions
# To avoid errors during importing librosa.
import matplotlib
matplotlib.use('Agg')

import numpy as np
import librosa as lbr
import keras.backend as K

GENRES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
        'pop', 'reggae', 'rock']
WINDOW_SIZE = 2048
WINDOW_STRIDE = WINDOW_SIZE // 2
N_MELS = 128
MEL_KWARGS = {
    'n_fft': WINDOW_SIZE,
    'hop_length': WINDOW_STRIDE,
    'n_mels': N_MELS
}

def get_layer_output_function(model, layer_name):
    input = model.get_layer('input').input
    output = model.get_layer(layer_name).output
    f = K.function([input, K.learning_phase()], output)
    return lambda x: f([x, 0]) # learning_phase = 0 means test

def load_track(filename, enforce_shape=None):
    new_input, sample_rate = lbr.load(filename, mono=True)
    features = lbr.feature.melspectrogram(new_input, **MEL_KWARGS).T

    if enforce_shape is not None:
        if features.shape[0] < enforce_shape[0]:
            # Pad short tracks with zero frames along the time axis.
            delta_shape = (enforce_shape[0] - features.shape[0],
                    enforce_shape[1])
            features = np.append(features, np.zeros(delta_shape), axis=0)
        elif features.shape[0] > enforce_shape[0]:
            # Truncate long tracks.
            features = features[: enforce_shape[0], :]

    # Avoid log(0) below.
    features[features == 0] = 1e-6
    return (np.log(features), float(new_input.shape[0]) / sample_rate)
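The `enforce_shape` branch of `load_track` pads short spectrograms with zero frames and truncates long ones, so every track yields the same matrix shape. The same logic in isolation, as a small sketch (the helper name `enforce_length` is ours, not the repo's):

```python
import numpy as np

def enforce_length(features, target_frames):
    """Pad with zero rows or truncate along the time axis (axis 0)."""
    n = features.shape[0]
    if n < target_frames:
        pad = np.zeros((target_frames - n, features.shape[1]))
        features = np.append(features, pad, axis=0)
    elif n > target_frames:
        features = features[:target_frames, :]
    return features

# A 3-frame spectrogram padded up to 5 frames; a 7-frame one cut down.
short = enforce_length(np.ones((3, 2)), 5)
long_ = enforce_length(np.ones((7, 2)), 5)
```

Both results have shape (5, 2); the padded one ends in zero rows, which is why `load_track` afterwards replaces exact zeros with 1e-6 before taking the log.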

create_data_pickle.py

Lines changed: 59 additions & 0 deletions
from common import load_track, GENRES
import sys
import numpy as np
from math import pi
from cPickle import dump
import os
from optparse import OptionParser

TRACK_COUNT = 1000

def get_default_shape(dataset_path):
    # Use the first track to determine the common spectrogram shape.
    tmp_features, _ = load_track(os.path.join(dataset_path,
                'blues/blues.00000.au'))
    return tmp_features.shape

def collect_data(dataset_path):
    '''
    Collects data from the GTZAN dataset into a pickle. Computes a Mel-scaled
    power spectrogram for each track.

    :param dataset_path: path to the GTZAN dataset directory
    :returns: triple (x, y, track_paths) where x is a matrix containing
        extracted features, y is a one-hot matrix of genre labels and
        track_paths is a dict of absolute track paths indexed by row indices in
        the x and y matrices
    '''
    default_shape = get_default_shape(dataset_path)
    x = np.zeros((TRACK_COUNT,) + default_shape, dtype=np.float32)
    y = np.zeros((TRACK_COUNT, len(GENRES)), dtype=np.float32)
    track_paths = {}

    for (genre_index, genre_name) in enumerate(GENRES):
        for i in xrange(TRACK_COUNT // len(GENRES)):
            file_name = '{}/{}.000{}.au'.format(genre_name,
                    genre_name, str(i).zfill(2))
            print 'Processing', file_name
            path = os.path.join(dataset_path, file_name)
            # Tracks are laid out genre-major: all blues first, then
            # classical, and so on.
            track_index = genre_index * (TRACK_COUNT // len(GENRES)) + i
            x[track_index], _ = load_track(path, default_shape)
            y[track_index, genre_index] = 1
            track_paths[track_index] = os.path.abspath(path)

    return (x, y, track_paths)

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-d', '--dataset_path', dest='dataset_path',
            default=os.path.join(os.path.dirname(__file__), 'data/genres'),
            help='path to the GTZAN dataset directory', metavar='DATASET_PATH')
    parser.add_option('-o', '--output_pkl_path', dest='output_pkl_path',
            default=os.path.join(os.path.dirname(__file__), 'data/data.pkl'),
            help='path to the output pickle', metavar='OUTPUT_PKL_PATH')
    options, args = parser.parse_args()

    (x, y, track_paths) = collect_data(options.dataset_path)

    data = {'x': x, 'y': y, 'track_paths': track_paths}
    # Pickles should be written in binary mode.
    with open(options.output_pkl_path, 'wb') as f:
        dump(data, f)
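`collect_data` lays tracks out genre-major (`track_index = genre_index * tracks_per_genre + i`) and marks labels one-hot. A toy sketch of just that layout, with 2 genres of 3 tracks each (hypothetical sizes, not the repo's 10 x 100):

```python
import numpy as np

genres = ['blues', 'classical']
tracks_per_genre = 3
track_count = len(genres) * tracks_per_genre

# One-hot label matrix: row = track, column = genre.
y = np.zeros((track_count, len(genres)), dtype=np.float32)
for genre_index, _ in enumerate(genres):
    for i in range(tracks_per_genre):
        track_index = genre_index * tracks_per_genre + i
        y[track_index, genre_index] = 1
```

Rows 0..2 are blues ([1, 0]) and rows 3..5 are classical ([0, 1]); each row sums to exactly one.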

data/README

Lines changed: 2 additions & 0 deletions
This directory should contain the GTZAN dataset in a subdirectory "genres".
Read the main README to find out how to download it.

extract_filters.py

Lines changed: 132 additions & 0 deletions
from common import get_layer_output_function, WINDOW_SIZE, WINDOW_STRIDE
from keras.models import model_from_yaml
import librosa as lbr
import numpy as np
from functools import partial
from optparse import OptionParser
import cPickle
import os

def compose(f, g):
    return lambda x: f(g(x))

def undo_layer(length, stride, (i, j)):
    # Map a range of feature indices [i, j) to the range of input indices
    # that influenced it.
    return (stride * i, stride * (j - 1) + length)

def extract_filters(model, data, filters_path, count0):
    x = data['x']
    track_paths = data['track_paths']

    conv_layer_names = []
    i = 1
    while True:
        name = 'convolution_' + str(i)
        if model.get_layer(name) is None:
            break
        conv_layer_names.append(name)
        i += 1

    # Generate undoers for every convolutional layer. An undoer is a function
    # translating a pair of coordinates in feature space (mel spectrograms or
    # features extracted by convolutional layers) to the sample space (raw
    # audio signal).
    conv_layer_undoers = []

    # Undo the mel spectrogram extraction.
    undoer = partial(undo_layer, WINDOW_SIZE, WINDOW_STRIDE)

    for name in conv_layer_names:
        layer = model.get_layer(name)
        length = layer.filter_length
        stride = layer.subsample_length

        # Undo the convolution layer.
        undoer = compose(partial(undo_layer, length, stride), undoer)
        conv_layer_undoers.append(undoer)

        # Undo the pooling layer.
        undoer = compose(partial(undo_layer, 2, 2), undoer)

    conv_layer_output_funs = \
        map(partial(get_layer_output_function, model), conv_layer_names)

    # Extract the track chunks with the highest activations for each filter
    # in each convolutional layer.
    for (layer_index, output_fun) in enumerate(conv_layer_output_funs):
        layer_path = os.path.join(filters_path, conv_layer_names[layer_index])
        if not os.path.exists(layer_path):
            os.makedirs(layer_path)

        print 'Computing outputs for layer', conv_layer_names[layer_index]
        output = output_fun(x)

        # Matrices of shape n_tracks x n_filters (time reduced away).
        max_over_time = np.amax(output, axis=1)
        argmax_over_time = np.argmax(output, axis=1)

        # Number of input chunks to extract for each filter.
        count = count0 // 2 ** layer_index
        argmax_over_track = \
            np.argpartition(max_over_time, -count, axis=0)[-count :, :]

        undoer = conv_layer_undoers[layer_index]

        for filter_index in xrange(argmax_over_track.shape[1]):
            print 'Processing layer', conv_layer_names[layer_index], \
                'filter', filter_index

            track_indices = argmax_over_track[:, filter_index]
            time_indices = argmax_over_time[track_indices, filter_index]
            # Mutable cell so the nested function can store the sample rate
            # (Python 2 has no nonlocal).
            sample_rate = [None]

            def extract_sample_from_track(undoer, (track_index, time_index)):
                track_path = track_paths[track_index]
                (track_samples, sample_rate[0]) = lbr.load(track_path,
                        mono=True)
                (t1, t2) = undoer((time_index, time_index + 1))
                return track_samples[t1 : t2]

            samples_for_filter = np.concatenate(
                map(partial(extract_sample_from_track, undoer),
                    zip(track_indices, time_indices)))

            filter_path = os.path.join(layer_path,
                    '{}.wav'.format(filter_index))
            lbr.output.write_wav(filter_path, samples_for_filter,
                    sample_rate[0])

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('-m', '--model_path', dest='model_path',
            default=os.path.join(os.path.dirname(__file__),
                'models/model.yaml'),
            help='path to the model YAML file', metavar='MODEL_PATH')
    parser.add_option('-w', '--weights_path', dest='weights_path',
            default=os.path.join(os.path.dirname(__file__),
                'models/weights.h5'),
            help='path to the model weights hdf5 file',
            metavar='WEIGHTS_PATH')
    parser.add_option('-d', '--data_path', dest='data_path',
            default=os.path.join(os.path.dirname(__file__),
                'data/data.pkl'),
            help='path to the data pickle',
            metavar='DATA_PATH')
    parser.add_option('-f', '--filters_path', dest='filters_path',
            default=os.path.join(os.path.dirname(__file__),
                'filters'),
            help='path to the output filters directory',
            metavar='FILTERS_PATH')
    parser.add_option('-c', '--count0', dest='count0',
            default='4',
            help=('number of chunks to extract from the first convolutional ' +
                'layer, this number is halved for each next layer'),
            metavar='COUNT0')
    options, args = parser.parse_args()

    with open(options.model_path, 'r') as f:
        model = model_from_yaml(f.read())
    model.load_weights(options.weights_path)

    with open(options.data_path, 'rb') as f:
        data = cPickle.load(f)

    extract_filters(model, data, options.filters_path, int(options.count0))
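Each undoer maps an index range [i, j) in a layer's output back to the range of input indices that influenced it, via (i, j) -> (stride * i, stride * (j - 1) + length), and the per-layer mappings compose outward to raw samples. A Python 3 sketch of that arithmetic with toy layer sizes (the repo's own `undo_layer` uses Python 2 tuple-parameter unpacking; the composition order shown here undoes the convolution first, then the spectrogram window):

```python
from functools import partial

def undo_layer(length, stride, coords):
    # Map feature-index range [i, j) back to the input range it covers.
    i, j = coords
    return (stride * i, stride * (j - 1) + length)

def compose(f, g):
    return lambda x: f(g(x))

# Toy pipeline: a conv layer with filter length 4, stride 2, sitting on
# spectrogram frames of window 2048, hop 1024.
conv_undo = partial(undo_layer, 4, 2)
mel_undo = partial(undo_layer, 2048, 1024)
to_samples = compose(mel_undo, conv_undo)

# The first conv output value covers spectrogram frames [0, 4),
# i.e. raw samples [0, 5120).
span = to_samples((0, 1))
```

So a single conv-layer activation here corresponds to 5120 raw audio samples, which is exactly the chunk the script cuts out and writes to a wav file.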

genre_recognizer.py

Lines changed: 20 additions & 0 deletions
from common import load_track, get_layer_output_function
import numpy as np
from keras.layers import Input
from keras.models import model_from_yaml, Model
from keras import backend as K

class GenreRecognizer():

    def __init__(self, model_path, weights_path):
        with open(model_path, 'r') as f:
            model = model_from_yaml(f.read())
        model.load_weights(weights_path)
        self.pred_fun = get_layer_output_function(model, 'output_realtime')
        print 'Loaded model.'

    def recognize(self, track_path):
        print 'Loading song', track_path
        (features, duration) = load_track(track_path)
        # Add a batch dimension of size 1.
        features = np.reshape(features, (1,) + features.shape)
        return (self.pred_fun(features), duration)
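`recognize` returns per-frame genre probabilities rather than a single label, which is what drives the "current belief" visualization. One simple way to collapse them into a final prediction, assuming an output of shape (1, time, n_genres), is to average over time; a sketch with made-up numbers and a truncated genre list:

```python
import numpy as np

GENRES = ['blues', 'classical', 'country']  # toy subset of the real list

# Hypothetical network output: 1 track, 4 time steps, 3 genres.
frame_probs = np.array([[
    [0.2, 0.7, 0.1],
    [0.1, 0.8, 0.1],
    [0.3, 0.5, 0.2],
    [0.2, 0.6, 0.2],
]])

mean_probs = frame_probs.mean(axis=1)[0]        # average belief over time
predicted = GENRES[int(np.argmax(mean_probs))]  # highest-average genre
```

With these numbers the averaged belief peaks at 'classical' (0.65); the live visualization instead shows the evolving per-frame beliefs directly.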

0 commit comments