
Autoencoder for sound data in Keras


I have a 2d array of log-scaled mel-spectrograms of sound samples for 5 different categories.

For training I have used a convolutional and dense neural network in Keras. Here is the code:

from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, BatchNormalization, Dropout
from keras.optimizers import Adam
from keras import initializers

model = Sequential()
model.add(Conv1D(80, 8, activation='relu', padding='same', input_shape=(60, 108)))
model.add(MaxPooling1D(2, padding='same', strides=None))
model.add(Flatten())
initializer = initializers.TruncatedNormal()
model.add(Dense(200, activation='relu', kernel_initializer=initializer, bias_initializer=initializer))
model.add(BatchNormalization())
model.add(Dropout(0.8))
model.add(Dense(50, activation='relu', kernel_initializer=initializer, bias_initializer=initializer))
model.add(Dropout(0.8))
model.add(Dense(5, activation='softmax', kernel_initializer=initializer, bias_initializer=initializer))
# lr is not a valid compile() argument; pass the learning rate via the optimizer
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.01),
              metrics=['accuracy'])

What kind of autoencoder can I apply to this type of input data? Which model? Any suggestions or code examples would be helpful. :)


Solution

  • Since I don’t have answers to my questions about the nature of the data, I will assume that we have a set of 2-dimensional data with a shape like (NSamples, 60, 108). I also assume that the answer to my suggestion to use Conv2D instead of Conv1D is yes.

    Here is a sample of the models: a convolutional autoencoder, a model that can reuse the trained autoencoder, and how to transfer the weights from the autoencoder into the final model:

    from keras.layers.core import Dense, Dropout, Flatten, Reshape
    from keras.layers import Conv2D, Deconv2D, MaxPooling2D, UpSampling2D, BatchNormalization
    from keras.callbacks import ModelCheckpoint
    from keras.optimizers import Adam
    import keras.models as models
    import keras.initializers as initializers
    from sklearn.model_selection import train_test_split
    
    ae = models.Sequential()
    #model.add(Conv1D(80, 8, activation='relu', padding='same',input_shape=(60,108)))
    #encoder
    c = Conv2D(80, 3, activation='relu', padding='same',input_shape=(60, 108, 1))
    ae.add(c)
    ae.add(MaxPooling2D(pool_size=(2, 2), padding='same', strides=None))
    ae.add(Flatten())
    initializer=initializers.TruncatedNormal()
    d1 = Dense(200, activation='relu', kernel_initializer=initializer,bias_initializer=initializer)
    ae.add(d1)
    ae.add(BatchNormalization())
    ae.add(Dropout(0.8))
    d2 = Dense(50, activation='relu', kernel_initializer=initializer,bias_initializer=initializer)
    ae.add(d2)
    ae.add(Dropout(0.8))
    #decoder: mirror the encoder, using the recorded input shapes of d2 and d1
    ae.add(Dense(d2.input_shape[1], activation='sigmoid'))
    ae.add(Dense(d1.input_shape[1], activation='sigmoid'))
    ae.add(Reshape((30, 54, 80)))
    ae.add(UpSampling2D((2,2)))
    ae.add(Deconv2D(filters=c.filters, kernel_size=c.kernel_size, strides=c.strides, activation=c.activation, padding=c.padding))
    ae.add(Deconv2D(filters=1, kernel_size=c.kernel_size, strides=c.strides, activation=c.activation, padding=c.padding))
    # lr is not a valid compile() argument; pass the learning rate via the optimizer
    ae.compile(loss='binary_crossentropy',
               optimizer=Adam(lr=0.001),
               metrics=['accuracy'])
    ae.summary()
    #Now train your convolutional autoencoder to reconstruct your input data.
    #Reshape your data to (NSamples, 60, 108, 1); training can then look like this:
    #X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=43)
    #pre_mcp = ModelCheckpoint("CAE.hdf5", monitor='val_acc', verbose=2, save_best_only=True, mode='max')
    #pre_history = ae.fit(X_train, X_train, epochs=100, validation_data=(X_val, X_val), batch_size=22, verbose=2, callbacks=[pre_mcp])
    
    #model
    model = models.Sequential()
    #model.add(Conv1D(80, 8, activation='relu', padding='same',input_shape=(60,108)))
    model.add(Conv2D(80, 3, activation='relu', padding='same',input_shape=(60, 108, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2), padding='same',strides=None))
    model.add(Flatten())
    initializer=initializers.TruncatedNormal()
    model.add(Dense(200, activation='relu', kernel_initializer=initializer,bias_initializer=initializer))
    model.add(BatchNormalization())
    model.add(Dropout(0.8))
    model.add(Dense(50, activation='relu', kernel_initializer=initializer,bias_initializer=initializer))
    model.add(Dropout(0.8))
    model.add(Dense(5, activation='softmax', kernel_initializer=initializer,bias_initializer=initializer))
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    #Set weights of the encoder layers from the trained autoencoder:
    #layer 0 = Conv2D, 3 = Dense(200), 4 = BatchNormalization, 6 = Dense(50)
    model.layers[0].set_weights(ae.layers[0].get_weights())
    model.layers[3].set_weights(ae.layers[3].get_weights())
    model.layers[4].set_weights(ae.layers[4].get_weights())
    model.layers[6].set_weights(ae.layers[6].get_weights())
    model.summary()
    #Now you can train your model with pre-trained weights from autoencoder
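
    With the encoder weights copied over, fine-tuning is an ordinary fit() on the labelled data. A minimal sketch, assuming X_train/X_val/y_train/y_val come from the same train_test_split call as above (the checkpoint filename is illustrative):

    mcp = ModelCheckpoint("CNN.hdf5", monitor='val_acc', verbose=2, save_best_only=True, mode='max')
    history = model.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val),
                        batch_size=22, verbose=2, callbacks=[mcp])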
    

    A model like this was useful for me with the MNIST dataset: initializing the model with weights from the autoencoder improved its accuracy in comparison with a model initialized with random weights.

    However, I would recommend using several convolutional/deconvolutional layers, probably 3 or more, since in my experience convolutional autoencoders with 3 or more convolutional layers are more efficient than those with a single convolutional layer. In fact, with one convolutional layer I sometimes can’t see any accuracy improvement at all. A deeper variant is sketched below.
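
    A minimal sketch of such a deeper encoder/decoder pair, assuming the same (60, 108, 1) input scaled to [0, 1]; the depths and filter counts are illustrative, not the exact architecture I used:

    from keras.models import Sequential
    from keras.layers import Conv2D, Deconv2D, MaxPooling2D, UpSampling2D

    deep_ae = Sequential()
    #encoder: three convolutional layers, downsampling twice
    deep_ae.add(Conv2D(32, 3, activation='relu', padding='same', input_shape=(60, 108, 1)))
    deep_ae.add(MaxPooling2D((2, 2), padding='same'))    #-> (30, 54, 32)
    deep_ae.add(Conv2D(64, 3, activation='relu', padding='same'))
    deep_ae.add(MaxPooling2D((2, 2), padding='same'))    #-> (15, 27, 64)
    deep_ae.add(Conv2D(128, 3, activation='relu', padding='same'))
    #decoder: mirror of the encoder, upsampling back to the input size
    deep_ae.add(Deconv2D(64, 3, activation='relu', padding='same'))
    deep_ae.add(UpSampling2D((2, 2)))                    #-> (30, 54, 64)
    deep_ae.add(Deconv2D(32, 3, activation='relu', padding='same'))
    deep_ae.add(UpSampling2D((2, 2)))                    #-> (60, 108, 32)
    deep_ae.add(Deconv2D(1, 3, activation='sigmoid', padding='same'))  #reconstruction
    deep_ae.compile(loss='binary_crossentropy', optimizer='adam')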

    Update:

    I checked the autoencoder with the data provided by Emanuela, and I also tried it with several different autoencoder architectures, without any success.

    My hypothesis is that the data doesn’t contain any significant features that can be distinguished by an autoencoder, or even a CAE.

    However, it looks like my assumption about the 2-dimensional nature of the data was confirmed, since the model reached almost 99.99% validation accuracy: [screenshot of the training accuracy plot omitted]

    Nevertheless, at the same time, the 97.31% accuracy on the training data can indicate potential issues with the dataset, so it looks like a good idea to revise it; a quick sanity check is sketched below.
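
    For example, checking for class imbalance and duplicated samples could be a starting point. A hypothetical sketch; dataX/dataY are the raw lists loaded in the code further below:

    import numpy as np
    labels = np.argmax(np.asarray(dataY, dtype=np.float32), axis=1)
    print('samples per class:', np.bincount(labels))     #reveals class imbalance
    flat = np.asarray(dataX, dtype=np.float32).reshape(len(dataX), -1)
    print('duplicated samples:', len(flat) - len(np.unique(flat, axis=0)))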

    In addition, I would suggest using ensembles of networks. You could train, for example, 10 networks with different validation splits and assign a category to each item by majority vote, as in the sketch below.
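
    A minimal sketch of that idea; build_model() is a hypothetical factory that recreates the network from the code below, and x_test holds the samples to be labelled:

    import numpy as np
    from sklearn.model_selection import train_test_split

    votes = []
    for seed in range(10):
        #a different split (and weight initialization) for every ensemble member
        X_tr, X_va, y_tr, y_va = train_test_split(x, y, test_size=0.2, random_state=seed)
        m = build_model()
        m.fit(X_tr, y_tr, epochs=100, validation_data=(X_va, y_va), batch_size=64, verbose=0)
        votes.append(np.argmax(m.predict(x_test), axis=1))
    votes = np.stack(votes)                              #(10, NTest): one row per model
    majority = np.array([np.bincount(v).argmax() for v in votes.T])  #most voted class per item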

    Here is my code:

    from keras.layers.core import Dense, Dropout, Flatten
    from keras.layers import Conv2D, BatchNormalization
    from keras.callbacks import ModelCheckpoint
    from keras.optimizers import Adam
    from sklearn.model_selection import train_test_split
    import keras.models as models
    import keras.initializers as initializers
    import msgpack
    import numpy as np
    
    with open('SoundDataX.msg', "rb") as fx, open('SoundDataY.msg', "rb") as fy:
        dataX = msgpack.load(fx)
        dataY = msgpack.load(fy)
    
    num_samples = len(dataX)
    x = np.empty((num_samples, 60, 108, 1), dtype = np.float32)
    y = np.empty((num_samples, 4), dtype = np.float32)
    
    for i in range(0, num_samples):
        x[i] = np.asanyarray(dataX[i]).reshape(60, 108, 1)
        y[i] = np.asanyarray(dataY[i])
    
    X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=43)
    
    #model
    model = models.Sequential()
    model.add(Conv2D(128, 3, activation='relu', padding='same', input_shape=(60, 108, 1)))
    model.add(Conv2D(128, 5, activation='relu', padding='same'))
    model.add(Conv2D(128, 7, activation='relu', padding='same'))
    model.add(Flatten())
    initializer=initializers.TruncatedNormal()
    model.add(Dense(200, activation='relu', kernel_initializer=initializer,bias_initializer=initializer))
    model.add(BatchNormalization())
    model.add(Dropout(0.8))
    model.add(Dense(50, activation='relu', kernel_initializer=initializer,bias_initializer=initializer))
    model.add(Dropout(0.8))
    model.add(Dense(4, activation='softmax', kernel_initializer=initializer,bias_initializer=initializer))
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001),
                  metrics=['accuracy'])
    model.summary()
    filepath="weights-{epoch:02d}-{val_acc:.7f}-{acc:.7f}.hdf5"
    mcp = ModelCheckpoint(filepath, monitor='val_acc', verbose=2, save_best_only=True, mode='max')
    history = model.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), batch_size=64, verbose=2, callbacks=[mcp])