I tried to train an autoencoder to learn feature representations from segments of audio MFCC features (my dataset is TIMIT). I used a fixed-size window to cut the 13-dimensional MFCC data into segments, so my data has shape (None, window_size*13); here my window size is 20 frames. I trained a feed-forward autoencoder using a least-squares loss (the norm of the difference between prediction and target). The problem is that my loss stops decreasing at a high level.
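To make the input shape concrete, here is a minimal sketch of the windowing step (illustrative names, not my exact preprocessing code):
import numpy as np

def window_mfcc(mfcc, window_size=20):
    # mfcc: array of shape (n_frames, 13) for one utterance
    n_windows = mfcc.shape[0] // window_size
    # cut into non-overlapping windows of window_size frames and flatten
    # each window into a vector of window_size * 13 = 260 values
    return mfcc[:n_windows * window_size].reshape(n_windows, window_size * 13)
My network part code is here: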
self.centers_placeholder = tf.placeholder(tf.float32,[None,self.train_centers[0].shape[1]],name = 'center_placeholder')
layer = tf.layers.dense(self.centers_placeholder,2000,activation=tf.nn.relu)
layer = tf.layers.batch_normalization(layer)
layer = tf.layers.dense(layer,1500,activation=tf.nn.leaky_relu)
layer = tf.layers.batch_normalization(layer)
layer = tf.layers.dense(layer,500,activation=tf.nn.leaky_relu)
layer = tf.layers.batch_normalization(layer)
layer = tf.layers.dense(layer,100,activation=tf.nn.leaky_relu)
layer = tf.layers.batch_normalization(layer)
self.embedding= tf.layers.dense(layer,50,activation = tf.nn.leaky_relu)
layer = tf.layers.dense(layer,100,activation=tf.nn.leaky_relu)
layer = tf.layers.batch_normalization(layer)
layer = tf.layers.dense(layer,500,activation=tf.nn.leaky_relu)
layer = tf.layers.batch_normalization(layer)
layer = tf.layers.dense(layer,1500,activation=tf.nn.leaky_relu)
layer = tf.layers.batch_normalization(layer)
layer = tf.layers.dense(layer,2000,activation=tf.nn.leaky_relu)
layer = tf.layers.batch_normalization(layer)
self.decoded = tf.layers.dense(layer,self.train_centers[0].shape[1],activation=None)
#cost = tf.reduce_mean(tf.sqrt(tf.square(neighbors_placeholder-decoder4)))
self.cost = tf.reduce_mean(tf.norm(self.centers_placeholder-self.decoded,ord=2,axis=1))
#self.cost=tf.reduce_mean(tf.losses.mean_squared_error(labels=self.centers_placeholder,predictions=self.decoded))
target_norm = tf.norm(self.centers_placeholder,ord=2,axis=1)
difference_norm=tf.norm(self.centers_placeholder-self.decoded,ord=2,axis=1)
self.metric = tf.reduce_mean(difference_norm/target_norm)
print('model graph has built',flush=True)
global_step=tf.Variable(0,trainable=False)
starter_learning_rate=self.learning_rate
lr=tf.train.exponential_decay(starter_learning_rate,global_step,100000,0.96,staircase=True)
self.optimizer = tf.train.AdamOptimizer(lr).minimize(self.cost,global_step=global_step)
self.saver = tf.train.Saver()
I tried different architectures, deep and shallow, but the loss always converges at a high level. Here is one example of my losses: (loss curve image)
Can anyone help? Does anyone have experience using autoencoders on MFCC features?
Thanks!
I've built a seq-to-seq model to take N periods of continuous input data and predict M periods of continuous response data (M < N). Because my response is sparse (usually 0), I want to squish predictions that are "small" to 0, so I have added a Thresholded ReLU as my final layer. The model, with a single encoder and a single decoder, is:
# encoder
encoder_inputs_11 = tf.keras.layers.Input(shape=(n_past, n_features))
encoder_layr1_11 = tf.keras.layers.LSTM(encoderLayers[0], return_state=True)
encoder_outputs1_11 = encoder_layr1_11(encoder_inputs_11)
encoder_states1_11 = encoder_outputs1_11[1:]
# decoder
decoder_inputs_11 = tf.keras.layers.RepeatVector(n_future)(encoder_outputs1_11[0])
decoder_layr1_11 = tf.keras.layers.LSTM(decoderLayers[0], return_sequences=True)(decoder_inputs_11, initial_state = encoder_states1_11)
decoder_outputs1_11 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_response))(decoder_layr1_11)
# threshold layer
thresh_11 = tf.keras.layers.ThresholdedReLU(theta=reluTheta)(decoder_outputs1_11)
# entire model
result = tf.keras.models.Model(encoder_inputs_11, thresh_11)
How can I make theta in tf.keras.layers.ThresholdedReLU(theta=reluTheta) an optimizable parameter?
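One direction I have considered is a custom layer that holds theta as a trainable weight. Since the hard threshold has zero gradient with respect to theta almost everywhere, the sketch below uses a smooth sigmoid gate as an approximation (the layer name and the theta_init/sharpness parameters are my own assumptions, not an existing Keras API):
import tensorflow as tf

class TrainableThresholdedReLU(tf.keras.layers.Layer):
    # hypothetical layer: like ThresholdedReLU, but theta is learnable;
    # a sigmoid gate stands in for the hard step so theta receives gradients
    def __init__(self, theta_init=1.0, sharpness=10.0, **kwargs):
        super().__init__(**kwargs)
        self.theta_init = theta_init
        self.sharpness = sharpness

    def build(self, input_shape):
        self.theta = self.add_weight(
            name='theta', shape=(),
            initializer=tf.keras.initializers.Constant(self.theta_init),
            trainable=True)
        super().build(input_shape)

    def call(self, x):
        # smooth approximation of 1[x > theta]
        gate = tf.sigmoid(self.sharpness * (x - self.theta))
        return x * gate
It would replace the fixed layer as thresh_11 = TrainableThresholdedReLU(theta_init=reluTheta)(decoder_outputs1_11). Is something like this the right approach?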
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms, datasets
#neural network class
class Net(nn.Module):
    # initialize the class
    def __init__(self):
        super().__init__()
        # a feedforward neural network passes data from the input layer to the output layer
        # fully connected layer: input is 28*28 pixels (the image flattened to one row), output size is 64
        self.fc1 = nn.Linear(28*28, 64)
        # feed data from fc1 into fc2
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        # the output layer takes 64 inputs and produces 10 outputs, one per MNIST class
        self.fc4 = nn.Linear(64, 10)

    # forward pass through the data
    def forward(self, x):
        # relu is the activation function applied to each layer's output
        # the input and output dimensions of relu are the same
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # log_softmax gives a (log-)probability distribution over the 10 classes
        x = F.log_softmax(self.fc4(x), dim=1)
        return x
#declare a model
net = Net()
#print(net)
#passing in random data
x = torch.rand(28,28)
#resize to represent input shape (batch size, input x, input y)
x = x.view(-1,28,28)
#print(x)
#the optimizer adjusts the neural network based on the error calculation
import torch.optim as optim
#net.parameters() gives all the adjustable parts of the network; the learning rate sets the amount of change per step (we don't want the model to swerve based on one training case)
optimizer = optim.Adam(net.parameters(),lr=0.001)
#get the datasets using torchvision.datasets; transforms are operations applied to the data (here, conversion to tensors)
train = torchvision.datasets.MNIST("", train=True, download=True,transform=transforms.Compose([transforms.ToTensor()]))
test = torchvision.datasets.MNIST("", train=False, download=True,transform=transforms.Compose([transforms.ToTensor()]))
#store in a DataLoader; batch size is how many samples are passed through the model at once (held in GPU memory), and a typical batch size is between 8 and 64
#shuffling avoids feeding too much of one kind of image and leads to better generalization
trainset = torch.utils.data.DataLoader(train,batch_size=10,shuffle=True)
testset = torch.utils.data.DataLoader(test,batch_size=10,shuffle=True)
#one full pass through the data is an epoch
EPOCHS = 3
for epoch in range(EPOCHS):
    #data is one batch from the training set
    for data in trainset:
        #split the batch into features and labels
        features, labels = data
        #print(features, labels)
        #reset the gradients so the results of earlier backpropagations don't accumulate
        net.zero_grad()
        #pass the data into the network (make sure the input shape matches)
        output = net(features.view(-1, 28*28))
        #compute the error (output, expected)
        loss = F.nll_loss(output, labels)
        print("loss is", loss)
        #backpropagate the loss through the trainable parameters of the model
        loss.backward()
        #adjust the neural network
        optimizer.step()
I am using PyTorch on Google Colab. The error message says that the gradient isn't there.
I am unsure where the error stems from. I followed this tutorial: https://www.youtube.com/watch?v=9j-_dOze4IM&list=PLQVvvaa0QuDdeMyHEYc0gxFpYwHY2Qfdh&index=4
(screenshot of the error message from Google Colab)
I've looked at a few similar questions but I still don't understand how to solve my problem.
I am trying to build a CNN that estimates how many particles hit a detector, based on what's essentially an oscilloscope trace of the energy released in the detector over time.
I have 100,000 events of 1024 time samples, which I split 80/20 as train/test, like so:
from sklearn.model_selection import train_test_split
train_to_test_ratio=0.8 #proportion of the dataset to include in the train split
X_train,X_test,Y_train,Y_test=train_test_split(NormSignals,labels,train_size=train_to_test_ratio)
no_outputs = 14 # maximum number of particles expected
# force the labels to have 14 binary digits, one for each of the possible outputs
Y_train=tf.one_hot(Y_train,no_outputs)
Y_test=tf.one_hot(Y_test,no_outputs)
When I try to define the input shape for the network I do so like this (full CNN code below):
# Define input to neural network (tensors of 1024 time samples x 1 amplitude per sample)
inputs = keras.Input(shape=(1024,1))
But it gives me the error: "Input 0 of layer Conv_1 is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: [None, 1024, 1]"
I thought the input shape was as simple as the shape of the data arrays being passed to the network. Can someone please explain what the correct shape of my data should be?
Thank you very much in advance!
Full CNN:
from tensorflow import keras
import numpy as np
# Following the architecture of the CNN from the image recognition lab (14/5/2020):
# Simple CNN:
class noiseLayer(keras.layers.Layer):
    def __init__(self, mean):
        super(noiseLayer, self).__init__()
        self.mean = mean

    def call(self, input):
        # add Poisson noise normalized by its mean
        mean = self.mean
        return input + (np.random.poisson(mean))/mean
# Add data augmentation to produce a random flip of the data (the ECal is symmetrical)
# and add poissonian noise to all of the crystals - using large N and dividing by N normalises
# the noise to be approximately continuous between 0 and 1
data_augmentation = keras.Sequential([
noiseLayer(mean = 1000)
], name='DataAugm')
# Define input to neural network (tensors of 1024 time samples x 1 amplitude per sample)
inputs = keras.Input(shape=(1024,1))
#x=inputs
x = data_augmentation(inputs)
# first convolutional block
x = keras.layers.Conv2D(16, kernel_size=(3,3), name='Conv_1')(x)
x = keras.layers.LeakyReLU(0.1)(x)
x = keras.layers.MaxPool2D((2,2), name='MaxPool_1')(x)
# second convolutional block
x = keras.layers.Conv2D(16, kernel_size=(3,3), name='Conv_2')(x)
x = keras.layers.LeakyReLU(0.1)(x)
x = keras.layers.MaxPool2D((2,2), name='MaxPool_2')(x)
# third convolutional block
x = keras.layers.Conv2D(32, kernel_size=(3,3), name='Conv_3')(x)
x = keras.layers.LeakyReLU(0.1)(x)
x = keras.layers.MaxPool2D((2,2), name='MaxPool_3')(x)
# Flatten output tensor of the last convolutional layer so it can be used as
# input to the dense layers
x = keras.layers.Flatten(name='Flatten')(x)
# dense network: 2 dense hidden layers with 64 neurons each, with ReLU activation
# Classifier
x = keras.layers.Dense(64, name='Dense_1')(x)
x = keras.layers.ReLU(name='ReLU_dense_1')(x)
#x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(64, name='Dense_2')(x)
x = keras.layers.ReLU(name='ReLU_dense_2')(x)
outputs = keras.layers.Dense(no_outputs, activation='softmax', name='Output')(x)
# Model definition
model = keras.Model(inputs=inputs, outputs=outputs, name='VGGlike_CNN')
# Print model summary
model.summary()
# Show model structure
keras.utils.plot_model(model, show_shapes=True)
The problem was that I was using 2D layers to try to solve a 1D problem.
After changing all the 2D layers to their 1D counterparts, the model compiles without errors:
x = keras.layers.Conv1D(16, kernel_size=(3), name='Conv_1')(x)
x = keras.layers.LeakyReLU(0.1)(x)
x = keras.layers.MaxPool1D((2), name='MaxPool_1')(x)
# second convolutional block
x = keras.layers.Conv1D(16, kernel_size=(3), name='Conv_2')(x)
x = keras.layers.LeakyReLU(0.1)(x)
x = keras.layers.MaxPool1D((2), name='MaxPool_2')(x)
# third convolutional block
x = keras.layers.Conv1D(32, kernel_size=(3), name='Conv_3')(x)
x = keras.layers.LeakyReLU(0.1)(x)
x = keras.layers.MaxPool1D((2), name='MaxPool_3')(x)
# Flatten output tensor of the last convolutional layer so it can be used as
# input to the dense layers
x = keras.layers.Flatten(name='Flatten')(x)
# dense network: 2 dense hidden layers with 64 neurons each, with ReLU activation
# Classifier
x = keras.layers.Dense(64, name='Dense_1')(x)
x = keras.layers.ReLU(name='ReLU_dense_1')(x)
#x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(64, name='Dense_2')(x)
x = keras.layers.ReLU(name='ReLU_dense_2')(x)
I am trying to implement DDPG from Keras RL and have the following actor network.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('linear'))
However, I would prefer to have the output scaled to the action-space bounds of my custom Gym environment (env.action_space).
https://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html shows this using the tflearn API, where they use
def create_actor_network(self):
    inputs = tflearn.input_data(shape=[None, self.s_dim])
    net = tflearn.fully_connected(inputs, 400)
    net = tflearn.layers.normalization.batch_normalization(net)
    net = tflearn.activations.relu(net)
    net = tflearn.fully_connected(net, 300)
    net = tflearn.layers.normalization.batch_normalization(net)
    net = tflearn.activations.relu(net)
    # Final layer weights are init to Uniform[-3e-3, 3e-3]
    w_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003)
    out = tflearn.fully_connected(
        net, self.a_dim, activation='tanh', weights_init=w_init)
    # Scale output to -action_bound to action_bound
    scaled_out = tf.multiply(out, self.action_bound)
    return inputs, out, scaled_out
What is the equivalent command for scaling the output layer according to my requirements?
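Is the Keras equivalent something like the following (a sketch replacing my final linear activation; it assumes a Box action space symmetric around zero)?
from keras.layers import Dense, Activation, Lambda

action_bound = env.action_space.high  # assumes a Box space symmetric about 0
actor.add(Dense(nb_actions))
actor.add(Activation('tanh'))                   # squash to [-1, 1]
actor.add(Lambda(lambda x: x * action_bound))   # rescale to [-bound, bound]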
I have used Keras and TensorFlow to classify Fashion MNIST following this tutorial.
It uses the AdamOptimizer to find the value for model parameters that minimize the loss function of the network. The input for the network is a 2-D tensor with shape [28, 28], and output is a 1-D tensor with shape [10] which is the result of a softmax function.
Once the network has been trained, I want to use the optimizer for another task: find an input that maximizes one of the elements of the output tensor. How can this be done? Is it possible with Keras, or does one have to use a lower-level API?
Since the input is not unique for a given output, it would be even better if we could impose some constraints on the values the input can take.
The trained model has the following format
model = keras.Sequential([
keras.layers.Flatten(input_shape=(28, 28)),
keras.layers.Dense(128, activation=tf.nn.relu),
keras.layers.Dense(10, activation=tf.nn.softmax)
])
I think you would want to backpropagate with respect to the input while freezing all the weights of your model. What you could do is:
Add a dense layer after the input layer with the same dimensions as the input, and set it as trainable.
Freeze all the other layers of your model (except the one you added).
As input, feed an identity matrix and train your model based on whatever output you desire. A rough sketch of this setup follows.
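Here is a rough sketch of that idea (names assumed, using the model from the question; I feed a constant all-ones vector, which plays the same role as the identity-matrix input, since only the inserted trainable layer actually matters):
import numpy as np
from tensorflow import keras

# freeze the trained classifier ("model" from the question)
for layer in model.layers:
    layer.trainable = False

inp = keras.layers.Input(shape=(784,))
# the kernel of this layer becomes the image we are optimizing
img = keras.layers.Dense(784, use_bias=False, name='searched_input')(inp)
out = model(keras.layers.Reshape((28, 28))(img))
search_model = keras.Model(inp, out)
search_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

dummy = np.ones((1, 784))   # constant input; only the Dense kernel trains
target = np.array([7])      # class whose probability we want to maximize
search_model.fit(dummy, target, epochs=200, verbose=0)

# the learned "input" is the layer's response to the all-ones vector,
# i.e. the column sums of its kernel
kernel = search_model.get_layer('searched_input').get_weights()[0]
image = kernel.sum(axis=0).reshape(28, 28)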
This article and this post might be able to help you if you want to backprop based on the input instead. It's a bit like what you are aiming for but you can get the intuition.
It would be very similar to the way the filters of a convolutional network are visualized: we would do gradient ascent optimization in input space to maximize the response of a particular filter.
Here is how to do it: after training is finished, first we need to specify the output and define a loss function that we want to maximize:
from keras import backend as K
output_class = 0 # the index of the output class we want to maximize
output = model.layers[-1].output
loss = K.mean(output[:,output_class]) # get the average activation of our desired class over the batch
Next, we need to take the gradient of the loss we have defined above with respect to the input layer:
grads = K.gradients(loss, model.input)[0] # the output of `gradients` is a list, just take the first (and only) element
grads = K.l2_normalize(grads) # normalize the gradients to help having a smooth optimization process
Next, we need to define a backend function that takes the initial input image and gives the values of loss and gradients as outputs, so that we can use it in the next step to implement the optimization process:
func = K.function([model.input], [loss, grads])
Finally, we implement the gradient ascent optimization process:
import numpy as np
input_img = np.random.random((1, 28, 28)) # define an initial random image
lr = 1. # learning rate used for gradient updates
max_iter = 50 # number of gradient updates iterations
for i in range(max_iter):
loss_val, grads_val = func([input_img])
input_img += grads_val * lr # update the image based on gradients
Note that, after this process is finished, to display the image you may need to make sure that all the values in the image are in the range [0, 255] (or [0,1]).
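For example, one simple way to keep the values in [0, 1] during the loop is to clip after each update (a small addition to the sketch above):
input_img = np.clip(input_img, 0.0, 1.0)  # constrain pixel values to [0, 1]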
After the hints Saket Kumar Singh gave in his answer, I wrote the following, which seems to solve the question.
I create two custom layers. Maybe Keras already offers some classes that are equivalent to them.
The first one is a trainable input:
class MyInputLayer(keras.layers.Layer):
    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(MyInputLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                      shape=self.output_dim,
                                      initializer='uniform',
                                      trainable=True)
        super(MyInputLayer, self).build(input_shape)

    def call(self, x):
        return self.kernel

    def compute_output_shape(self, input_shape):
        return self.output_dim
The second one gets the probability of the label of interest:
class MySelectionLayer(keras.layers.Layer):
    def __init__(self, position, **kwargs):
        self.position = position
        self.output_dim = 1
        super(MySelectionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        super(MySelectionLayer, self).build(input_shape)

    def call(self, x):
        mask = np.array([False]*x.shape[-1])
        mask[self.position] = True
        return tf.boolean_mask(x, mask, axis=1)

    def compute_output_shape(self, input_shape):
        return self.output_dim
I used them in this way:
# Build the model
layer_flatten = keras.layers.Flatten(input_shape=(28, 28))
layerDense1 = keras.layers.Dense(128, activation=tf.nn.relu)
layerDense2 = keras.layers.Dense(10, activation=tf.nn.softmax)
model = keras.Sequential([
layer_flatten,
layerDense1,
layerDense2
])
# Compile the model
model.compile(optimizer=tf.train.AdamOptimizer(),
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
# Train the model
# ...
# Freeze the model
layerDense1.trainable = False
layerDense2.trainable = False
# Build another model
class_index = 7
layerInput = MyInputLayer((1,784))
layerSelection = MySelectionLayer(class_index)
model_extended = keras.Sequential([
layerInput,
layerDense1,
layerDense2,
layerSelection
])
# Compile it
model_extended.compile(optimizer=tf.train.AdamOptimizer(),
loss='mean_absolute_error')
# Train it
dummyInput = np.ones((1,1))
target = np.ones((1,1))
model_extended.fit(dummyInput, target,epochs=300)
# Retrieve the weights of layerInput
layerInput.get_weights()[0]
Interesting. Maybe a solution would be to feed all your data to the network and, for each sample, save the output layer after the softmax.
This way, for 3 classes, where you want to find the best input for class 1, you are looking for outputs where the first component is high, for example [1, 0, 0].
Indeed, the output represents the probability, or the confidence of the network, that the sample belongs to each of the classes.
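A rough sketch of that search (X_train is an assumed name for the training inputs):
import numpy as np

probs = model.predict(X_train)  # softmax outputs, shape (n_samples, n_classes)
# the training sample the network is most confident belongs to class 0
best_for_class_0 = X_train[np.argmax(probs[:, 0])]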
Funny coincidence, I was just working on the same "problem". I'm interested in the direction of adversarial training etc. What I did was to insert a LocallyConnected2D layer after the input and then train with data that is all ones and has the class of interest as target.
As model I use
batch_size = 64
num_classes = 10
epochs = 20
input_shape = (28, 28, 1)
inp = tf.keras.layers.Input(shape=input_shape)
conv1 = tf.keras.layers.Conv2D(32, kernel_size=(3, 3),activation='relu',kernel_initializer='he_normal')(inp)
pool1 = tf.keras.layers.MaxPool2D((2, 2))(conv1)
drop1 = tf.keras.layers.Dropout(0.20)(pool1)
flat = tf.keras.layers.Flatten()(drop1)
fc1 = tf.keras.layers.Dense(128, activation='relu')(flat)
norm1 = tf.keras.layers.BatchNormalization()(fc1)
dropfc1 = tf.keras.layers.Dropout(0.25)(norm1)
out = tf.keras.layers.Dense(num_classes, activation='softmax')(dropfc1)
model = tf.keras.models.Model(inputs = inp , outputs = out)
model.compile(loss=tf.keras.losses.categorical_crossentropy,
optimizer=tf.keras.optimizers.RMSprop(),
metrics=['accuracy'])
model.summary()
after training I insert the new layer
def insert_intermediate_layer_in_keras(model, new_layer, before_layer_id):
    layers = [l for l in model.layers]
    if before_layer_id == 0:
        x = new_layer
    else:
        x = layers[0].output
    for i in range(1, len(layers)):
        if i == before_layer_id:
            x = new_layer(x)
            x = layers[i](x)
        else:
            x = layers[i](x)
    new_model = tf.keras.models.Model(inputs=layers[0].input, outputs=x)
    return new_model

def fix_model(model):
    # freeze all layers of the trained model
    for l in model.layers:
        l.trainable = False

fix_model(model)
new_layer = tf.keras.layers.LocallyConnected2D(1, kernel_size=(1, 1),
activation='linear',
kernel_initializer='he_normal',
use_bias=False)
new_model = insert_intermediate_layer_in_keras(model,new_layer,1)
new_model.compile(loss=tf.keras.losses.categorical_crossentropy,
optimizer=tf.keras.optimizers.RMSprop(),
metrics=['accuracy'])
and finally rerun training with my fake data.
X_fake = np.ones((60000,28,28,1))
print(Y_test.shape)
y_fake = np.ones((60000))
Y_fake = tf.keras.utils.to_categorical(y_fake, num_classes)
new_model.fit(X_fake, Y_fake, epochs=100)
weights = new_layer.get_weights()[0]
import matplotlib.pyplot as plt
plt.imshow(weights.reshape(28,28))
plt.show()
Results are not yet satisfying, but I'm confident in the approach and guess I need to play around with the optimizer.