I'm trying to build a neural network for a classification problem: given audio clips of me talking and of other people talking, it should tell them apart. But when I train it, I get strange accuracy and loss values.
Here is my code.
'''
This is only to read the data and pass it into an array
1. Get the Audio data, my voice so we can visualize it into an array.
2. Build an ANN with the data already into an array. classification problem
3. Real time predictor using pyaudio and trained model
'''
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.layers.core import Dropout
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import numpy as np
from scipy.io import wavfile
from pathlib import Path
import os
# cut audio to make the same sizes, shape and length
def trim_wav( originalWavPath, newWavPath , start, new ):
    '''
    Write the samples between two time offsets of a wav file to a new wav file.

    :param originalWavPath: the path to the source wav file
    :param newWavPath: output wav file * can be same path as original
    :param start: start time in seconds
    :param new: end time in seconds
    :return: None
    '''
    sampleRate, waveData = wavfile.read( originalWavPath )
    startSample = int( start * sampleRate )
    endSample = int( new * sampleRate )
    # Slicing clamps to the array length, so a clip shorter than `new`
    # seconds is written out unchanged from `start` onwards
    wavfile.write( newWavPath, sampleRate, waveData[startSample:endSample])
### DATASET
# My voice data: every clip is trimmed in place to its first 5 seconds, then re-read
pathlist = Path(os.path.abspath('Voiceclassification/Data/me/')).rglob('*.wav')
for path in pathlist:
    filename = str(path)
    # Source and destination are the same path, so the trim overwrites the clip
    trim_wav(filename, filename, 0, 5)
    # `data` is rebound on every pass: only the last clip read survives the loop
    samplerate, data = wavfile.read(filename)

# Other voice data, handled the same way
pathlist2 = Path(os.path.abspath('Voiceclassification/Data/other/')).rglob('*.wav')
for path2 in pathlist2:
    filename2 = str(path2)
    trim_wav(filename2, filename2, 0, 5)
    samplerate2, data2 = wavfile.read(filename2)

### ADAPTING THE DATA FOR THE MODEL
# NOTE(review): X holds the samples of one "me" clip and y the samples of one
# "other" clip, so y contains raw audio values rather than 0/1 class labels.
X = data.reshape(-1, 1)  # My voice
y = data2.reshape(-1, 1)  # Other data

### Training the model
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Feature scaling: the scaler is fitted on the training split only
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### Creating the ANN: two 6-unit ReLU hidden layers with light dropout,
### followed by a single sigmoid output unit
ann = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=6, activation="relu"),
    Dropout(0.05),
    tf.keras.layers.Dense(units=6, activation="relu"),
    Dropout(0.05),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Compile our neural network
ann.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

# Fit ANN
ann.fit(x_train, y_train, batch_size=1024, epochs=100)  ############ batch 32
ann.save('Models/voiceclassification.model')
Does anyone know whether something in my code is causing the very low accuracy?
@MarkLavin, your answer was correct. I'm the person who asked the question; this is my other (real) account. I tried what @MarkLavin suggested and it worked. I modified my code as follows.
### DATASET
import glob
import random

data = []
labels = []
# Every non-directory entry below the data folder is treated as an audio clip
data_root = os.path.abspath(os.path.join("Voiceclassification", "Data"))
audio_files = [
    f for f in glob.glob(os.path.join(data_root, "**", "*"), recursive=True)
    if not os.path.isdir(f)
]
# Shuffle so both classes are interleaved before any train/test split
random.shuffle(audio_files)
for path in audio_files:
    # Trim each clip in place to its first 5 seconds
    trim_wav(path, path, 0, 5)
    samplerate, data_array = wavfile.read(path)
    # reshape returns a new array, so its result has to be kept; flattening
    # gives one 1D feature vector per clip
    data.append(data_array.reshape(-1))
    # The parent directory name is the class: "me" -> 1, anything else -> 0
    label = os.path.basename(os.path.dirname(path))
    labels.append([1 if label == "me" else 0])
### ADAPTING THE DATA FOR THE MODEL
# assumes every trimmed clip has the same number of samples — TODO confirm
X = np.array(data) # all voices data, one row per clip
y = np.array(labels) # data label: 1 is me, 0 is other
With labels in y and all of the shuffled data in X, the results are 100% and 90% accuracy. Thank you so much, @MarkLavin :)
Related
I made a simple RNN model to learn and fit a single wave file.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dropout,Dense
from tensorflow.keras.layers import SimpleRNN
import librosa
import librosa.display
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

a, sr = librosa.load("A.wav", sr=22050)
# Short-time Fourier transform of A.wav
rawdata = librosa.stft(a, n_fft=512, hop_length=512 // 4, window='hann')
rawdata = rawdata.transpose()  # [Freq, Frame] => [Frame, Freq]
# NOTE(review): librosa.stft returns complex values and they are fed to the
# model unchanged — confirm this is intended (e.g. versus using np.abs)
input_len = 10  # number of consecutive frames used to predict the next one
n_in = rawdata.shape[1]  # number of values per frame, i.e. the feature size
input = []
target = []
for i in range(0, len(rawdata) - input_len):
    input.append(rawdata[i:i + input_len])  # frames
    target.append(rawdata[i + input_len])  # one step forward frame for answer.
X = np.array(input)
Y = np.array(target)
# Separate 8:2 for training and test; shuffle=False keeps the time order
x, val_x, y, val_y = train_test_split(X, Y, test_size=int(X.shape[0] * 0.2), shuffle=False)
n_hidden = 512
epoch = 100
model = Sequential()
model.add(SimpleRNN(n_hidden, input_shape=(input_len, n_in), return_sequences=False))
model.add(Dense(n_hidden, activation="linear"))
model.add(Dense(n_in, activation="linear"))
opt = Adam(learning_rate=0.001)
model.compile(loss='mse', optimizer=opt)
model.summary()
history = model.fit(x, y, epochs=epoch, batch_size=10, validation_data=(val_x, val_y))
OK, it works fine.
It learns the single wave file A.wav.
However, how can I train it on multiple wave files?
B.wav C.wav
For example,
If I call model.fit() multiple times, once for each wav file, does the model remember what it learned earlier?
Yes, the model keeps what it learned in previous calls to fit, so you can call fit multiple times. However, it is better to use model.train_on_batch, which is a simpler version of fit meant for a small batch of data.
You can also modify your code to add the features of the other wav files to the data.
# second way
input_len = 10  # number of consecutive frames used to predict the next one
input = []
target = []
for wav_name in ['A.wav', 'B.wav', 'C.wav']:
    a, sr = librosa.load(wav_name, sr=22050)
    # Fourier-transform this file, then put frames on the first axis
    rawdata = librosa.stft(a, n_fft=512, hop_length=512 // 4, window='hann').transpose()
    # Windows are cut per file, so no training window spans two files
    n_windows = len(rawdata) - input_len
    for i in range(n_windows):
        input.append(rawdata[i:i + input_len])  # frames
        target.append(rawdata[i + input_len])  # one step forward frame for answer.
enter image description here
I am trying to train a model that detects the native language of a speaker from recordings of their English speech.
I got this error while trying to run the code below, and the model cannot be trained.
Can anyone explain what the mistake is?
Below is the code snippet that trains the model:
import pandas as pd
from collections import Counter
import sys
sys.path.append('../dialectdetect-master/src>')
import getsplit
from keras import utils
import accuracy
import multiprocessing
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import MaxPooling2D, Conv2D
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, TensorBoard
# Gates the progress messages printed by the main script
DEBUG = True
# Not referenced in the code shown here (remove_silence has its own `thresh` default)
SILENCE_THRESHOLD = .01
# Target sample rate used by get_wav when resampling and by to_mfcc
RATE = 24000
# Number of MFCC coefficients requested in to_mfcc
N_MFCC = 13
# Width, in MFCC columns, of each segment cut by make_segments / segment_one
COL_SIZE = 30
# Number of epochs passed to the fit call in train_model
EPOCHS = 10 #35#250
def to_categorical(y):
    '''
    Converts list of languages into a binary class matrix

    Class indices follow the sorted order of the distinct labels, so two calls
    that see the same set of labels (for example the train and the test split)
    assign the same column to each language.

    :param y (list): list of languages
    :return (numpy array): float32 binary class matrix, one row per entry of y
    '''
    lang_dict = {language: index for index, language in enumerate(sorted(set(y)))}
    indices = np.asarray([lang_dict[language] for language in y], dtype=int)
    # One-hot encode by picking rows of the identity matrix
    return np.eye(len(lang_dict), dtype='float32')[indices]
def get_wav(language_num):
    '''
    Load wav file from disk and resample it to RATE
    :param language_num (str): name of the file under ../audio/, without the .wav extension
    :return (numpy array): wav form resampled to RATE
    '''
    # sr=None keeps the file's native sample rate, so the audio is resampled
    # exactly once (straight to RATE) instead of twice
    y, sr = librosa.load('../audio/{}.wav'.format(language_num), sr=None)
    return(librosa.core.resample(y=y,orig_sr=sr,target_sr=RATE, scale=True))
def to_mfcc(wav):
    '''
    Computes the Mel Frequency Cepstral Coefficients of a wav form
    :param wav (numpy array): Wav form
    :return (2d numpy array): MFCC
    '''
    mfcc = librosa.feature.mfcc(y=wav, sr=RATE, n_mfcc=N_MFCC)
    return mfcc
def remove_silence(wav, thresh=0.04, chunk=5000):
    '''
    Searches wav form for segments of silence. If wav form values are lower than 'thresh' for 'chunk' samples, the values will be removed
    :param wav (np array): Wav array to be filtered
    :param thresh (float): amplitude at or above which a sample counts as sound
    :param chunk (int): number of samples examined together
    :return (np array): Wav array with silence removed
    '''
    wav = np.asarray(wav)
    keep = np.zeros(len(wav), dtype=bool)
    # Integer division: range() rejects the float that `/` yields on Python 3.
    # Samples after the last whole chunk stay False and are therefore dropped.
    for x in range(len(wav) // chunk):
        segment = wav[chunk * x:chunk * (x + 1)]
        if np.any(segment >= thresh) or np.any(segment <= -thresh):
            keep[chunk * x:chunk * (x + 1)] = True
    return(wav[keep])
def normalize_mfcc(mfcc):
    '''
    Min-max scale the absolute values of an MFCC array
    :param mfcc: MFCC array
    :return: result of MinMaxScaler.fit_transform on the absolute values
    '''
    magnitudes = np.abs(mfcc)
    scaler = MinMaxScaler()
    return scaler.fit_transform(magnitudes)
def make_segments(mfccs,labels, col_size=None):
    '''
    Makes segments of mfccs and attaches them to the labels
    :param mfccs: list of mfccs
    :param labels: list of labels
    :param col_size: segment width in columns; defaults to COL_SIZE
    :return (tuple): Segments with labels
    '''
    if col_size is None:
        col_size = COL_SIZE
    segments = []
    seg_labels = []
    for mfcc, label in zip(mfccs, labels):
        # Floor division: trailing columns that do not fill a segment are dropped
        for start in range(mfcc.shape[1] // col_size):
            segments.append(mfcc[:, start * col_size:(start + 1) * col_size])
            seg_labels.append(label)
    return(segments, seg_labels)
def segment_one(mfcc, col_size=None):
    '''
    Creates segments from one mfcc image. Trailing columns that are too few to fill a whole segment are dropped.
    :param mfcc (numpy array): MFCC array
    :param col_size: segment width in columns; defaults to COL_SIZE
    :return (numpy array): Segmented MFCC array
    '''
    if col_size is None:
        col_size = COL_SIZE
    n_segments = mfcc.shape[1] // col_size
    return(np.array([mfcc[:, start * col_size:(start + 1) * col_size]
                     for start in range(n_segments)]))
def create_segmented_mfccs(X_train):
    '''
    Applies segment_one to every MFCC in X_train
    :param X_train: list of MFCCs
    :return: list with one segmented MFCC array per input MFCC
    '''
    return [segment_one(mfcc) for mfcc in X_train]
def train_model(X_train,y_train,X_validation,y_validation, batch_size=128): #64
    '''
    Trains 2D convolutional neural network
    :param X_train: Numpy array of mfccs
    :param y_train: Binary matrix based on labels
    :param X_validation: Numpy array of mfccs used as validation data
    :param y_validation: Binary matrix based on the validation labels
    :param batch_size: Number of samples per batch drawn from the data generator
    :return: Trained model
    '''
    # Get row, column, and class sizes
    rows = X_train[0].shape[0]
    cols = X_train[0].shape[1]
    val_rows = X_validation[0].shape[0]
    val_cols = X_validation[0].shape[1]
    num_classes = len(y_train[0])

    # input image dimensions to feed into 2D ConvNet Input layer
    input_shape = (rows, cols, 1)
    X_train = X_train.reshape(X_train.shape[0], rows, cols, 1)
    X_validation = X_validation.reshape(X_validation.shape[0], val_rows, val_cols, 1)

    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'training samples')

    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3,3), activation='relu',
                     data_format="channels_last",
                     input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64,kernel_size=(3,3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])

    # Stops training if accuracy does not change at least 0.005 over 10 epochs
    es = EarlyStopping(monitor='acc', min_delta=.005, patience=10, verbose=1, mode='auto')

    # Creates log file for graphical interpretation using TensorBoard
    # (raw string: same path as before, without the invalid "\l" escape)
    tb = TensorBoard(log_dir=r'..\logs', histogram_freq=0, batch_size=32, write_graph=True, write_grads=True,
                     write_images=True, embeddings_freq=0, embeddings_layer_names=None,
                     embeddings_metadata=None)

    # Image shifting
    datagen = ImageDataGenerator(width_shift_range=0.05)

    # One epoch is one pass over the training data: a whole number of steps,
    # derived from the batch size the generator actually uses (ceil division)
    steps_per_epoch = -(-len(X_train) // batch_size)

    # Fit model using ImageDataGenerator
    model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                        steps_per_epoch=steps_per_epoch,
                        epochs=EPOCHS,
                        callbacks=[es,tb], validation_data=(X_validation,y_validation))
    return (model)
def save_model(model, model_filename):
    '''
    Save model to ../models/<model_filename>.h5
    :param model: Trained model to be saved; must provide a save(path) method
    :param model_filename: File name without directory or extension
    :return: None
    '''
    target_path = '../models/{}.h5'.format(model_filename)
    model.save(target_path)  # creates a HDF5 file, e.g. 'my_model.h5'
############################################################
#######################################
if __name__ == '__main__':
    '''
    Console command example:
    python trainmodel.py bio_metadata.csv model50
    '''
    # Load arguments
    file_name = sys.argv[1]
    model_filename = sys.argv[2]

    # Load metadata
    df = pd.read_csv(file_name)

    # Filter metadata to retrieve only files desired
    filtered_df = getsplit.filter_df(df)

    # Train test split
    X_train, X_test, y_train, y_test = getsplit.split_people(filtered_df)

    # Get statistics
    train_count = Counter(y_train)
    test_count = Counter(y_test)
    print("Entering main")

    # Baseline: share of the most frequent class among the test labels
    acc_to_beat = test_count.most_common(1)[0][1] / float(np.sum(list(test_count.values())))

    # To categorical
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # The context manager shuts the worker processes down once the (blocking)
    # map calls have returned, so the pool is not leaked
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        # Get resampled wav files using multiprocessing
        if DEBUG:
            print('Loading wav files....')
        X_train = pool.map(get_wav, X_train)
        X_test = pool.map(get_wav, X_test)

        # Convert to MFCC
        if DEBUG:
            print('Converting to MFCC....')
        X_train = pool.map(to_mfcc, X_train)
        X_test = pool.map(to_mfcc, X_test)

    # Create segments from MFCCs
    X_train, y_train = make_segments(X_train, y_train)
    X_validation, y_validation = make_segments(X_test, y_test)

    # Randomize training segments (the 50 segments split off are discarded)
    X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=50)

    # Train model
    model = train_model(np.array(X_train), np.array(y_train), np.array(X_validation),np.array(y_validation))

    # Make predictions on full X_test MFCCs
    y_predicted = accuracy.predict_class_all(create_segmented_mfccs(X_test), model)

    # Print statistics
    print('Training samples:', train_count)
    print('Testing samples:', test_count)
    print('Accuracy to beat:', acc_to_beat)
    print('Confusion matrix of total samples:\n', np.sum(accuracy.confusion_matrix(y_predicted, y_test),axis=1))
    print('Confusion matrix:\n',accuracy.confusion_matrix(y_predicted, y_test))
    print('Accuracy:', accuracy.get_accuracy(y_predicted,y_test))

    # Save model
    save_model(model, model_filename)
I want to convert code written in Python into MATLAB code. Is that possible? I am also wondering how Python libraries can be used from MATLAB. Please share the procedure for doing the conversion.
Here is the Data I used:
https://drive.google.com/open?id=1GLm87-5E_6YhUIPZ_CtQLV9F9wcGaTj2
Here is my code in Python:
# imports libraries
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import random
from scipy import signal
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential
from tensorflow import set_random_seed
from tensorflow.keras.initializers import glorot_uniform
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from importlib import reload
# useful pandas display settings: show floats with three decimal places
pd.options.display.float_format = '{:.3f}'.format
# useful functions
def plot_history(history, metrics_to_plot):
    """
    Draw one line per requested metric from a fitted neural net's history.

    `history` must expose a `history` mapping from metric name to its
    per-epoch values; `metrics_to_plot` lists the names to draw.
    """
    for metric in metrics_to_plot:
        plt.plot(history.history[metric])
    # axis names and a legend that uses the metric names themselves
    plt.xlabel('epoch')
    plt.ylabel('metric')
    plt.legend(metrics_to_plot)
    plt.show()
def plot_fit(y_true, y_pred, title='title'):
    """
    Plot true and predicted values side by side, ordered by ascending true value.
    """
    # pair each true value with its prediction by row position
    predictions = pd.DataFrame(y_pred)
    results = y_true.reset_index(drop=True).merge(predictions, left_index=True, right_index=True)
    results.columns = ['true', 'prediction']
    # sorting by the true value makes the two curves easy to compare
    results = results.sort_values(by=['true']).reset_index(drop=True)
    # line plot of both columns, then the individual points on top
    results.plot()
    plt.scatter(results.index, results.true, s=5)
    plt.scatter(results.index, results.prediction, s=5)
    plt.xlabel('obs sorted in ascending order with respect to true values')
    plt.title(title)
    plt.show()
def reset_all_randomness():
    """
    Reload the random-number-related modules, seed them all with one fixed
    value and return a Glorot-uniform initializer built with that seed.
    """
    seed = 984797
    # reloads
    for module in (tf, np, random):
        reload(module)
    # seeds - for reproducibility
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    set_random_seed(seed)
    np.random.seed(seed)
    return glorot_uniform(seed=seed)
def give_me_mse(true, prediction):
    """
    This function returns mse for 2 vectors: true and predicted values.

    Both inputs are flattened first, so a column vector of shape (n, 1) can be
    compared with a flat vector of shape (n,) without NumPy broadcasting the
    difference into an n-by-n matrix.
    """
    true = np.asarray(true, dtype=float).ravel()
    prediction = np.asarray(prediction, dtype=float).ravel()
    return np.mean((true - prediction) ** 2)
# Importing the dataset: Sheet1 is read once and reused for the experiment data
Data = pd.read_excel(r"C:\filelocation\Data.xlsx","Sheet1")
X = Data.values
y = pd.read_excel(r"C:\filelocation\Data.xlsx","Sheet2").values
# Importing the experiment data
v = pd.DataFrame(Data, columns= ['v']).values
c = pd.DataFrame(Data, columns= ['c']).values
ird = pd.DataFrame(Data, columns= ['ird']).values
tmp = pd.DataFrame(Data, columns= ['tmp']).values
#Data Preparation
ird = ird.ravel()
tmp = tmp.ravel()
ir = np.nanmax(ird)
tp = np.nanmax(tmp)
p = v*c
p = p.ravel()
peaks, _ = signal.find_peaks(p)
nop = len(peaks)
pv = p.max()
#Experimental Data for testing
E_data = np.array([[ir,tp,pv,nop]])
#importing some more libraries
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(np.ravel(y))
y_encoded = encoder.transform(np.ravel(y))
# convert integers to dummy variables (i.e. one hot encoded)
y_dummy = np_utils.to_categorical(y_encoded)
# reset_all_randomness - for reproducibility
my_init = reset_all_randomness()
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test, y_train_dummy, y_test_dummy = train_test_split(X, y, y_dummy, test_size = 0.3, random_state = 20)
# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
E_data = sc.transform(E_data)
# Initialising the ANN
model0 = Sequential()
# Adding 1 hidden layer: the input layer and the first hidden layer
model0.add(Dense(units = 160, activation = 'tanh', input_dim = 4, kernel_initializer=my_init))
# Adding 2 hidden layer
model0.add(Dense(units = 49, activation = 'tanh', kernel_initializer=my_init))
# Adding 3 hidden layer
model0.add(Dense(units = 24, activation = 'tanh', kernel_initializer=my_init))
# Adding 4 hidden layer
model0.add(Dense(units = 15, activation = 'tanh', kernel_initializer=my_init))
# Adding output layer
model0.add(Dense(units = 6, activation = 'softmax', kernel_initializer=my_init))
# Set up Optimizer
Optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.99)
# Compiling the ANN
model0.compile(optimizer = Optimizer, loss = 'categorical_crossentropy', metrics=['accuracy','categorical_crossentropy','mse'])
# Fitting the ANN to the Train set, at the same time observing quality on Valid set
history = model0.fit(X_train, y_train_dummy, validation_data=(X_test, y_test_dummy), batch_size = 100, epochs = 1500)
# Generate prediction for all Train, Valid set and Experimental set
y_train_pred_model0 = model0.predict(X_train)
y_test_pred_model0 = model0.predict(X_test)
y_exp_pred_model0 = model0.predict(E_data)
# find final prediction by taking class with highest probability; the +1
# turns the 0-based column index into a class number starting at 1
# NOTE(review): this presumes the original labels are 1..6 — confirm against the data
y_train_pred_model0 = np.argmax(y_train_pred_model0, axis=1).reshape(-1, 1) + 1
y_test_pred_model0 = np.argmax(y_test_pred_model0, axis=1).reshape(-1, 1) + 1
y_exp_pred_model0 = np.argmax(y_exp_pred_model0, axis=1).reshape(-1, 1) + 1
# check what metrics are in fact available in history
history.history.keys()
# Inverse scaling
X_train_inverse = sc.inverse_transform(X_train)
X_test_inverse = sc.inverse_transform(X_test)
E_data_inverse = sc.inverse_transform(E_data)
#Plots
print('#######################################################################')
# look at model fitting history
plot_history(history, ['mean_squared_error', 'val_mean_squared_error'])
plot_history(history, ['categorical_crossentropy', 'val_categorical_crossentropy'])
plot_history(history, ['acc', 'val_acc'])
# look at model fit quality
plot_fit(pd.DataFrame(y_train), y_train_pred_model0, 'Fit on train data')
plot_fit(pd.DataFrame(y_test), y_test_pred_model0, 'Fit on test data')
#Results
print('#######################################################################')
print('=============Mean Squared Error============')
print('MSE on train data is: {}'.format(give_me_mse(y_train, y_train_pred_model0)))
print('MSE on test data is: {}'.format(give_me_mse(y_test, y_test_pred_model0)))
print('#######################################################################')
print('================Accuracy===================')
print('Accuracy of ANN is: {} Percentage'.format((accuracy_score(y_test, y_test_pred_model0))*100))
print('#######################################################################')
print('========Result of Test Data set is=========')
for i in range(len(y_test)):
    print('%s => %d (expected %s)' % (X_test_inverse[i].tolist(), y_test_pred_model0[i], y_test[i].tolist()))
print('#######################################################################')
print('====Result of Experimental Data set is=====')
print('%s => %d' % (E_data_inverse, y_exp_pred_model0))
There is no direct way to convert Python code to MATLAB code.
What you can do is translate the approach (the algorithm) yourself and write the MATLAB code from scratch.
Alternatively, and probably preferable for you, you can call the Python script directly from MATLAB using its Python interface.
Here is the link for further reading: https://in.mathworks.com/help/matlab/call-python-libraries.html
for example:
>> py.math.sqrt(4)
ans =
2
To run your own function, you can create a file in your current MATLAB working directory. here is the file ‘hello.py’ that contained these two lines:
def world():
    """Return the greeting string used by the MATLAB example."""
    return 'hello world'
Then in MATLAB:
>> py.hello.world()
ans =
hello world
If you run into errors, make sure you are using a supported version of Python and add
pyversion <path_to_executable>
to the start of your MATLAB file.
That said, I'm not sure how well this will work given all the Python libraries you are importing (SciPy, TensorFlow, etc.).
I'm trying to create a speaker recognition system that takes sound files from any movie, trains a neural network on their MFCCs (a sound feature), and then tells me which speaker is talking in another sound file.
This is what I did:
Created an MFCC vector for each speaker sample and put it in an array named X (a speaker can appear more than once)
Created an output number for each speaker
Created this model with tensorflow -
Dense Layer(512, 'relu')
Dropout (0.3)
Dense Layer(256, 'relu')
Dense Layer(128, 'relu')
Flattern
Dense Layer(length of outputs, 'relu')
Then I trained the model and finally checked my results, but unfortunately they are not good enough: only ~45% accuracy :(
I have added my full code and my database. Note that the database may contain some mistakes, for example a clip of Leonard's voice labelled as Sheldon, because it is based on the movie's srt file, which sometimes has mistakes.
My Full Code :
import python_speech_features
import scipy.io.wavfile as wav
import numpy as np
from os import listdir
import os
import shutil
from os.path import isfile, join
from random import shuffle
from matplotlib import pyplot
from tqdm import tqdm
import tensorflow as tf
from collections import Counter

win_len = 0.04  # in seconds
step = win_len / 2
nfft = 2048


def speaker_of(file_name):
    """Return the speaker name: the text before the first "_", cut at the first space."""
    return file_name.split("_")[0].split(" ")[0]


for TestNum in tqdm(range(5)):  # We check it several times
    X = []  # inputs
    Y = []  # outputs
    onlyfiles = [f for f in listdir("FinalAudios/") if isfile(join("FinalAudios/", f))]  # Files in dir

    # Keep only speakers with at least 60 files, in first-seen order
    file_counts = Counter(speaker_of(f) for f in onlyfiles)
    names = [name for name in file_counts if file_counts[name] >= 60]
    namesWithoutDuplicate = names
    print(names)  # print it

    # Output class for each speaker is its position in `names`
    label_of = {name: index for index, name in enumerate(names)}

    for f in onlyfiles:  # for all the files
        f_speaker = speaker_of(f)
        if f_speaker in label_of:
            fs, audio = wav.read(join("FinalAudios", f))  # read the file
            try:
                # compute MFCC
                mfcc_feat = python_speech_features.mfcc(audio, samplerate=fs, winlen=win_len,
                                                        winstep=step, nfft=nfft, appendEnergy=False)
            except IndexError:
                # best-effort: skip clips the MFCC extraction cannot handle
                continue
            # Create output + inputs: all frames of the clip flattened into one vector
            # NOTE(review): clips of different lengths give vectors of different
            # lengths — confirm the clips are equally long before stacking them
            X.append(np.asarray(mfcc_feat).ravel())
            Y.append(label_of[f_speaker])
        else:
            # Speakers with too few samples are moved out of the data folder
            os.makedirs("TooLowSamples", exist_ok=True)
            shutil.move(join("FinalAudios", f), join("TooLowSamples", f))

    # ------------------- RANDOMIZATION ------------------- #
    Z = list(zip(X, Y))
    shuffle(Z)  # shuffle X and Y together so the test set is random
    X, Y = zip(*Z)
    X = list(X)
    Y = list(Y)
    # ------------------- RANDOMIZATION ------------------- #

    # First 100 samples for test, last 100 of the remainder for validation
    x_test = np.asarray(X[:100])
    y_test = np.asarray(Y[:100])
    x_train = np.asarray(X[100:])
    y_train = np.asarray(Y[100:])
    x_val = x_train[-100:]
    y_val = y_train[-100:]
    x_train = x_train[:-100]
    y_train = y_train[:-100]

    # Add a trailing axis of size 1 for the model input
    x_train = x_train[..., np.newaxis]
    x_test = x_test[..., np.newaxis]
    x_val = x_val[..., np.newaxis]

    # -------------- OUR TENSOR FLOW NEURAL NETWORK MODEL -------------- #
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(len(names), activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    # -------------- OUR TENSOR FLOW NEURAL NETWORK MODEL -------------- #

    print("fitting")
    history = model.fit(x_train, y_train, epochs=4, validation_data=(x_val, y_val))
    print("testing")
    results = model.evaluate(x_test, y_test)
    print(results)
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    pyplot.show()
My data set - https://filebin.net/ajho6kgzx66xayyn
Note : I tried also convolution layers but it worked even worse
I'm using the tf.keras API in TensorFlow2. I have 100,000 images or so that are saved as TFRecords (128 images per record). Each record has an input image, target image, and frame index. I can't find a clean way to keep the frame index with the prediction.
Here is an example, except I build a dataset with NumPy arrays instead of reading from TFRecords:
import tensorflow as tf
from tensorflow import keras
import numpy as np
# build dummy tf.data.Dataset
# x: 10000 random float32 inputs; y: x plus random noise scaled by 0.1
x = np.random.random(10000).astype(np.float32)
y = x + np.random.random(10000).astype(np.float32) * 0.1
# idx: one frame index per sample, shuffled in place
idx = np.arange(10000, dtype=np.uint16)
np.random.shuffle(idx) # frames are random in my TFRecord files
ds = tf.data.Dataset.from_tensor_slices((x, y, idx))
# pretend ds returned from TFRecord
# The index is dropped here, so neither fit nor predict below ever sees it
ds = ds.map(lambda f0, f1, f2: (f0, f1)) # strip off idx
ds = ds.batch(32)
# build and train model
# Note: x is rebound from the NumPy inputs to the symbolic Keras input
x = keras.Input(shape=(1,))
y_hat = keras.layers.Dense(1)(x) # i.e. linear regression
model = keras.Model(x, y_hat)
model.compile('sgd', 'mse')
history = model.fit(ds, epochs=5)
# predict 1 batch
model.predict(ds, steps=1)
Short of reading through the dataset again to extract the indices (which is prone to error), is there a clean way to keep prediction correspondence with image index? In TF1.x it was straightforward. But I'd like to take advantage of clean Keras compile(), fit(), predict() API in TF2.
Ok, was thinking too hard, pretty easy actually. Just add index to dataset when you are making predictions, and pull out indices as you are iterating through batches:
import tensorflow as tf
from tensorflow import keras
import numpy as np
def build_dataset(mode):
    """
    Build the dummy dataset, batched in groups of 32.

    With mode == 'train' the elements are (x, y) pairs, shuffled with a buffer
    of 128; with any other mode the elements are (x, idx) pairs in their
    original order, so each prediction can be matched to its index.
    """
    # Fixed seed: every call generates the same x and y values
    np.random.seed(1)
    x = np.random.random(10000).astype(np.float32)
    y = x + np.random.random(10000).astype(np.float32) * 0.1
    idx = np.arange(10000, dtype=np.uint16)
    if mode == 'train':
        ds = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(128)
    else:
        ds = tf.data.Dataset.from_tensor_slices((x, idx))
    return ds.batch(32)
# build and train simple linear regression model
x_tf = keras.Input(shape=(1,))
yhat_tf = keras.layers.Dense(1)(x_tf)
model = keras.Model(x_tf, yhat_tf)
model.compile(optimizer='sgd', loss='mse')
history = model.fit(build_dataset('train'), epochs=5)

# predict 1 batch: each element of the 'predict' dataset carries its indices,
# so they are unpacked next to the inputs they belong to
ds = build_dataset('predict')
for batch in ds:
    x_tf, indices_tf = batch
    yhat_np = model.predict(x_tf)
    # only the first batch is needed for this example
    break