Convolution preprocess consuming more RAM Python - python

I am trying to implement this CNN model for stock price prediction: https://github.com/ZezhouLi/Convolutional-Networks-for-Stock-Predicting
However, I am running into a problem while preprocessing the data for it: the preprocessing step consumes a lot of RAM. (My system has 32 GB of RAM and a 256 GB SSD.)
Here is the file that consumes the RAM and eventually fails with a MemoryError:
import numpy as np
import matplotlib.pyplot as plt
import glob
import math
from PIL import Image
import statsmodels.api as sm
def r_squared(y_true, y_hat):
    """Return the coefficient of determination R^2 = 1 - SSR / SST.

    Args:
        y_true: sequence or array of observed values.
        y_hat: sequence or array of predicted values, same length as y_true.

    Returns:
        1 minus the ratio of the residual sum of squares to the total sum of
        squares around the mean of y_true. When y_true is constant, SST is 0
        and NumPy's division yields inf/nan (with a RuntimeWarning) rather
        than raising.
    """
    y_true = np.asarray(y_true, dtype=float)
    residuals = y_true - np.asarray(y_hat, dtype=float)
    # Vectorized sums replace the element-by-element Python loops.
    ssr = np.sum(residuals ** 2)
    sst = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ssr / sst
def data_process(data):
    """Standardize each row to zero mean / unit std and prepend a constant 1.

    Args:
        data: iterable of numeric sequences.

    Returns:
        A list with one list per input row: the leading element is the
        integer 1, followed by (value - row mean) / row std for every value.
    """
    def _standardize(row):
        mean = np.mean(row)
        std = np.std(row)
        return [1] + [(float(value) - mean) / std for value in row]

    return [_standardize(row) for row in data]
def get_pixel_values():
    """Load the pixel data of every PNG in the figures directory.

    Returns:
        A list with one entry per image, each entry being the list produced
        by ``Image.getdata()`` for that image. Entries are ordered by the
        numeric suffix of the file name.
    """
    import os
    import re

    file_name = r'\figures'

    def _figure_index(path):
        # plot_data names its files fig_<n>; glob returns them in arbitrary
        # (at best lexicographic: fig_1, fig_10, fig_100, ...) order, which
        # would misalign the pixel rows with the returns computed in
        # generation order. Sort on the numeric suffix instead.
        match = re.search(r'(\d+)\.png$', os.path.basename(path))
        return (int(match.group(1)) if match else -1, path)

    pixels = []
    for filename in sorted(glob.glob(file_name + r'\*.png'), key=_figure_index):
        # The context manager releases the file handle as soon as the pixel
        # data has been copied out, instead of keeping every image open.
        with Image.open(filename) as im:
            pixels.append(list(im.getdata()))
    return pixels
def find_returns(data):
    """Compute 5-step-ahead log returns for every group.

    For each group and each position ``idx`` from 30 up to ``len(group) - 5``
    (inclusive), the return is ``log(mean(group[idx + 4]) / mean(group[idx - 1]))``.

    Args:
        data: iterable of groups; each group is a sequence of numeric rows.

    Returns:
        A flat list of log returns, in group order then position order.
    """
    log_returns = []
    for group in data:
        for idx in range(30, len(group) - 4):
            price_now = np.mean(group[idx - 1])
            price_later = np.mean(group[idx + 4])
            log_returns.append(math.log(price_later / price_now))
    return log_returns
def convert_image():
    """Shrink every PNG in the figures directory and overwrite it as grayscale.

    Each image is reduced in place with ``thumbnail`` to fit within 54x32,
    converted to mode 'L' and saved back under the same file name.
    """
    max_size = (54, 32)
    pattern = r'\figures' + r'\*.png'
    for path in glob.glob(pattern):
        picture = Image.open(path)
        picture.thumbnail(max_size)
        picture.convert('L').save(path)
def plot_data(data):
    """Render one axis-free chart per 30-row window and save it as an image.

    For every group and every window end ``count`` from 30 to
    ``len(group) - 5``, the first column ("high") and second column ("low")
    of the 30 preceding rows are plotted (the last point of the window is
    dropped so 29 points match ``t``) and written to ``\\figures\\fig_<n>``.

    Args:
        data: iterable of groups, each a sequence of ``[high, low]`` rows.
    """
    t = np.arange(0, 29, 1)
    file_name_number = 0
    # A single figure is reused for every chart and cleared after each save.
    fig = plt.figure(frameon=False)
    try:
        for group in data:
            for count in range(30, len(group) - 4):
                window = group[count - 30:count]
                high = [item[0] for item in window]
                low = [item[1] for item in window]
                file_name = r'\fig_' + str(file_name_number)
                ax = plt.Axes(fig, [0., 0., 1., 1.])
                ax.set_axis_off()
                fig.add_axes(ax)
                ax.plot(t, high[0:-1], 'b', t, low[0:-1], 'g')
                fig.savefig(r'\figures' + file_name)
                fig.clf()
                file_name_number += 1
    finally:
        # pyplot keeps a reference to every figure it created until it is
        # closed explicitly, so release it even if saving fails.
        plt.close(fig)
    print('Created %d files!' % file_name_number)
def extract_useful_data(data):
    """Keep columns 2 and 3 of every row, converted to float.

    Args:
        data: iterable of groups; each group is a sequence of rows that can
            be indexed at positions 2 and 3.

    Returns:
        A list of groups, each a list of ``[float(row[2]), float(row[3])]``.
    """
    def _pair(row):
        raw = (row[2], row[3])
        return [float(value) for value in raw]

    return [[_pair(row) for row in group] for group in data]
def split_data(data):
    """Split every comma-separated line of every group into its fields.

    Args:
        data: iterable of groups; each group is a sequence of strings.

    Returns:
        A list of groups, each a list of field lists (``line.split(',')``).
    """
    return [[line.split(',') for line in group] for group in data]
def extract_data():
    """Read ``\\data.txt`` and chunk its lines into groups of 390.

    The first 8 lines of the file are skipped. The remaining lines are
    collected into consecutive groups of 390 lines each.

    Returns:
        A list of groups (lists of newline-stripped strings). The final
        group holds whatever lines were left over and may be shorter than
        390, or empty.
    """
    file_name = r'\data.txt'
    # The context manager closes the file even if reading raises.
    with open(file_name, 'r') as infile:
        lines = [line.strip('\n') for line in infile]
    lines = lines[8:]
    groups = []
    temp = []
    i = 0
    for item in lines:
        if i != 390:
            temp.append(item)
            i += 1
        else:
            # NOTE(review): the line that triggers the rollover (every 391st
            # line) is not added to any group. This matches the original
            # behavior; confirm it is a separator row and not real data.
            groups.append(temp)
            temp = []
            i = 0
    groups.append(temp)
    return groups
def main():
    """Run the pipeline: parse data, render charts, fit OLS, print in-sample R^2."""
    original_data = extract_data()
    splitted_data = split_data(original_data)
    useful_data = extract_useful_data(splitted_data)
    plot_data(useful_data)
    convert_image()
    returns = np.asarray(find_returns(useful_data))
    training_data = np.asarray(get_pixel_values())
    training_data = sm.add_constant(training_data, has_constant='add')
    # Only the first 4340 samples are used for the in-sample fit.
    results = sm.OLS(returns[0:4340], training_data[0:4340]).fit()
    y_in_sample = results.predict(training_data[0:4340])
    r2 = r_squared(returns[0:4340], y_in_sample)
    # print(...) with a single argument works on both Python 2 and Python 3;
    # the bare `print r2` statement is a SyntaxError on Python 3.
    print(r2)


if __name__ == "__main__":
    main()
I get a MemoryError, which occurs once the program has consumed all of the system's RAM. Please help me improve the program.

Related

'tensorflow_federated' has no attribute 'NamedTupleType'

I am following this code, https://github.com/BUAA-BDA/FedShapley/tree/master/TensorflowFL, and trying to run the file same_OR.py.
I also placed the input file "initial_model_parameters.txt" and the data folder "MNIST_data" in the same folder.
from __future__ import absolute_import, division, print_function
import tensorflow_federated as tff
import tensorflow.compat.v1 as tf
import numpy as np
import time
from scipy.special import comb, perm
import os
# tf.compat.v1.enable_v2_behavior()
# tf.compat.v1.enable_eager_execution()
# NUM_EXAMPLES_PER_USER = 1000
# Number of samples per batch when the training indices are sliced into batches.
BATCH_SIZE = 100
# Number of federated agents the training samples are partitioned across.
NUM_AGENT = 5
def get_data_for_digit(source, digit):
    """Batch all samples of one digit.

    Args:
        source: pair-like object; ``source[0]`` holds the images and
            ``source[1]`` the matching labels.
        digit: label value to select.

    Returns:
        A list of dicts, one per batch of at most BATCH_SIZE samples, with
        'x' (float32, flattened pixels divided by 255) and 'y' (int32 labels).
    """
    images, labels = source[0], source[1]
    matching = [idx for idx, label in enumerate(labels) if label == digit]
    batches = []
    for start in range(0, len(matching), BATCH_SIZE):
        chunk = matching[start:start + BATCH_SIZE]
        features = np.array([images[idx].flatten() / 255.0 for idx in chunk],
                            dtype=np.float32)
        targets = np.array([labels[idx] for idx in chunk], dtype=np.int32)
        batches.append({'x': features, 'y': targets})
    return batches
def get_data_for_digit_test(source, digit):
    """Return every sample of one digit as its own (unbatched) entry.

    Args:
        source: pair-like object; ``source[0]`` holds the images and
            ``source[1]`` the matching labels.
        digit: label value to select.

    Returns:
        A list of dicts, one per matching sample, with 'x' (float32,
        flattened pixels divided by 255) and 'y' (int32 scalar label).
    """
    images, labels = source[0], source[1]
    samples = []
    for idx, label in enumerate(labels):
        if label == digit:
            samples.append({
                'x': np.array(images[idx].flatten() / 255.0, dtype=np.float32),
                'y': np.array(labels[idx], dtype=np.int32),
            })
    return samples
def get_data_for_federated_agents(source, num):
    """Build the batched training data of agent number ``num``.

    For each digit 0-9 the first 5421 sample indices are kept and cut into
    NUM_AGENT equal slices; agent ``num`` receives slice ``num`` of every
    digit. The collected indices are then batched by BATCH_SIZE.

    Args:
        source: pair-like object; ``source[0]`` holds the images and
            ``source[1]`` the matching labels.
        num: zero-based agent index.

    Returns:
        A list of dicts with 'x' (float32, flattened pixels divided by 255)
        and 'y' (int32 labels), one per batch.
    """
    images, labels = source[0], source[1]
    per_digit = []
    for digit in range(0, 10):
        indices = [idx for idx, label in enumerate(labels) if label == digit]
        per_digit.append(indices[0:5421])
    agent_samples = []
    for indices in per_digit:
        share = len(indices) / NUM_AGENT
        lower = int(num * share)
        upper = int((num + 1) * share)
        agent_samples.extend(indices[pos] for pos in range(lower, upper))
    batches = []
    for start in range(0, len(agent_samples), BATCH_SIZE):
        chunk = agent_samples[start:start + BATCH_SIZE]
        features = np.array([images[idx].flatten() / 255.0 for idx in chunk],
                            dtype=np.float32)
        targets = np.array([labels[idx] for idx in chunk], dtype=np.int32)
        batches.append({'x': features, 'y': targets})
    return batches
# tff.NamedTupleType was renamed to tff.StructType in TFF 0.16.0; use
# whichever name the installed version provides so the script runs on both.
_STRUCT_TYPE = getattr(tff, 'StructType', None) or tff.NamedTupleType

# One batch: flattened 784-pixel images and their integer labels.
BATCH_TYPE = _STRUCT_TYPE([
    ('x', tff.TensorType(tf.float32, [None, 784])),
    ('y', tff.TensorType(tf.int32, [None]))])
# Linear softmax model: a 784x10 weight matrix and a 10-element bias.
MODEL_TYPE = _STRUCT_TYPE([
    ('weights', tff.TensorType(tf.float32, [784, 10])),
    ('bias', tff.TensorType(tf.float32, [10]))])
# The decorator's leading '@' had been turned into '#', which left this as a
# plain Python function instead of a TFF TensorFlow computation.
@tff.tf_computation(MODEL_TYPE, BATCH_TYPE)
def batch_loss(model, batch):
    """Return the mean cross-entropy loss of a softmax model on one batch.

    Args:
        model: structure with `weights` and `bias` (see MODEL_TYPE).
        batch: structure with features `x` and integer labels `y`
            (see BATCH_TYPE).
    """
    predicted_y = tf.nn.softmax(tf.matmul(batch.x, model.weights) + model.bias)
    return -tf.reduce_mean(tf.reduce_sum(
        tf.one_hot(batch.y, 10) * tf.log(predicted_y), axis=[1]))
# The decorator's leading '@' had been turned into '#'; restored so the
# function is registered as a TFF TensorFlow computation.
@tff.tf_computation(MODEL_TYPE, BATCH_TYPE, tf.float32)
def batch_train(initial_model, batch, learning_rate):
    """Run one gradient-descent step on `batch`, starting from `initial_model`.

    Args:
        initial_model: model structure (see MODEL_TYPE) to start from.
        batch: one batch of data (see BATCH_TYPE).
        learning_rate: float32 step size for the optimizer.

    Returns:
        The model variables after the single training step.
    """
    # Define a group of model variables and set them to `initial_model`.
    model_vars = tff.utils.create_variables('v', MODEL_TYPE)
    init_model = tff.utils.assign(model_vars, initial_model)
    # Perform one step of gradient descent using loss from `batch_loss`.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    with tf.control_dependencies([init_model]):
        train_model = optimizer.minimize(batch_loss(model_vars, batch))
    # Return the model vars after performing this gradient descent step.
    with tf.control_dependencies([train_model]):
        return tff.utils.identity(model_vars)
# A client's local dataset: a sequence of batches.
LOCAL_DATA_TYPE = tff.SequenceType(BATCH_TYPE)


# Both decorators below had lost their leading '@' (rendered as '#').
@tff.federated_computation(MODEL_TYPE, tf.float32, LOCAL_DATA_TYPE)
def local_train(initial_model, learning_rate, all_batches):
    """Train on every batch of one client in sequence.

    Args:
        initial_model: model structure (see MODEL_TYPE) to start from.
        learning_rate: float32 step size passed to each batch step.
        all_batches: the client's sequence of batches (LOCAL_DATA_TYPE).

    Returns:
        The model obtained by folding `batch_train` over all batches.
    """
    # Mapping function to apply to each batch.
    @tff.federated_computation(MODEL_TYPE, BATCH_TYPE)
    def batch_fn(model, batch):
        return batch_train(model, batch, learning_rate)

    return tff.sequence_reduce(all_batches, initial_model, batch_fn)
# The decorator's leading '@' had been turned into '#'; restored.
@tff.federated_computation(MODEL_TYPE, LOCAL_DATA_TYPE)
def local_eval(model, all_batches):
    """Return the sum of `batch_loss` over all batches of one client.

    Args:
        model: model structure (see MODEL_TYPE) to evaluate.
        all_batches: the client's sequence of batches (LOCAL_DATA_TYPE).
    """
    return tff.sequence_sum(
        tff.sequence_map(
            tff.federated_computation(lambda b: batch_loss(model, b), BATCH_TYPE),
            all_batches))
# The model as held on the server, and each client's local dataset.
SERVER_MODEL_TYPE = tff.FederatedType(MODEL_TYPE, tff.SERVER, all_equal=True)
CLIENT_DATA_TYPE = tff.FederatedType(LOCAL_DATA_TYPE, tff.CLIENTS)


# The decorator's leading '@' had been turned into '#'; restored.
@tff.federated_computation(SERVER_MODEL_TYPE, CLIENT_DATA_TYPE)
def federated_eval(model, data):
    """Return the mean over clients of each client's `local_eval` loss.

    Args:
        model: server-placed model (SERVER_MODEL_TYPE).
        data: client-placed datasets (CLIENT_DATA_TYPE).
    """
    return tff.federated_mean(
        tff.federated_map(local_eval, [tff.federated_broadcast(model), data]))
# A float scalar held on the server (used for the learning rate).
SERVER_FLOAT_TYPE = tff.FederatedType(tf.float32, tff.SERVER, all_equal=True)


# The decorator's leading '@' had been turned into '#', which also left its
# second line dangling as a syntax error; restored as one decorator.
@tff.federated_computation(
    SERVER_MODEL_TYPE, SERVER_FLOAT_TYPE, CLIENT_DATA_TYPE)
def federated_train(model, learning_rate, data):
    """Run `local_train` on every client and return the per-client models.

    Args:
        model: server-placed model (SERVER_MODEL_TYPE) to broadcast.
        learning_rate: server-placed float32 step size to broadcast.
        data: client-placed datasets (CLIENT_DATA_TYPE).

    Returns:
        The client-placed locally trained models. They are not averaged
        here; the result of `tff.federated_map` is returned as is.
    """
    return tff.federated_map(
        local_train,
        [tff.federated_broadcast(model),
         tff.federated_broadcast(learning_rate),
         data])
def readTestImagesFromFile(distr_same):
    """Parse the tab-separated test images stored next to this script.

    Each line of ``test_images1_.txt`` is one image; square brackets are
    stripped and empty fields are ignored.

    Args:
        distr_same: accepted for interface compatibility.
            NOTE(review): both branches of the original opened the same
            file, so this flag has no effect - confirm whether a second
            file name was intended.

    Returns:
        A NumPy array with one row of floats per line.
    """
    path = os.path.join(os.path.dirname(__file__), "test_images1_.txt")
    ret = []
    # The context manager closes the file, which the original never did.
    with open(path, encoding="utf-8") as f:
        for line in f:
            fields = line.replace("[", "").replace("]", "").replace("\n", "").split("\t")
            ret.append([float(i) for i in fields if i != ""])
    return np.asarray(ret)
def readTestLabelsFromFile(distr_same):
    """Parse the space-separated test labels stored next to this script.

    Each line of ``test_labels_.txt`` is one label row; square brackets are
    stripped and empty fields are ignored.

    Args:
        distr_same: accepted for interface compatibility.
            NOTE(review): both branches of the original opened the same
            file, so this flag has no effect - confirm whether a second
            file name was intended.

    Returns:
        A NumPy array with one row of floats per line.
    """
    path = os.path.join(os.path.dirname(__file__), "test_labels_.txt")
    ret = []
    # The context manager closes the file, which the original never did.
    with open(path, encoding="utf-8") as f:
        for line in f:
            fields = line.replace("[", "").replace("]", "").replace("\n", "").split(" ")
            ret.append([float(i) for i in fields if i != ""])
    return np.asarray(ret)
def getParmsAndLearningRate(agent_no):
    """Load the per-round weights, biases and learning rates of one agent.

    Reads ``weights_<agent_no>.txt`` and ``bias_<agent_no>.txt`` from the
    directory of this script. Each file is a series of records terminated
    by a ``***<learning rate>***`` line and a dashed separator line.

    Args:
        agent_no: agent index used in the file names.

    Returns:
        A dict with NumPy arrays under 'weights' (one 784-row matrix per
        record), 'bias' (one row per record) and 'learning_rate'.
    """
    base_dir = os.path.dirname(__file__)
    separator = "***\n--------------------------------------------------"

    # Context managers close the files even if parsing raises.
    with open(os.path.join(base_dir, "weights_" + str(agent_no) + ".txt")) as f:
        g_ = f.read().split(separator)
    parm_local = []
    learning_rate_list = []
    # The text after the last separator is not a record, hence len - 1.
    for j in range(len(g_) - 1):
        line = g_[j].split("\n")
        # Every record but the first starts with the newline that followed
        # the previous separator, so its content is shifted by one line.
        first = 0 if j == 0 else 1
        weights_line = line[first:first + 784]
        learning_rate_list.append(
            float(line[first + 784].replace("*", "").replace("\n", "")))
        # Each row ends with a trailing tab, so the last split field is dropped.
        parm_local.append(
            [[float(i) for i in l.split("\t")[:-1]] for l in weights_line])

    with open(os.path.join(base_dir, "bias_" + str(agent_no) + ".txt")) as f:
        g_ = f.read().split(separator)
    bias_local = []
    for j in range(len(g_) - 1):
        line = g_[j].split("\n")
        weights_line = line[0] if j == 0 else line[1]
        bias_local.append([float(i) for i in weights_line.split("\t")[:-1]])

    ret = {
        'weights': np.asarray(parm_local),
        'bias': np.asarray(bias_local),
        'learning_rate': np.asarray(learning_rate_list)
    }
    return ret
def train_with_gradient_and_valuation(agent_list, grad, bi, lr, distr_type):
    """Rebuild a model from the recorded gradients of a coalition and score it.

    Starting from the parameters in ``initial_model_parameters.txt``, each
    round applies the average gradient of the agents in `agent_list`, then
    the resulting model is evaluated on the test set.

    Args:
        agent_list: indices of the agents in the coalition (may be empty,
            in which case the initial model is evaluated unchanged).
        grad: grad[agent][round] weight gradients.
        bi: bi[agent][round] bias gradients.
        lr: lr[agent][round] learning rates; only agent 0's are used.
        distr_type: accepted for interface compatibility; not read here.

    Returns:
        The test accuracy as a NumPy scalar.
    """
    # The context manager closes the parameter file even if parsing raises.
    with open(os.path.join(os.path.dirname(__file__), "initial_model_parameters.txt"), "r") as f_ini_p:
        para_lines = f_ini_p.readlines()
    w_paras = [float(i) for i in para_lines[0].split("\t")]
    b_paras = [float(i) for i in para_lines[1].split("\t")]
    w_initial_g = np.asarray(w_paras, dtype=np.float32).reshape([784, 10])
    b_initial_g = np.asarray(b_paras, dtype=np.float32).reshape([10])
    model_g = {
        'weights': w_initial_g,
        'bias': b_initial_g
    }
    for i in range(len(grad[0])):
        # i -> training round index
        gradient_w = np.zeros([784, 10], dtype=np.float32)
        gradient_b = np.zeros([10], dtype=np.float32)
        for j in agent_list:
            gradient_w = np.add(np.multiply(grad[j][i], 1/len(agent_list)), gradient_w)
            gradient_b = np.add(np.multiply(bi[j][i], 1/len(agent_list)), gradient_b)
        model_g['weights'] = np.subtract(model_g['weights'], np.multiply(lr[0][i], gradient_w))
        model_g['bias'] = np.subtract(model_g['bias'], np.multiply(lr[0][i], gradient_b))
    test_images = readTestImagesFromFile(False)
    test_labels_onehot = readTestLabelsFromFile(False)
    m = np.dot(test_images, np.asarray(model_g['weights']))
    test_result = m + np.asarray(model_g['bias'])
    y = tf.nn.softmax(test_result)
    # tf.argmax on both sides; tf.arg_max is a deprecated alias of it.
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(test_labels_onehot, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy.numpy()
def remove_list_indexed(removed_ele, original_l, ll):
    """Find the position in `ll` of `original_l` with `removed_ele` taken out.

    Lists are compared as sets, so element order and duplicates are ignored.

    Args:
        removed_ele: element to remove from `original_l`.
        original_l: iterable of hashable elements.
        ll: list of candidate lists to search.

    Returns:
        The index of the first entry of `ll` whose element set equals the
        reduced set, or -1 if there is none.
    """
    # Build the reduced set with a filter. The original removed items from
    # the list while iterating over it, which skips the element after each
    # removal and could leave a duplicate of removed_ele behind.
    remaining = set(i for i in original_l if i != removed_ele)
    for index, candidate in enumerate(ll):
        if set(candidate) == remaining:
            return index
    return -1
def shapley_list_indexed(original_l, ll):
    """Return the index of the first entry of `ll` equal to `original_l` as a set.

    Args:
        original_l: iterable of hashable elements to look for.
        ll: list of candidate lists.

    Returns:
        The position of the first set-wise match, or -1 when none matches.
    """
    matches = (pos for pos, candidate in enumerate(ll)
               if set(candidate) == set(original_l))
    return next(matches, -1)
def PowerSetsBinary(items):
    """Enumerate every subset of `items` by counting through bit masks.

    Args:
        items: indexable sequence.

    Returns:
        A list of 2**len(items) lists; subset number `mask` contains
        ``items[bit]`` for every bit set in `mask`, in index order.
    """
    count = len(items)
    return [
        [items[bit] for bit in range(count) if (mask >> bit) & 1]
        for mask in range(2 ** count)
    ]
# Script entry point: trains the federated model for 50 rounds while logging
# every agent's local model, rebuilds per-round gradients from those logs,
# scores every coalition of agents and prints a Shapley-style value per agent.
if __name__ == "__main__":
start_time = time.time()
#data_num = np.asarray([5923,6742,5958,6131,5842])
#agents_weights = np.divide(data_num, data_num.sum())
# Create (or truncate to empty) the per-agent weight and bias log files.
for index in range(NUM_AGENT):
f = open(os.path.join(os.path.dirname(__file__), "weights_"+str(index)+".txt"), "w")
f.close()
f = open(os.path.join(os.path.dirname(__file__), "bias_" + str(index) + ".txt"), "w")
f.close()
mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()
DISTRIBUTION_TYPE = "SAME"
federated_train_data_divide = None
federated_train_data = None
# Only the "SAME" distribution is handled; any other value leaves the data as None.
if DISTRIBUTION_TYPE == "SAME":
federated_train_data_divide = [get_data_for_federated_agents(mnist_train, d) for d in range(NUM_AGENT)]
federated_train_data = federated_train_data_divide
# Load the initial model: line 0 holds the weights, line 1 the bias, tab-separated.
f_ini_p = open(os.path.join(os.path.dirname(__file__), "initial_model_parameters.txt"), "r")
para_lines = f_ini_p.readlines()
w_paras = para_lines[0].split("\t")
w_paras = [float(i) for i in w_paras]
b_paras = para_lines[1].split("\t")
b_paras = [float(i) for i in b_paras]
w_initial = np.asarray(w_paras, dtype=np.float32).reshape([784, 10])
b_initial = np.asarray(b_paras, dtype=np.float32).reshape([10])
f_ini_p.close()
initial_model = {
'weights': w_initial,
'bias': b_initial
}
model = initial_model
learning_rate = 0.1
# Training rounds: local training, logging, averaging, learning-rate decay.
for round_num in range(50):
local_models = federated_train(model, learning_rate, federated_train_data)
print("learning rate: ", learning_rate)
#print(local_models[0][0])# weights matrix of agent 0
#print(local_models[0][1])# bias matrix of agent 0
#print(len(local_models))
# Append each agent's local model to its log files: tab-terminated rows,
# then a ***<learning rate>*** line and a 50-dash separator line.
# getParmsAndLearningRate parses exactly this layout.
for local_index in range(len(local_models)):
f = open(os.path.join(os.path.dirname(__file__), "weights_"+str(local_index)+".txt"),"a",encoding="utf-8")
for i in local_models[local_index][0]:
line = ""
arr = list(i)
for j in arr:
line += (str(j)+"\t")
print(line, file=f)
print("***"+str(learning_rate)+"***",file=f)
print("-"*50,file=f)
f.close()
f = open(os.path.join(os.path.dirname(__file__), "bias_" + str(local_index) + ".txt"), "a", encoding="utf-8")
line = ""
for i in local_models[local_index][1]:
line += (str(i) + "\t")
print(line, file=f)
print("***" + str(learning_rate) + "***",file=f)
print("-"*50,file=f)
f.close()
# New global model: unweighted average (factor 1/NUM_AGENT) of the local models.
m_w = np.zeros([784, 10], dtype=np.float32)
m_b = np.zeros([10], dtype=np.float32)
for local_model_index in range(len(local_models)):
m_w = np.add(np.multiply(local_models[local_model_index][0], 1/NUM_AGENT), m_w)
m_b = np.add(np.multiply(local_models[local_model_index][1], 1/NUM_AGENT), m_b)
model = {
'weights': m_w,
'bias': m_b
}
# The learning rate decays by a factor of 0.9 every round.
learning_rate = learning_rate * 0.9
loss = federated_eval(model, federated_train_data)
print('round {}, loss={}'.format(round_num, loss))
print(time.time()-start_time)
# Rebuild per-agent, per-round gradients from the logged models:
# gradient = (previous logged model - current logged model) / learning rate.
gradient_weights = []
gradient_biases = []
gradient_lrs = []
for ij in range(NUM_AGENT):
model_ = getParmsAndLearningRate(ij)
gradient_weights_local = []
gradient_biases_local = []
learning_rate_local = []
for i in range(len(model_['learning_rate'])):
# For the first record the reference is the initial model; afterwards it
# is the agent's own previous logged record (index i - 1).
if i == 0:
gradient_weight = np.divide(np.subtract(initial_model['weights'], model_['weights'][i]),
model_['learning_rate'][i])
gradient_bias = np.divide(np.subtract(initial_model['bias'], model_['bias'][i]),
model_['learning_rate'][i])
else:
gradient_weight = np.divide(np.subtract(model_['weights'][i - 1], model_['weights'][i]),
model_['learning_rate'][i])
gradient_bias = np.divide(np.subtract(model_['bias'][i - 1], model_['bias'][i]),
model_['learning_rate'][i])
gradient_weights_local.append(gradient_weight)
gradient_biases_local.append(gradient_bias)
learning_rate_local.append(model_['learning_rate'][i])
gradient_weights.append(gradient_weights_local)
gradient_biases.append(gradient_biases_local)
gradient_lrs.append(learning_rate_local)
# Score every subset of agents (including the empty one) on the test set.
all_sets = PowerSetsBinary([i for i in range(NUM_AGENT)])
group_shapley_value = []
for s in all_sets:
group_shapley_value.append(
train_with_gradient_and_valuation(s, gradient_weights, gradient_biases, gradient_lrs, DISTRIBUTION_TYPE))
print(str(s)+"\t"+str(group_shapley_value[len(group_shapley_value)-1]))
# Per agent: sum the marginal contribution v(j) - v(j without the agent)
# over every coalition j containing it, each divided by
# C(NUM_AGENT - 1, size of j without the agent).
# NOTE(review): no additional 1/NUM_AGENT factor is applied to the sum here.
agent_shapley = []
for index in range(NUM_AGENT):
shapley = 0.0
for j in all_sets:
if index in j:
remove_list_index = remove_list_indexed(index, j, all_sets)
if remove_list_index != -1:
shapley += (group_shapley_value[shapley_list_indexed(j, all_sets)] - group_shapley_value[
remove_list_index]) / (comb(NUM_AGENT - 1, len(all_sets[remove_list_index])))
agent_shapley.append(shapley)
for ag_s in agent_shapley:
print(ag_s)
print("end_time", time.time()-start_time)
I installed TensorFlow Federated with this command:
pip install --upgrade tensorflow_federated
This line is also underlined in red in my editor:
import tensorflow.compat.v1 as tf
When I tried to run the script, I got this error:
File "same_OR.py", line 94, in
BATCH_TYPE = tff.NamedTupleType([ AttributeError: module 'tensorflow_federated' has no attribute 'NamedTupleType'
Where is the problem? Can anyone help?
tff.NamedTupleType was renamed to tff.StructType in TFF version 0.16.0 (release notes).
Two options:
Install a pre-0.16.0 version of TFF: this should be doable with pip install tensorflow_federated==0.15.0 (note the double equals sign that pip requires for pinning a version).
Update the code: the error should go away after replacing the tff.NamedTupleType with tff.StructType in the snippet:
# Snippet to update (shown as in the question, with the pre-0.16.0 API name):
# on TFF >= 0.16.0, replace both tff.NamedTupleType calls with tff.StructType.
BATCH_TYPE = tff.NamedTupleType([
('x', tff.TensorType(tf.float32, [None, 784])),
('y', tff.TensorType(tf.int32, [None]))])
MODEL_TYPE = tff.NamedTupleType([
('weights', tff.TensorType(tf.float32, [784, 10])),
('bias', tff.TensorType(tf.float32, [10]))])

How to fix broken data in feature extraction/pre-processing in speech recognition?

I am very new to machine learning. I stumbled on this source code on GitHub, which ships without a dataset, so I decided to use my own. The code recognizes speakers with MFCC features and GMM-UBM models. But when I try to run it, I get this error: "ValueError: Found array with 1 sample(s) (shape=(1, 13)) while a minimum of 2 is required". It seems that when the code tries to fit the GMM on the 68th recording, the MFCC array for that recording has the wrong shape (only a single frame). I assume something is going wrong in the feature extraction step.
Please help me! Thank you very much.
Here's the code:
import python_speech_features as psf
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib
from scipy.io import wavfile
from functools import reduce
import numpy as np
from os import listdir
from os.path import isfile, join
import os
import re
DATA_PATH = 'dataCoba'
# Make a list of speakers from the DATA_PATH folder. The format for the files in the folder is
# name_1.wav for training and name_2.wav for testing
substring = "_2"
onlyfiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]
onlyfiles.sort()
onlyones = []
for filename in onlyfiles:
    # Raw string with an escaped dot: the original pattern used an
    # unescaped '.', which matches any character before "wav".
    dups = re.search(r'\w+_2\.wav', filename)
    if dups is None:
        # Every file that is not a *_2.wav test file contributes a speaker
        # name: the part of the file name before the first underscore.
        onlyones.append(filename.split('_')[0])
print(onlyones)
SPEAKERS = onlyones
TOTAL_SPEAKERS = len(SPEAKERS)
MODEL_SPEAKERS = len(SPEAKERS)
print(len(SPEAKERS))
class SpeakerRecognition:
    # One GMM and one UBM are created per speaker. The GMM is fitted on that
    # speaker's own frames and the UBM on the frames of all other speakers;
    # the difference of their scores is used to verify the speaker.
    def setGMMUBM(self, no_components):
        """Create fresh, unfitted GMM and UBM models for every speaker.

        Args:
            no_components: number of mixture components of every model.
        """
        def _new_mixture():
            return GaussianMixture(n_components=no_components, covariance_type='diag')

        self.GMM = [_new_mixture() for _ in range(MODEL_SPEAKERS)]
        self.UBM = [_new_mixture() for _ in range(MODEL_SPEAKERS)]
# Load the training (*_1.wav) and testing (*_2.wav) recordings of every
# speaker from DATA_PATH and extract their MFCC features.
def load_data(self):
    """Read all recordings, compute MFCCs and build the pooled frame lists.

    Fills `spk`/`spk_mfcc` (training) and `p_spk`/`p_spk_mfcc` (testing),
    then flattens the per-speaker frames into `total_mfcc`/`p_total_mfcc`
    while recording each speaker's start/end offsets and frame labels.
    """
    # training
    self.spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav','')) + '_1.wav') for i in SPEAKERS]
    self.spk_mfcc = [psf.mfcc(self.spk[i][1], self.spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
    # testing
    self.p_spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav','')) + '_2.wav') for i in SPEAKERS]
    self.p_spk_mfcc = [psf.mfcc(self.p_spk[i][1], self.p_spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
    print(self.spk_mfcc)
    for i in range(TOTAL_SPEAKERS):
        self.spk_train_size.append(len(self.spk_mfcc[i]))
        self.spk_start.append(len(self.total_mfcc))
        print("Speaker Number(train) = ",i)
        print ("self.spk_mfcc[i] = ", len(self.spk_mfcc[i]))
        for mfcc in self.spk_mfcc[i]:
            self.total_mfcc.append(mfcc)
            self.speaker_label.append(i)
        self.spk_end.append(len(self.total_mfcc))
        print("self.total_mfcc = ", len(self.total_mfcc))
        print("\n")
    for i in range(TOTAL_SPEAKERS):
        self.spk_test_size.append(len(self.p_spk_mfcc[i]))
        # Test offsets belong in p_spk_start; the original appended them to
        # spk_start, polluting the training offsets and leaving p_spk_start
        # empty while p_spk_end was filled.
        self.p_spk_start.append(len(self.p_total_mfcc))
        print("Speaker Num(test) = ",i)
        print("self.p_spk_mfcc = ",len(self.p_spk_mfcc[i]))
        # Report the shape of the test features (the original printed the
        # training array here, hiding malformed test recordings).
        print("MFCC Shape = ",self.p_spk_mfcc[i].shape)
        for mfcc in self.p_spk_mfcc[i]:
            self.p_total_mfcc.append(mfcc)
            self.p_speaker_label.append(i)
        self.p_spk_end.append(len(self.p_total_mfcc))
        print("self.p_total_mfcc = ", len(self.p_total_mfcc))
        print("\n")
# Gaussian Mixture Model is made of a number of Gaussian distribution components.
# To model data, a suitable number of Gaussian components has to be selected.
# There is no method for finding this. It is done by trial and error. This runs
# the program for different values of component and records accuracy for each one
[![This is the error when i run the code][1]][1]
def find_best_params(self):
    """Try 100..255 mixture components and return the most accurate count.

    For every candidate the models are rebuilt, fitted and evaluated; the
    second value returned by `predict` is used as the accuracy.

    Returns:
        The first component count that reached the highest accuracy, or 1
        if no candidate scored above 0.
    """
    best_no_components, maxacc = 1, 0
    for n_components in range(100, 256):
        self.setGMMUBM(n_components)
        self.fit_model()
        _confusion, acc, _spk_acc = self.predict()
        print("Accuracy for n = {} is {}".format(n_components, acc))
        if acc > maxacc:
            maxacc, best_no_components = acc, n_components
    return best_no_components
# Fit the GMM and UBM models with the training data.
# fit takes N data points x data dimension
def fit_model(self):
    """Fit and persist one GMM/UBM pair per speaker.

    The GMM of speaker `idx` is fitted on that speaker's MFCC frames; the
    UBM is fitted on the pooled frames of every other speaker (everything
    in `total_mfcc` outside that speaker's start/end range). Both models
    are dumped to 'dumps/new/'.
    """
    for idx in range(MODEL_SPEAKERS):
        print("Fit start for {}".format(idx))
        speaker_frames = self.spk_mfcc[idx]
        self.GMM[idx].fit(speaker_frames)
        print(speaker_frames.shape)
        before = self.total_mfcc[:self.spk_start[idx]]
        after = self.total_mfcc[self.spk_end[idx]:]
        self.UBM[idx].fit(before + after)
        print("Fit end for {}".format(idx))
        suffix = str(idx) + '.pkl'
        joblib.dump(self.UBM[idx], 'dumps/new/ubm' + suffix)
        joblib.dump(self.GMM[idx], 'dumps/new/gmm' + suffix)
def model(self, no_components = 244):
    """Build fresh GMM/UBM models with `no_components` components and fit them."""
    self.setGMMUBM(no_components)
    self.fit_model()
# Restore previously dumped GMM/UBM models instead of fitting them again.
def load_model(self):
    """Append the pickled GMM and UBM of every speaker from 'dumps/new/'."""
    for idx in range(0, MODEL_SPEAKERS):
        suffix = str(idx) + '.pkl'
        self.GMM.append(joblib.load('dumps/new/gmm' + suffix))
        self.UBM.append(joblib.load('dumps/new/ubm' + suffix))
def predict(self):
    """Score every test recording against every speaker model.

    For test speaker i and model j, a frame counts as a hit when its GMM
    log-likelihood exceeds its UBM log-likelihood; hits are accumulated in
    ``confusion[i][j]``.

    Returns:
        A tuple ``(confusion, avg_accuracy, spk_accuracy)`` where
        `avg_accuracy` is the share of hits on the diagonal and
        `spk_accuracy` is the fraction of speakers whose best-scoring model
        is their own. Both are 0 when there is nothing to divide by.
    """
    confusion = [[0 for y in range(MODEL_SPEAKERS)] for x in range(TOTAL_SPEAKERS)]
    for i in range(TOTAL_SPEAKERS):
        for j in range(MODEL_SPEAKERS):
            x = self.GMM[j].score_samples(self.p_spk_mfcc[i]) - self.UBM[j].score_samples(self.p_spk_mfcc[i])
            confusion[i][j] += sum(1 for score in x if score > 0)
    diag_sum = sum(confusion[i][i] for i in range(MODEL_SPEAKERS))
    remain_sum = sum(confusion[i][j]
                     for i in range(MODEL_SPEAKERS)
                     for j in range(MODEL_SPEAKERS)
                     if i != j)
    spk_accuracy = 0
    for i in range(MODEL_SPEAKERS):
        best_guess, _ = max(enumerate(confusion[i]), key=lambda p: p[1])
        print("For Accent {}, best guess is {}".format(SPEAKERS[i], SPEAKERS[best_guess]))
        if i == best_guess:
            spk_accuracy += 1
    # Guard both divisions: with no speakers, or when no frame scores above
    # zero, the original raised ZeroDivisionError.
    if MODEL_SPEAKERS:
        spk_accuracy /= MODEL_SPEAKERS
    total_hits = remain_sum + diag_sum
    avg_accuracy = diag_sum / total_hits if total_hits else 0
    return confusion, avg_accuracy, spk_accuracy
def __init__(self):
    """Create empty containers, then load the recordings and normalise them."""
    self.test_spk, self.test_mfcc = [], []

    # Raw recordings and their MFCC features: training (spk) and testing (p_spk).
    self.spk, self.spk_mfcc = [], []
    self.p_spk, self.p_spk_mfcc = [], []

    # Pooled training frames of all speakers, with the speaker label of
    # each frame and the number of training frames per speaker.
    self.total_mfcc, self.speaker_label = [], []
    self.spk_train_size = []

    # Same bookkeeping for the test frames.
    self.p_total_mfcc, self.p_speaker_label = [], []
    self.spk_test_size = []

    # Recordings differ in length, so the start/end offsets of every
    # speaker's frames inside the pooled lists are stored separately.
    self.spk_start, self.spk_end = [], []
    self.p_spk_start, self.p_spk_end = [], []

    self.GMM, self.UBM = [], []

    self.load_data()
    self.cepstral_mean_subtraction()
# Cepstral Mean Subtraction (Feature Normalization step)
def cepstral_mean_subtraction(self):
    """Subtract each recording's mean feature vector from all of its frames.

    Applied to every entry of `spk_mfcc` and `p_spk_mfcc`. NumPy arrays are
    modified in place, so row views taken earlier see the normalised values
    too. Entries without frames are left untouched (the original raised on
    them).
    """
    for features in (self.spk_mfcc, self.p_spk_mfcc):
        for idx, speaker_mfcc in enumerate(features):
            # np.asarray returns the same object for ndarray input, so the
            # subtraction below happens in place.
            frames = np.asarray(speaker_mfcc)
            if len(frames) == 0:
                continue
            # One vectorized subtraction replaces the per-coefficient loops.
            # Assumes float features - TODO confirm for non-MFCC inputs.
            frames -= frames.mean(axis=0)
            features[idx] = frames
# TBD: ten-fold validation
def ten_fold():
    """Placeholder for ten-fold cross-validation; no folds are evaluated yet.

    Prints the boundaries of ten folds and the average accuracy.

    Returns:
        The average accuracy, which is always 0.0 until the per-fold
        evaluation is implemented.
    """
    # Placeholder fold width so the function can run at all: the original
    # left fold_size undefined (its assignment, 0.1 * self.n, was commented
    # out) and raised NameError on the first print.
    fold_size = 0.0
    fold_offset = 0.0
    average_accuracy = 0
    for i in range(0, 10):
        print("Fold start is {} and fold end is {} ".format( fold_offset, fold_offset + fold_size))
        # TODO: evaluate the fold [fold_offset, fold_offset + fold_size),
        # add its accuracy to average_accuracy and advance fold_offset.
    average_accuracy /= 10.0
    print("Average accuracy " + str(100 * average_accuracy))
    return average_accuracy
# Final result is a confusion matrix which represents the accuracy of the fit of the model
# Script entry point: load the data, fit 13-component models and print the evaluation.
if __name__ == '__main__':
# Constructing the object already loads the recordings and normalises the features.
SR = SpeakerRecognition()
#SR.load_model()
SR.setGMMUBM(no_components=13)
#SR.find_best_params()
SR.fit_model()
# predict returns (confusion matrix, per-frame accuracy, per-speaker accuracy).
confusion, mfcc_accuracy, spk_accuracy = SR.predict()
print("Confusion Matrix")
print(np.matrix(confusion))
print("Accuracy in predicting speakers : {}".format(spk_accuracy))
print("Accuracy in testing for MFCC : {}".format(mfcc_accuracy))

Generate synthetic time series data from existing sample data

Are there any good libraries or tools in Python for generating synthetic time series data from existing sample data? For example, I have sales data from January to June and would like to generate synthetic time series samples for July to December (keeping time series characteristics such as trend and seasonality intact).
Leaving the question of the quality of such data aside, here is a simple approach: use a Gaussian distribution to generate synthetic data based on a sample. Below is the critical part.
import numpy as np
# Placeholder: x must already hold the original sample as a NumPy array.
x # original sample np.array of features
# Mean and standard deviation taken along axis 1 of the sample.
feature_means = np.mean(x, axis=1)
feature_std = np.std(x, axis=1)
# Draw one normally distributed value per (mean, std) pair.
random_normal_feature_values = np.random.normal(feature_means, feature_std)
Here is a fully functioning code I used,
# Generate synthetic rows that resemble `sample_dataset`: the data is cut
# into windows, and new values are drawn from a normal distribution built
# from the windows' means and standard deviations.
#   sample_dataset  - DataFrame with the original sample
#   window_mean/std - parameters of the normal draw for the window length,
#                     used only when fixed_window is None
#   fixed_window    - fixed window length (overrides the random draw)
#   variance_range  - divisor applied to the standard deviation
#   sythesize_ratio - number of passes, and the factor in the stop condition
#   forced_reverse  - not read anywhere in this function
# Returns a DataFrame with the sample's columns plus "synthesis_seq" (the
# pass number that produced each row).
# NOTE(review): relies on the names `pd` and `configuration`, which this
# snippet does not define - they must exist at module level.
def generate_synthetic_data(sample_dataset, window_mean, window_std, fixed_window=None, variance_range =1 , sythesize_ratio = 2, forced_reverse = False):
synthetic_data = pd.DataFrame(columns=sample_dataset.columns)
synthetic_data.insert(len(sample_dataset.columns), "synthesis_seq", [], True)
for k in range(sythesize_ratio):
# Stop once enough synthetic rows have been produced.
if len(synthetic_data) >= len(sample_dataset) * sythesize_ratio:
break;
#this loop generates a set that resembles the entire dataset
country_synthetic = pd.DataFrame(columns=synthetic_data.columns)
if fixed_window != None:
input_sequence_len = fixed_window
else:
input_sequence_len = int(np.random.normal(window_mean, window_std))
#population data change
country_data_i = sample_dataset
# Skip this pass when the sample is shorter than one window.
if len(country_data_i) < input_sequence_len :
continue
feature_length = configuration['feature_length'] #number of features to be randomized
# Keep only the first feature_length columns, as a (rows, features) array.
country_data_array = country_data_i.to_numpy()
country_data_array = country_data_array.T[:feature_length]
country_data_array = country_data_array.reshape(feature_length,len(country_data_i))
x = country_data_array[:feature_length].T
# NOTE(review): `reversed` shadows the builtin of the same name. With
# probability about 1/2 the rows are processed in reverse order.
reversed = np.random.normal(0,1)>0
if reversed:
x = x[::-1]
sets =0
x_list = []
dict_x = dict()
# For every start offset i, cut the rows from i onward into whole windows
# of input_sequence_len rows and store each window under its start row index.
for i in range(input_sequence_len):
array_len = ((len(x) -i) - ((len(x)-i)%input_sequence_len))+i
if array_len <= 0:
continue
sets = int( array_len/ input_sequence_len)
if sets <= 0:
continue
x_temp = x[i:array_len].T.reshape(sets,feature_length,input_sequence_len)
uniq_keys = np.array([i+(input_sequence_len*k) for k in range(sets)])
x_temp = x_temp.reshape(feature_length,sets,input_sequence_len)
arrays_split = np.hsplit(x_temp,sets)
dict_x.update(dict(zip(uniq_keys, arrays_split)))
# Windows ordered by start index; mean/std are then taken along axis 1.
temp_x_list = [dict_x[i].T for i in sorted(dict_x.keys())]
temp_x_list = np.array(temp_x_list).squeeze()
feature_means = np.mean(temp_x_list, axis=1)
feature_std = np.std(temp_x_list, axis=1) /variance_range
# Draw the synthetic values, round them to integers and clamp negatives to 0.
random_normal_feature_values = np.random.normal(feature_means, feature_std).T
random_normal_feature_values = np.round(random_normal_feature_values,0)
random_normal_feature_values[random_normal_feature_values < 0] = 0
# NOTE(review): with the indentation lost it is unclear whether the second
# transpose below belongs inside this `if` - confirm against the original.
if reversed:
random_normal_feature_values = random_normal_feature_values.T[::-1]
random_normal_feature_values = random_normal_feature_values.T
for i in range(len(random_normal_feature_values)):
country_synthetic[country_synthetic.columns[i]] = random_normal_feature_values[i]
country_synthetic['synthesis_seq'] = k
# NOTE(review): DataFrame.append was removed in pandas 2.0; on newer
# pandas this line needs pd.concat([...], ignore_index=True) instead.
synthetic_data = synthetic_data.append(country_synthetic, ignore_index=True)
return synthetic_data
# Driver: generate one synthetic dataset per iteration (a single iteration here).
# NOTE(review): `source_path`, `original_data` and `synthetic_data` are not
# defined in this snippet; `synthetic_data` is presumably a list created
# beforehand - confirm.
for i in range(1):
directory_name = '/synthetic_'+str(i)
mypath = source_path+ '/cleaned'+directory_name
# Create the output directory on first use.
if os.path.exists(mypath) == False:
os.mkdir(mypath)
# Fixed window of 2 rows; the standard deviation is divided by 10**i.
data = generate_synthetic_data(original_data, window_mean = 0, window_std= 0, fixed_window=2 ,variance_range = 10**i, sythesize_ratio = 1)
synthetic_data.append(data)
#data.to_csv(mypath+'/synthetic_'+str(i)+'_dt31_05_.csv', index=False )
print('synth step : ', i, ' len : ', len(synthetic_data))
Good luck!

Handwriting neural network weights don't change

from struct import unpack
import gzip
import numpy
from numpy import *
import matplotlib.pyplot as plt
learningRate = 0.1
def get_labeled_data(imagefile, labelfile):
    """Read gzip-compressed IDX image and label files.

    Args:
        imagefile: path of the gzip file with the images.
        labelfile: path of the gzip file with the labels.

    Returns:
        A tuple ``(x, y)``: `x` is a float32 array of shape
        (N, rows, cols) holding the pixel values and `y` a uint8 array of
        shape (N, 1) holding the labels.

    Raises:
        Exception: if the image count and label count in the headers differ.
        ValueError: if a file holds fewer bytes than its header announces.
    """
    # Context managers close both gzip streams, which the original never did.
    with gzip.open(imagefile, 'rb') as images, gzip.open(labelfile, 'rb') as labels:
        # Header fields are big-endian unsigned ints, hence '>I'.
        images.read(4)  # skip the magic_number
        number_of_images = unpack('>I', images.read(4))[0]
        rows = unpack('>I', images.read(4))[0]
        cols = unpack('>I', images.read(4))[0]
        labels.read(4)  # skip the magic_number
        N = unpack('>I', labels.read(4))[0]
        if number_of_images != N:
            raise Exception('number of labels did not match the number of images')
        # Read each payload in one call instead of one byte at a time;
        # the per-byte read/unpack loop took tens of millions of iterations.
        pixel_bytes = images.read(N * rows * cols)
        label_bytes = labels.read(N)
    # frombuffer yields read-only views; astype/copy make writable arrays.
    x = numpy.frombuffer(pixel_bytes, dtype=numpy.uint8).astype("float32").reshape((N, rows, cols))
    y = numpy.frombuffer(label_bytes, dtype=numpy.uint8).reshape((N, 1)).copy()
    return (x, y)
# Load the training set once at import time: ld[0] holds the images, ld[1] the labels.
ld = get_labeled_data("C:/Users/XBGFD/Desktop/Programming/NeuralNetworks/HRR/train-images-idx3-ubyte.gz", "C:/Users/XBGFD/Desktop/Programming/NeuralNetworks/HRR/train-labels-idx1-ubyte.gz")
def sigmoid(x):
return 1/(1+numpy.exp(-x))
def sigmoid_P(x):
return sigmoid(x) * (1 - sigmoid(x))
def cost(i, t):
    """Return the squared error between the output ``i`` and the target ``t``."""
    error = i - t
    return error ** 2
def cost_P(i, t):
    """Return the derivative of the squared error with respect to ``i``."""
    difference = i - t
    return 2 * difference
# Training and interactive-inference script.
# NOTE(review): the original indentation was lost when this code was pasted, so
# the nesting of the loops below is ambiguous - confirm against the author's
# file before running.
# 10x28x28 - number x row x column
weights = numpy.random.random((10, 28, 28))
biases = numpy.random.random((10, 28, 28))
# dr / da remember the last computed weight gradient and the last updated
# weight; they are only printed in the interactive loop at the bottom.
dr = 0
da = 0
for loopi in range(10000):
# NOTE(review): ld[0] is the (N, rows, cols) image array, so len(ld[0][0]) is
# the number of rows of one image, not the number of images; r is therefore
# limited to indices below the row count. len(ld[0]) looks intended - confirm.
r = numpy.random.randint(0, len(ld[0][0]))
# NOTE(review): targets stays all zeros; `label` is read below but never used
# to mark the expected class, so every output is trained toward 0.
targets = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
image = ld[0][r]
label = ld[1][r][0]
# weighted 3D Matrix of [number][row][column]
# predictions is appended to but never read; wPredictions is never used.
predictions = []
wPredictions = []
# average of predictions for each number
# avgPred holds the mean sigmoid output per digit, avgPred2 the mean
# pre-activation per digit.
avgPred = []
avgPred2 = []
# img is never used after this assignment.
img = list(image)
for i in range(10):
x = []
y = []
for b, w in zip(biases[i], weights[i]):
# NOTE(review): image holds the raw byte values stored by get_labeled_data
# (0-255, not rescaled) and the weights are all in [0, 1), so the
# pre-activation is presumably large enough to saturate sigmoid - the likely
# reason sigmoid_P (dp_dz) evaluates to 0. Confirm by printing avgPred2.
x.append(sigmoid(numpy.dot(w, image) + b))
y.append(numpy.dot(w, image) + b)
predictions.append(x)
avgPred.append(numpy.average(list(x)))
avgPred2.append(numpy.average(list(y)))
for i in range(10):
# sqError is computed but never used.
sqError = cost(avgPred[i], targets[i])
# derivative of the cost with respect to each of the weights and biases
dc_dp = cost_P(avgPred[i], targets[i])
# The sigmoid derivative is evaluated at the averaged pre-activation.
dp_dz = sigmoid_P(avgPred2[i])
#for b, w in zip(biases[i], weights[i]):
for imgRow in range(28):
for imgCol in range(28):
dz_dw = image[imgRow][imgCol]
dz_db = 1
# Debug output; as written it sits next to the per-pixel statements, so it may
# run once per pixel - nesting to be confirmed.
print("dc_dp: " + str(dc_dp) + "\ndp_dz: "+ str(dp_dz) + "\ndz_dw: " + str(dz_dw))
# Chain rule: dc/dw = dc/dp * dp/dz * dz/dw (likewise for the bias).
dc_dw = dc_dp * dp_dz * dz_dw
dc_db = dc_dp * dp_dz * dz_db
dr = dc_dw
weights[i][imgRow][imgCol] -= learningRate * dc_dw
da = weights[i][imgRow][imgCol]
biases[i][imgRow][imgCol] -= learningRate * dc_db
# Interactive check: read a sample index from stdin and print a prediction.
while True:
big = 0
intid = int(input())
imag = ld[0][intid]
for l in range(10):
papa = []
# NOTE(review): this loop iterates over l but indexes biases[i] / weights[i],
# so i is whatever value the earlier loops left behind - l looks intended.
for b, w in zip(biases[i], weights[i]):
papa.append(sigmoid(numpy.dot(w, imag) + b))
lol = numpy.average(papa)
# NOTE(review): big is compared against the score lol but then assigned the
# digit index l, so scores and indices are mixed in later comparisons.
if(lol > big):
big = l
print(str(dr) + " " + str(da))
print(big)
The weights aren't changing because dp_dz is always 0, and I'm not sure what's causing that. I don't mean that they change by only a very small amount; they are literally NOT changing at all. I believe it has to do with my approach in general, but I'm not sure how else I could approach this problem, as I'm very new to neural networks. Any help would be greatly appreciated!

Is something wrong with my backpropagation calculation?

For the last 3 days, I have been trying to build my first neural network to no avail. I'm asking this question here because I can't think of anywhere else to get quality feedback and I haven't found any solutions by searching so far.
The network takes in 784 inputs (pixels) and has 9 outputs (numbers 0-9). I am trying to train it on the MNIST handwritten digit dataset.
The problem is that after several iterations, the output explodes to a list of random +1s and -1s eg.
[-1., 1., 1., 1., -1., -1., 1., 1., 1.]
I will include code snippets of the ANN's init(), f_pass() and backpropagate() methods because, if there is an error, I think it is most likely in the code that handles the matrices. I can upload more code on request.
# Excerpt of the network class: only the constructor is shown here.
# NOTE(review): indentation was lost in this paste; nesting must be restored
# before the code can run.
class Feedforward:
# size_vector lists the layer widths; weights[i] gets the shape
# (size_vector[i], size_vector[i+1]) and b[i] the shape (1, size_vector[i+1]).
def __init__(self, size_vector):
self.nLayers = len(size_vector)
self.size_vector = size_vector
# Weights (and biases below) are drawn from np.random.random, i.e. [0, 1).
weight_matrix = lambda x: np.random.random((x[0],x[1]))
# Per-layer parameters, pre-activations (z) and activations (a), keyed by
# layer index.
self.weights = {}
self.b = {}
self.z = {}
self.a = {}
for i in range(0,self.nLayers,1):
z = size_vector[i:i+2]
# For the last index size_vector[i+1] does not exist; the IndexError is
# swallowed so no bias is created there.
try:
self.b[i] = np.random.random((1,size_vector[i+1]))
except IndexError:
pass
# The slice only has two entries while a next layer exists.
if len(z) == 2:
self.weights[i] = weight_matrix(z)
f_pass():
# Forward pass: for each layer store the pre-activation in self.z[layer] and
# the activation, reshaped to (1, size_vector[layer+1]), in self.a[layer].
# self.activation is not shown in this excerpt.
def f_pass(self, data):
for layer in range(self.nLayers-1):
# The first layer consumes the input data; later layers consume the previous
# layer's activation.
if layer == 0:
self.z[layer] = data.dot(self.weights[0])+self.b[layer].reshape((1,self.size_vector[layer+1]))
self.a[layer] = self.activation(self.z[layer]).reshape((1,self.size_vector[layer+1]))
else:
self.z[layer] = self.a[layer-1].dot(self.weights[layer])+self.b[layer]
self.a[layer] = self.activation(self.z[layer]).reshape((1,self.size_vector[layer+1]))
Backprop():
# Backward pass for one sample; data is an (expected, inputs) pair and the
# activations from a preceding f_pass call are read from self.a / self.z.
# self.activation_prime is not shown in this excerpt.
def backporpagate(self, data):
lr = .01
expected, inputs = data
# cost is computed but never used afterwards.
cost = np.square(expected-self.a[self.nLayers-2])
partial_layer_error = {}
partial_weight_error = {}
partial_bias_error = {}
# Walk the layers from the output (i == 0) back to the input (layer == 0).
for i in range(self.nLayers-1):
layer = self.nLayers-2-i
if i == 0:
# NOTE(review): for cost = (expected - a)**2 the derivative with respect to a
# is -2*(expected - a); this term has the opposite sign, and the update below
# subtracts it, which looks like a step up the gradient - confirm.
partial_layer_error[layer] = 2*(expected-self.a[self.nLayers-2])*self.activation_prime(self.z[layer]).reshape(1,self.size_vector[layer+1])
partial_weight_error[layer] = self.a[layer-1].transpose().dot(partial_layer_error[layer])
partial_bias_error[layer] = partial_layer_error[layer].reshape(1,self.size_vector[layer+1])
elif layer == 0:
# NOTE(review): unlike the other two branches, this one does not multiply by
# activation_prime(self.z[layer]) - confirm whether that is intended.
partial_layer_error[layer] = partial_layer_error[layer+1].dot(self.weights[layer+1].transpose())
partial_weight_error[layer] = inputs.transpose().dot(partial_layer_error[layer])
partial_bias_error[layer] = partial_layer_error[layer].reshape(1,self.size_vector[layer+1])
else:
# NOTE(review): the dot operands are in the opposite order to the layer == 0
# branch; with (1, n) row-vector errors this looks like a shape mismatch -
# confirm. This branch only runs for networks with three or more weight layers.
partial_layer_error[layer] = self.weights[layer+1].transpose().dot(partial_layer_error[layer+1])*self.activation_prime(self.z[layer])
partial_weight_error[layer] = self.a[layer-1].transpose().dot(partial_layer_error[layer])
partial_bias_error[layer] = partial_layer_error[layer].reshape(1,self.size_vector[layer+1])
# Apply the accumulated updates to every layer's weights and biases.
for i in range(len(self.size_vector)-1):
self.weights[i] -= lr*partial_weight_error[i]
self.b[i] -= lr*partial_bias_error[i]
# Debug output: the network's last-layer activation and the expected target.
print(self.a[len(self.size_vector)-2])
print(expected)
The full code if anyone would like to view it is:
import numpy as np
import random
import pandas as pd
import scipy
class Feedforward:
    """Fully connected feed-forward network with tanh activations.

    ``size_vector`` lists the layer widths from input to output. For every
    pair of consecutive widths ``(n_in, n_out)`` the network keeps a weight
    matrix of shape ``(n_in, n_out)`` in ``self.weights[i]`` and a bias row of
    shape ``(1, n_out)`` in ``self.b[i]``. ``self.z`` and ``self.a`` hold the
    pre-activations and activations of the most recent forward pass, keyed by
    layer index. Inputs are row vectors of shape ``(1, n_in)``.
    """

    def __init__(self, size_vector):
        """Create the parameter dictionaries for the given layer widths."""
        self.nLayers = len(size_vector)
        self.size_vector = size_vector
        self.weights = {}
        self.b = {}
        self.z = {}
        self.a = {}
        for i in range(self.nLayers - 1):
            n_in, n_out = size_vector[i], size_vector[i + 1]
            # Centre the weights on zero and scale them by the fan-in. Weights
            # drawn only from [0, 1) push tanh into saturation on wide inputs,
            # which pins the outputs at +/-1 and starves the gradient.
            limit = 1.0 / np.sqrt(n_in)
            self.weights[i] = np.random.uniform(-limit, limit, (n_in, n_out))
            self.b[i] = np.zeros((1, n_out))

    def activation(self, matrix):
        """Return tanh of ``matrix`` element-wise."""
        # np.tanh equals (e**2x - 1) / (e**2x + 1) without overflow risk.
        return np.tanh(matrix)

    def activation_prime(self, matrix):
        """Return the tanh derivative at pre-activation ``matrix``."""
        # d/dz tanh(z) = 1 - tanh(z)**2.
        return 1 - np.square(np.tanh(matrix))

    def f_pass(self, data):
        """Run a forward pass, filling ``self.z`` and ``self.a`` per layer.

        Args:
            data: Input row vector of shape ``(1, size_vector[0])``.
        """
        layer_input = data
        for layer in range(self.nLayers - 1):
            row_shape = (1, self.size_vector[layer + 1])
            self.z[layer] = (layer_input.dot(self.weights[layer])
                             + self.b[layer].reshape(row_shape))
            self.a[layer] = self.activation(self.z[layer]).reshape(row_shape)
            layer_input = self.a[layer]

    def backporpagate(self, data, lr=.01):
        """Take one gradient-descent step on the squared error of one sample.

        Must be called after ``f_pass`` on the same input, because it reads
        the cached ``self.z`` and ``self.a``.

        Args:
            data: Pair ``(expected, inputs)``; ``expected`` is the target
                output and ``inputs`` the row vector given to ``f_pass``.
            lr: Learning rate applied to every weight and bias update.
        """
        expected, inputs = data
        inputs = np.atleast_2d(inputs)
        last = self.nLayers - 2
        layer_error = {}
        weight_grad = {}
        bias_grad = {}
        for layer in range(last, -1, -1):
            row_shape = (1, self.size_vector[layer + 1])
            if layer == last:
                # d/da (expected - a)**2 = 2 * (a - expected).
                upstream = 2 * (self.a[last] - expected)
            else:
                # Push the next layer's error back through its weights.
                upstream = layer_error[layer + 1].dot(
                    self.weights[layer + 1].transpose())
            # Every layer, including the first, is scaled by its own
            # activation derivative.
            layer_error[layer] = (
                upstream * self.activation_prime(self.z[layer])
            ).reshape(row_shape)
            layer_input = inputs if layer == 0 else self.a[layer - 1]
            weight_grad[layer] = layer_input.transpose().dot(layer_error[layer])
            bias_grad[layer] = layer_error[layer]
        # Update only after all gradients are known, so earlier layers are
        # differentiated against the weights used in the forward pass.
        for layer in range(last + 1):
            self.weights[layer] -= lr * weight_grad[layer]
            self.b[layer] -= lr * bias_grad[layer]
        print(self.a[len(self.size_vector)-2])
        print(expected)

    def train(self, data, batch_size=2000, lr=.01):
        """Train on a random sample of rows from ``data``, one row at a time.

        Args:
            data: DataFrame whose first column holds the target vector and
                whose second column holds the pixel list (values 0-255).
            batch_size: Maximum number of rows to sample; capped at
                ``len(data)`` so small data sets do not raise.
            lr: Learning rate forwarded to ``backporpagate``.
        """
        batch = data.sample(min(batch_size, len(data)))
        for _, (label, img) in batch.iterrows():
            label = np.array(label)
            # Scale pixels to [0, 1] and shape them as a (1, n_pixels) row.
            img = np.array(img)[:, np.newaxis].transpose() / 255
            self.f_pass(img)
            self.backporpagate((label, img), lr=lr)
def prepare_mnist(path='mnist_test.csv', n_classes=9):
    """Load a label-first MNIST CSV into a two-column DataFrame.

    Each non-blank line is ``label,pixel,pixel,...``. Column 0 of the result
    holds the target list of length ``n_classes`` and column 1 the list of
    integer pixel values.

    NOTE: the target is set at index ``label - 1``, so with the default of 9
    classes the digit 0 maps to an all-zero target. This is kept unchanged so
    the output still matches a 9-output network; encoding all ten digits
    would need a 10-wide target and a matching network.

    Args:
        path: CSV file to read.
        n_classes: Length of the target list built for each row.

    Returns:
        DataFrame with one row per sample: ``(target list, pixel list)``.
    """
    print('preparing MNIST: please wait' + '\n')
    mnist = []
    print('Reading Data: \n')
    # Stream the file line by line instead of loading it whole.
    with open(path) as f:
        for line in f:
            fields = line.strip().split(',')
            if fields == ['']:
                continue  # blank line (e.g. trailing newline at end of file)
            # Parse the whole first field rather than only its first character.
            label = int(fields[0])
            key = [1 if j == label - 1 else 0 for j in range(n_classes)]
            value = [int(x) for x in fields[1:]]
            mnist.append((key, value))
    print("Converting to DataFrame \n")
    df = pd.DataFrame(mnist)
    return df
# Build a network with layer widths 784, 50 and 9, then train it on the data
# returned by prepare_mnist().
N = Feedforward([784,50,9])
N.train(prepare_mnist())
If anyone could help me out, I would really appreciate it. This has bothered me for days, and I really want to get to grips with building these things practically.

Categories

Resources