How can I get around memory limitation in this script? - python

I'm trying to normalize my dataset, which is 1.7 GB. I have 14 GB of RAM and I hit my limit very quickly.
This happens when computing the mean/std of the training data. The training data takes up the majority of the memory when loaded into RAM (13.8 GB), so the mean gets calculated, but when it reaches the next line, calculating the std, it crashes.
Here is the script:
import caffe
import leveldb
import numpy as np
from caffe.proto import caffe_pb2
import cv2
import sys
import time

direct = 'examples/svhn/'
db_train = leveldb.LevelDB(direct+'svhn_train_leveldb')
db_test = leveldb.LevelDB(direct+'svhn_test_leveldb')
datum = caffe_pb2.Datum()

#using the whole dataset for training which is 604,388
size_train = 604388 #normal training set is 73257
size_test = 26032

data_train = np.zeros((size_train, 3, 32, 32))
label_train = np.zeros(size_train, dtype=int)

print 'Reading training data...'
i = -1
for key, value in db_train.RangeIter():
    i = i + 1
    if i % 1000 == 0:
        print i
    if i == size_train:
        break
    datum.ParseFromString(value)
    label = datum.label
    data = caffe.io.datum_to_array(datum)
    data_train[i] = data
    label_train[i] = label

print 'Computing statistics...'
print 'calculating mean...'
mean = np.mean(data_train, axis=(0,2,3))
print 'calculating std...'
std = np.std(data_train, axis=(0,2,3))

#np.savetxt('mean_svhn.txt', mean)
#np.savetxt('std_svhn.txt', std)

print 'Normalizing training'
for i in range(3):
    print i
    data_train[:, i, :, :] = data_train[:, i, :, :] - mean[i]
    data_train[:, i, :, :] = data_train[:, i, :, :] / std[i]

print 'Outputting training data'
leveldb_file = direct + 'svhn_train_leveldb_normalized'
batch_size = size_train

# create the leveldb file
db = leveldb.LevelDB(leveldb_file)
batch = leveldb.WriteBatch()
datum = caffe_pb2.Datum()

for i in range(size_train):
    if i % 1000 == 0:
        print i
    # save in datum
    datum = caffe.io.array_to_datum(data_train[i], label_train[i])
    keystr = '{:0>5d}'.format(i)
    batch.Put(keystr, datum.SerializeToString())
    # write batch
    if (i + 1) % batch_size == 0:
        db.Write(batch, sync=True)
        batch = leveldb.WriteBatch()
        print (i + 1)

# write last batch
if (i + 1) % batch_size != 0:
    db.Write(batch, sync=True)
    print 'last batch'
    print (i + 1)

#explicitly freeing memory to avoid hitting the limit!
#del data_train
#del label_train

print 'Reading test data...'
data_test = np.zeros((size_test, 3, 32, 32))
label_test = np.zeros(size_test, dtype=int)
i = -1
for key, value in db_test.RangeIter():
    i = i + 1
    if i % 1000 == 0:
        print i
    if i == size_test:
        break
    datum.ParseFromString(value)
    label = datum.label
    data = caffe.io.datum_to_array(datum)
    data_test[i] = data
    label_test[i] = label

print 'Normalizing test'
for i in range(3):
    print i
    data_test[:, i, :, :] = data_test[:, i, :, :] - mean[i]
    data_test[:, i, :, :] = data_test[:, i, :, :] / std[i]

#Zero Padding
#print 'Padding...'
#npad = ((0,0), (0,0), (4,4), (4,4))
#data_train = np.pad(data_train, pad_width=npad, mode='constant', constant_values=0)
#data_test = np.pad(data_test, pad_width=npad, mode='constant', constant_values=0)

print 'Outputting test data'
leveldb_file = direct + 'svhn_test_leveldb_normalized'
batch_size = size_test

# create the leveldb file
db = leveldb.LevelDB(leveldb_file)
batch = leveldb.WriteBatch()
datum = caffe_pb2.Datum()

for i in range(size_test):
    # save in datum
    datum = caffe.io.array_to_datum(data_test[i], label_test[i])
    keystr = '{:0>5d}'.format(i)
    batch.Put(keystr, datum.SerializeToString())
    # write batch
    if (i + 1) % batch_size == 0:
        db.Write(batch, sync=True)
        batch = leveldb.WriteBatch()
        print (i + 1)

# write last batch
if (i + 1) % batch_size != 0:
    db.Write(batch, sync=True)
    print 'last batch'
    print (i + 1)
How can I make it consume less memory so that I can run the script?

Why not compute the statistics on a subset of the original data? For example, here we compute the mean and std for just 100 points:
sample_size = 100
data_train = np.random.rand(1000, 20, 10, 10)
# Take subset of training data
idxs = np.random.choice(data_train.shape[0], sample_size)
data_train_subset = data_train[idxs]
# Compute stats
mean = np.mean(data_train_subset, axis=(0,2,3))
std = np.std(data_train_subset, axis=(0,2,3))
If your data is 1.7 GB, it is highly unlikely that you need all of it to get an accurate estimate of the mean and std.
In addition, could you get away with fewer bits in your datatype? I'm not sure what datatype caffe.io.datum_to_array returns, but you could do:

data = caffe.io.datum_to_array(datum).astype(np.float32)

to ensure the data is in float32 format. (If the data is currently float64, this will save you half the space.)
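If you do want the statistics over the full dataset without ever materializing data_train, you can also accumulate them incrementally while iterating the LevelDB. A minimal sketch, reusing db_train, datum and caffe.io from the question (the accumulator names are mine; the sum-of-squares formula can lose precision, Welford's algorithm is the more robust variant):

count = 0
channel_sum = np.zeros(3)     # per-channel sum of pixel values
channel_sumsq = np.zeros(3)   # per-channel sum of squared pixel values

for key, value in db_train.RangeIter():
    datum.ParseFromString(value)
    data = caffe.io.datum_to_array(datum).astype(np.float64)  # shape (3, 32, 32)
    channel_sum += data.sum(axis=(1, 2))
    channel_sumsq += (data ** 2).sum(axis=(1, 2))
    count += data.shape[1] * data.shape[2]   # pixels per channel in this image

mean = channel_sum / count
std = np.sqrt(channel_sumsq / count - mean ** 2)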

The culprit that caused so many issues and the constant crashing due to insufficient memory was the batch size being set to the size of the whole training set:

print 'Outputting test data'
leveldb_file = direct + 'svhn_test_leveldb_normalized'
batch_size = size_test

This apparently was the cause: nothing would get committed and saved to disk until the whole dataset was read and loaded into one huge transaction. This is also why using np.float32, as suggested by @BillCheatham, didn't work properly on its own.
The memory-map solution wouldn't work for me for some reason, so I used the solution I mentioned above.
PS: Later on, I completely changed to float32, fixed the batch_size, and ran the whole thing all together; that's how I can say my former solution (divide and add the fractions together) works and gives the exact number up to 2 decimals.
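For reference, a minimal sketch of the fix within the same script: commit at a fixed interval instead of in one transaction over the whole set (the interval of 1000 is just an example value):

batch_size = 1000   # commit every 1000 records instead of once at the very end

batch = leveldb.WriteBatch()
for i in range(size_train):
    datum = caffe.io.array_to_datum(data_train[i], label_train[i])
    batch.Put('{:0>5d}'.format(i), datum.SerializeToString())
    if (i + 1) % batch_size == 0:
        db.Write(batch, sync=True)       # flush this chunk to disk
        batch = leveldb.WriteBatch()     # start a fresh transaction

if size_train % batch_size != 0:
    db.Write(batch, sync=True)           # commit the leftover records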

Related

How do I get column-wise relative error between multidimensional arrays?

I have a function which takes 2 arrays: an array of model predictions and an array of true values. It works fine when dealing with 1d arrays, but I need to adjust it for multidimensional arrays. I would like to compute my threshold_acc again, but this time for each column. How do I go about this?
threshold_acc represents the proportion of the set with error below the specified threshold.
Also, do I need to change my threshold to ± since I've started seeing negative values in the multidimensional array, or is there a better error measure I could use?
import numpy as np
import pandas as pd

# sample data
np.random.seed(20)
dd = np.random.uniform(low=-20., high=20, size=(25, 4))
dp = np.random.uniform(low=5, high=25, size=(25, 4))
data = [dd, dp]

def inference(dummy_data, error_threshold=10):
    rel_err_list = []
    AE_error_list = []
    mse_list = []
    input_var = []
    true_var = []
    pred_var = []
    n_correct = 0; n_wrong = 0; n_inf = 0
    # Iterate through data loader and inference and evaluate data
    targets, outputs = data[0], data[1]
    for idx, (outputs, targets) in enumerate(zip(outputs, targets)):
        rel_error = np.abs(outputs - targets) / targets
        rel_error = rel_error * 100
        AE_error = np.abs(outputs - targets)
        if np.isfinite(rel_error).all():
            rel_err_list.append(rel_error)
            AE_error_list.append(AE_error)
            # Negative errors
            print(f"error: {rel_error} output: {outputs} target: {targets}")
        else:
            n_inf += 1
        if rel_error.all() < error_threshold:
            n_correct += 1
        else:
            n_wrong += 1
        true_var.append(targets)
        pred_var.append(outputs)
    median_err, max_err, min_err = np.median(rel_err_list), np.max(rel_err_list), np.min(rel_err_list)
    threshold_acc = ((n_correct * 1.0) / 25) * 100
    true_var = np.array(true_var)
    pred_var = np.array(pred_var)
    err_var = np.array(rel_err_list)
    AE_var = np.array(AE_error_list)
    true_var = np.reshape(true_var, dummy_data[0].shape)
    pred_var = np.reshape(pred_var, dummy_data[0].shape)
    err_var = np.reshape(err_var, dummy_data[0].shape)
    AE_var = np.reshape(AE_var, dummy_data[0].shape)
    results = np.concatenate([true_var, pred_var, err_var, AE_var], axis=1)
    results_df = pd.DataFrame(results)
    return median_err, max_err, min_err, threshold_acc, n_inf, n_wrong, results_df, pred_var

dd = np.random.uniform(low=1., high=20, size=(25, 1))
dp = np.random.uniform(low=5, high=25, size=(25, 1))

median_err, max_err, min_err, threshold_acc, n_inf, n_wrong, results_df, pred_var = inference(data, 10)
print(f"\nAverage relative error over valid predictions : {median_err:.3f} \nMax error over valid predictions : {max_err:.3f} \nMin error over valid predictions : {min_err:.3f}\nProportion of test set with accuracy over 90%: {threshold_acc:.3f}\n\n\
{n_inf} null predictions \n{n_wrong} incorrect (<90%) predictions \n{n_inf+n_wrong} null or incorrect predictions out of 25")
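For what it's worth, a vectorized sketch of the column-wise statistics the question asks about (the names and the abs-denominator choice are mine, not the original function):

import numpy as np

np.random.seed(20)
targets = np.random.uniform(low=-20., high=20, size=(25, 4))
outputs = np.random.uniform(low=5, high=25, size=(25, 4))

# use |target| in the denominator so negative targets don't flip the sign
rel_error = np.abs(outputs - targets) / np.abs(targets) * 100   # shape (25, 4)

error_threshold = 10
# per-column proportion (in %) of rows with relative error below the threshold
threshold_acc = (rel_error < error_threshold).mean(axis=0) * 100
median_err = np.median(rel_error, axis=0)   # one median per column
print(threshold_acc)
print(median_err)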

How to improve performance of coincidence filtering of a time-series?

I'm working on non-stationary experimental data from fluid dynamics. We have measured data on three channels, so the samples are not directly coincident (measured at the same time). I want to filter them with a window scheme to get coincident samples and discard all the others.
Unfortunately, I cannot upload the original data set due to restrictions of the company. But I tried to set up a minimal example, which generates a similar (smaller) dataset. The original dataset consists of 500000 values per channel, each noted with an arrival time. The coincidence is checked with these time stamps.
Right now, I loop over each sample of the first channel and look at the time differences to the other channels. If they are smaller than the specified window width, the index is saved. It would probably be a bit faster if I specified an interval in which to check for the differences (like 100 or 1000 samples in the neighbourhood), but the data rate between the channels can differ significantly, so this is not implemented yet. I would prefer to get rid of looping over each sample, if possible.
import numpy as np
import pandas as pd

def filterCoincidence(df, window=50e-6):
    '''
    Filters the dataset with arbitrary different data rates on different channels to coincident samples.
    The coincidence is checked with regard to a time window specified as argument.
    '''
    AT_cols = [col for col in df.columns if 'AT' in col]
    if len(AT_cols) == 1:
        print('only one group available')
        return
    used_ix = np.zeros((df.shape[0], len(AT_cols)))
    used_ix.fill(np.nan)
    for ix, sample in enumerate(df[AT_cols[0]]):
        used_ix[ix, 0] = ix
        test_ix = np.zeros(2)
        for ii, AT_col in enumerate(AT_cols[1:]):
            diff = np.abs(df[AT_col] - sample)
            index = diff[diff <= window].sort_values().index.values
            if len(index) == 0:
                test_ix[ii] = None
                continue
            test_ix[ii] = [ix_use if (ix_use not in used_ix[:, ii+1] or ix == 0) else None
                           for ix_use in index][0]
        if not np.any(np.isnan(test_ix)):
            used_ix[ix, 1:] = test_ix
        else:
            used_ix[ix, 1:] = [None, None]
    used_ix = used_ix[~np.isnan(used_ix).any(axis=1)]
    print(used_ix.shape)
    return

no_points = 10000
no_groups = 3
meas_duration = 60
df = pd.DataFrame(np.transpose([np.sort(np.random.rand(no_points)*meas_duration) for _ in range(no_groups)]),
                  columns=['AT {}'.format(i) for i in range(no_groups)])
filterCoincidence(df, window=1e-3)
Is there a module already implemented that can do this sort of filtering? In any case, it would be awesome if you could give me some hints on how to increase the performance of the code.

Just to update this thread in case somebody else has a similar problem: I think, after several code revisions, I have found a proper solution to this.
import datetime
import itertools
import numpy as np
import pandas as pd

def filterCoincidence(self, AT1, AT2, AT3, window=0.05e-3):
    '''
    Filters the dataset with arbitrary different data rates on different channels to coincident samples.
    The coincidence is checked with regard to a time window specified as argument.
    - arguments:
        - three time series AT1, AT2 and AT3 (arrival times of particles in my case)
        - window size (50 microseconds as default setting)
    - output: indices of combined samples
    '''
    start_time = datetime.datetime.now()
    AT_list = [AT1, AT2, AT3]

    # take the shortest period of time
    min_EndArrival = np.max(AT_list)
    max_BeginArrival = np.min(AT_list)
    for i, col in enumerate(AT_list):
        min_EndArrival = min(min_EndArrival, np.max(col))
        max_BeginArrival = max(max_BeginArrival, np.min(col))
    for i, col in enumerate(AT_list):
        AT_list[i] = np.delete(AT_list[i], np.where((col < max_BeginArrival - window) | (col > min_EndArrival + window)))

    # get channel with lowest datarate
    num_points = np.zeros(len(AT_list))
    datarate = np.zeros(len(AT_list))
    for i, AT in enumerate(AT_list):
        num_points[i] = AT.shape[0]
        datarate[i] = num_points[i] / (AT[-1] - AT[0])
    used_ref = np.argmin(datarate)

    # process coincidence
    AT_ref_val = AT_list[used_ref]
    AT_list = list(np.delete(AT_list, used_ref))
    overview = np.zeros((AT_ref_val.shape[0], 3), dtype=int)
    overview[:, 0] = np.arange(AT_ref_val.shape[0], dtype=int)
    borders = np.empty(2, dtype=object)
    max_diff = np.zeros(2, dtype=int)
    for i, AT in enumerate(AT_list):
        neighbors_lower = np.searchsorted(AT, AT_ref_val - window, side='left')
        neighbors_upper = np.searchsorted(AT, AT_ref_val + window, side='left')
        borders[i] = np.transpose([neighbors_lower, neighbors_upper])
        coinc_ix = np.where(np.diff(borders[i], axis=1).flatten() != 0)[0]
        max_diff[i] = np.max(np.diff(borders[i], axis=1))
        overview[coinc_ix, i+1] = 1
    use_ix = np.where(~np.any(overview == 0, axis=1))
    borders[0] = borders[0][use_ix]
    borders[1] = borders[1][use_ix]
    overview = overview[use_ix]

    # create all possible combinations refer to the reference
    combinations = np.prod(max_diff)
    test = np.empty((overview.shape[0]*combinations, 3), dtype=object)
    for i, [ref_ix, at1, at2] in enumerate(zip(overview[:, 0], borders[0], borders[1])):
        test[i*combinations:i*combinations+combinations, 0] = ref_ix
        at1 = np.arange(at1[0], at1[1])
        at2 = np.arange(at2[0], at2[1])
        test[i*combinations:i*combinations+at1.shape[0]*at2.shape[0], 1:] = np.asarray(list(itertools.product(at1, at2)))
    test = test[~np.any(pd.isnull(test), axis=1)]

    # check distances
    ix_ref = test[:, 0]
    test = test[:, 1:]
    test = np.insert(test, used_ref, ix_ref, axis=1)
    test = test.astype(int)
    AT_list.insert(used_ref, AT_ref_val)
    AT_mat = np.zeros(test.shape)
    for i, AT in enumerate(AT_list):
        AT_mat[:, i] = AT[test[:, i]]
    distances = np.zeros((test.shape[0], len(list(itertools.combinations(range(3), 2)))))
    for i, AT in enumerate(itertools.combinations(range(3), 2)):
        distances[:, i] = np.abs(AT_mat[:, AT[0]] - AT_mat[:, AT[1]])
    ix = np.where(np.all(distances <= window, axis=1))[0]
    test = test[ix, :]
    distances = distances[ix, :]

    # check duplicates
    # use sum of differences as similarity factor
    dist_sum = np.max(distances, axis=1)
    unique_sorted = np.argsort([np.unique(test[:, i]).shape[0] for i in range(test.shape[1])])[::-1]
    test = np.hstack([test, dist_sum.reshape(-1, 1)])
    test = test[test[:, -1].argsort()]
    for j in unique_sorted:
        _, ix = np.unique(test[:, j], return_index=True)
        test = test[ix, :]
    test = test[:, :3]
    test = test[test[:, used_ref].argsort()]

    # check that all values are after each other
    ix = np.where(np.any(np.diff(test, axis=0) > 0, axis=1))[0]
    ix = np.append(ix, test.shape[0]-1)
    test = test[ix, :]
    print('{} coincident samples obtained in {}.'.format(test.shape[0], datetime.datetime.now()-start_time))
    return test
I'm certain that there is a better solution, but for me it works for now. And I know the variable names should definitely be chosen with more clarity (e.g. test), but I will clean up my code at the end of my master's thesis... perhaps :-)
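The workhorse in the solution above is np.searchsorted. In isolation, the window lookup looks roughly like this (a minimal sketch with generated data, not the thesis code):

import numpy as np

def has_coincidence(at_ref, at_other, window=50e-6):
    # at_other must be sorted; find, for every reference time, the slice
    # of at_other falling inside [t - window, t + window]
    lo = np.searchsorted(at_other, at_ref - window, side='left')
    hi = np.searchsorted(at_other, at_ref + window, side='right')
    return hi > lo   # True where at least one coincident sample exists

at0 = np.sort(np.random.rand(10000) * 60)
at1 = np.sort(np.random.rand(12000) * 60)
mask = has_coincidence(at0, at1, window=1e-3)
print(mask.sum(), 'of', at0.size, 'reference samples have a partner')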

My neural network only returns 0, 1 or other constant value

I'm trying to create a network that would help predict stock prices for the following day. My input data are: open, high, low and close stock values, volume, index values, a few technical indicators and the exchange rate; the output is the closing price from the next day. I'm using data uploaded from an Excel file.
I wrote a program, which I will paste below, but it doesn't seem to be working correctly. The network always returns 1, 0 or some other constant value between 0 and 1.
I took the following steps so far:
tried to normalise the data like so: X_norm = X/(10 ** d), where d is the smallest number for which this condition is met: abs(X_norm) < 1. I did that for the whole set in Excel before dividing it into training and test.
shuffled the data before dividing it into training/test, so that learning examples are not from consecutive days
running the network on a smaller data set and on an example data set (I generated random numbers, did some simple math on them for an output, and tried running the network with that)
changing the number of hidden neurons
changing the number of iterations (up to 1000, which was a lot for my computer considering the data set, so I didn't try more because it would take too much time)
changing the learning rate.
No matter what steps I took, the outcome was always the same. I think my problem could be that I don't have a bias, but perhaps I also have other mistakes in my code that are contributing to this error.
My program:
import numpy as np
import pandas as pd

df = pd.read_excel(r"path", sheet_name="DATA", index_col=0, header=0)
df = df.to_numpy()
np.random.shuffle(df)

X_data = df[:, 0:15]
X_data = X_data.reshape(1000, 1, 15)
print(f"X_data: {X_data}")
Y_data = df[:, 15]
Y_data = Y_data.reshape(1000, 1, 1)
print(f"Y_data: {Y_data}")

X = X_data[0:801]
x_test = X_data[801:]
y = Y_data[0:801]
y_test = Y_data[801:]
print(f"X_train: {X}")
print(f"x_test: {x_test}")
print(f"Y_train: {y}")
print(f"y_test: {y_test}")

rate = 0.2

class NeuralNetwork:
    def __init__(self):
        self.input_neurons = 15
        self.hidden1_neurons = 10
        self.hidden2_neurons = 5
        self.output_neuron = 1
        self.input_to_hidden1_w = np.random.random((self.input_neurons, self.hidden1_neurons))      # 15x10
        self.hidden1_to_hidden2_w = np.random.random((self.hidden1_neurons, self.hidden2_neurons))  # 10x5
        self.hidden2_to_output_w = np.random.random((self.hidden2_neurons, self.output_neuron))     # 5x1

    def activation(self, x):
        sigmoid = 1/(1+np.exp(-x))
        return sigmoid

    def activation_d(self, x):
        derivative = x * (1 - x)
        return derivative

    def feed_forward(self, X):
        self.z1 = np.dot(X, self.input_to_hidden1_w)
        self.z1_a = self.activation(self.z1)
        self.z2 = np.dot(self.z1_a, self.hidden1_to_hidden2_w)
        self.z2_a = self.activation(self.z2)
        self.z3 = np.dot(self.z2_a, self.hidden2_to_output_w)
        output = self.activation(self.z3)
        return output

    def backward(self, X, y, rate, output):
        error = y - output
        z3_error_delta = error * self.activation_d(output)
        z2_error = np.dot(z3_error_delta, np.transpose(self.hidden2_to_output_w))
        z2_error_delta = z2_error * self.activation_d(self.z2)
        z1_error = np.dot(z2_error_delta, np.transpose(self.hidden1_to_hidden2_w))
        z1_error_delta = z1_error * self.activation_d(self.z1)
        self.input_to_hidden1_w += rate * np.dot(np.transpose(X), z1_error_delta)
        self.hidden1_to_hidden2_w += rate * np.dot(np.transpose(self.z1), z2_error_delta)
        self.hidden2_to_output_w += rate * np.dot(np.transpose(self.z2), z3_error_delta)

    def train(self, X, y):
        output = self.feed_forward(X)
        self.backward(X, y, rate, output)

    def save_weights(self):
        np.savetxt("w1.txt", self.input_to_hidden1_w, fmt="%s")
        np.savetxt("w2.txt", self.hidden1_to_hidden2_w, fmt="%s")
        np.savetxt("w3.txt", self.hidden2_to_output_w, fmt="%s")

    def check(self, x_test, y_test):
        self.feed_forward(x_test)
        np.mean(np.square((y_test - self.feed_forward(x_test))))

Net = NeuralNetwork()
for l in range(100):
    for i, pattern in enumerate(X):
        for j, outcome in enumerate(y):
            print(f"#: {l}")
            print(f'''
# {str(l)}
# {str(X[i])}
# {str(y[j])}''')
            print(f"Predicted output: {Net.feed_forward(X[i])}")
            Net.train(X[i], y[j])
print(f"Error training: {(np.mean(np.square(y - Net.feed_forward(X))))}")
Net.save_weights()

for i, pattern in enumerate(x_test):
    for j, outcome in enumerate(y_test):
        Net.check(x_test[i], y_test[j])
print(f"Error test: {(np.mean(np.square(y_test - Net.feed_forward(x_test))))}")

How to build multi-dimensional neural network using Python NLTK

As part of a larger project, I'm trying to build a neural network that is trained using two independent training sets against one input training series. In other words, I have a single input array that I want two independent synapses to be trained against.
In simplest terms, it looks something like this:
x = [[0,0,0],[0,1,1],[0,1,0]...]
y = [[0,1],[0,0],[1,0]...]
z = [[0,0,0,0,1],[1,0,0,0,0],[1,1,0,1,1]...]
where y and z would be trained as independent synapses against trainset x.
I've been unable to modify my code to get this to work, and I know it can't be super complex, but I have been stuck for over a day and need some help.
Below is the code that I can't seem to get to work using multiple dimensions.
# Import Necessary Modules for Function
import numpy as np
import time
import datetime
import json
import nltk

# Note: stemmer, userInput, conversations, conversationTypes, conversationSubjects
# and the training arrays below are defined elsewhere in the larger project.

# Compute a non-linear sigmoid curve (__--)
def sigmoid(x):
    sigmoidOutput = 1/(1+np.exp(-x))
    return sigmoidOutput

# Convert the output of the sigmoid function to its derivative
def sigmoidDerivative(sigmoidOutput):
    return sigmoidOutput*(1-sigmoidOutput)

def cleanSentence(sentence):
    # Tokenize the words within the input sentence
    sentenceWords = nltk.word_tokenize(sentence)
    # Stem the words within the tokenized sentence
    sentenceWords = [stemmer.stem(userInput.lower()) for userInput in sentenceWords]
    return sentenceWords

# Return a binary bag of words [0 or 1] to evaluate whether or not a word exists within
# a sentence.
def bagWordCheck(sentence, userInput, show_details=False):
    # Tokenize the sentence
    sentenceWords = cleanSentence(sentence)
    # Create a word bag using the training-data from the user-input
    wordBag = [0]*len(userInput)
    for sWord in sentenceWords:
        for i, w in enumerate(userInput):
            if w == sWord:
                wordBag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return np.array(wordBag)

# Evaluate the user's input
def think(sentence, showDetails=False):
    x = bagWordCheck(sentence.lower(), userInput, showDetails)
    if showDetails:
        print("sentence:", sentence, "\n bagWordCheck:", x)
    # input layer is our bag of words
    l0 = x
    # matrix multiplication of input and hidden layer
    l1 = sigmoid(np.dot(l0, synapse0))
    # output layer
    l2 = sigmoid(np.dot(l1, synapse2))
    return l2

# ANN and Gradient Descent code from https://iamtrask.github.io//2015/07/27/python-network-part2/
def train(X, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=False, dropout_percent=0.5):
    print("Training with %s neurons, alpha:%s, dropout:%s %s" % (hidden_neurons, str(alpha), dropout, dropout_percent if dropout else ''))
    print("Input matrix: %sx%s Output matrix: %sx%s" % (len(X), len(X[0]), 1, len(conversationTypes)))
    np.random.seed(1)
    lastMeanError = 1
    # randomly initialize our weights with mean 0
    synapse0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse2 = 2*np.random.random((hidden_neurons, len(conversations))) - 1
    #synapse2 = 2*np.random.random((hidden_neurons, len(conversationTypes))) - 1
    #synapse2 = 2*np.random.random((hidden_neurons, len(conversationSubjects))) - 1
    prev_synapse0_weight_update = np.zeros_like(synapse0)
    prev_synapse2_weight_update = np.zeros_like(synapse2)
    synapse0_direction_count = np.zeros_like(synapse0)
    synapse2_direction_count = np.zeros_like(synapse2)
    for j in iter(range(epochs+1)):
        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse0))
        if dropout:
            layer_1 *= np.random.binomial([np.ones((len(X), hidden_neurons))], 1-dropout_percent)[0] * (1.0/(1-dropout_percent))
        layer_2 = sigmoid(np.dot(layer_1, synapse2))
        # how much did we miss the target value?
        layer_2_error = y - layer_2
        if (j % 10000) == 0 and j > 5000:
            # if this 10k iteration's error is greater than the last iteration, break out
            if np.mean(np.abs(layer_2_error)) < lastMeanError:
                print("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))))
                lastMeanError = np.mean(np.abs(layer_2_error))
            else:
                print("break:", np.mean(np.abs(layer_2_error)), ">", lastMeanError)
                break
        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoidDerivative(layer_2)
        # how much did each l1 value contribute to the l2 error (according to the weights)?
        layer_1_error = layer_2_delta.dot(synapse2.T)
        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        layer_1_delta = layer_1_error * sigmoidDerivative(layer_1)
        synapse2_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse0_weight_update = (layer_0.T.dot(layer_1_delta))
        if j > 0:
            synapse0_direction_count += np.abs(((synapse0_weight_update > 0)+0) - ((prev_synapse0_weight_update > 0) + 0))
            synapse2_direction_count += np.abs(((synapse2_weight_update > 0)+0) - ((prev_synapse2_weight_update > 0) + 0))
        synapse2 += alpha * synapse2_weight_update
        synapse0 += alpha * synapse0_weight_update
        prev_synapse0_weight_update = synapse0_weight_update
        prev_synapse2_weight_update = synapse2_weight_update
    now = datetime.datetime.now()
    # persist synapses
    synapse = {'synapse0': synapse0.tolist(), 'synapse2': synapse2.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'userInput': userInput,
               'conversations': conversations,
               'conversationTypes': conversationTypes,
               'conversationSubjects': conversationSubjects
               }
    synapse_file = "synapses.json"
    with open(synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)
    print("saved synapses to:", synapse_file)

X = np.array(trainingSet)
y = np.array(completeConversations)
y1 = np.array(completeTypes)
y2 = np.array(completeSubjects)

start_time = time.time()
train(X, y, hidden_neurons=5, alpha=0.1, epochs=30000, dropout=False, dropout_percent=0.2)
#train(X, y1, hidden_neurons=5, alpha=0.1, epochs=30000, dropout=False, dropout_percent=0.2)
#train(X, y2, hidden_neurons=5, alpha=0.1, epochs=30000, dropout=False, dropout_percent=0.2)
elapsed_time = time.time() - start_time
print("processing time:", elapsed_time, "seconds")

# probability threshold
ERROR_THRESHOLD = 0.75
# load our calculated synapse values
synapse_file = 'synapses.json'
with open(synapse_file) as data_file:
    synapse = json.load(data_file)
    synapse0 = np.asarray(synapse['synapse0'])
    synapse2 = np.asarray(synapse['synapse2'])

def classify(sentence, showDetails=False):
    results = think(sentence, showDetails)
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return_results = [[conversations[r[0]], r[1]] for r in results]
    #return_results = [[conversationSubjects[r[0]], r[1]] for r in results]
    #return_results = [[conversationTypes[r[0]], r[1]] for r in results]
    return return_results

classify("charlotte explorer is not letting me ")
Please help!
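For reference, a toy sketch of the two-head idea described at the top of this question: one shared hidden layer, with y and z each trained through their own output synapse. Shapes, names, and the training loop are made up for illustration, not the project's code:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.array([[0,0,0], [0,1,1], [0,1,0]], dtype=float)               # shared input
y = np.array([[0,1], [0,0], [1,0]], dtype=float)                     # first target set
z = np.array([[0,0,0,0,1], [1,0,0,0,0], [1,1,0,1,1]], dtype=float)   # second target set

rng = np.random.default_rng(1)
w_hidden = rng.uniform(-1, 1, (3, 8))   # shared input-to-hidden synapse
w_y = rng.uniform(-1, 1, (8, 2))        # head trained against y
w_z = rng.uniform(-1, 1, (8, 5))        # head trained against z

for _ in range(5000):
    h = sigmoid(x @ w_hidden)
    out_y = sigmoid(h @ w_y)
    out_z = sigmoid(h @ w_z)
    # each head gets its own delta; the shared layer sees the sum of both errors
    d_y = (y - out_y) * out_y * (1 - out_y)
    d_z = (z - out_z) * out_z * (1 - out_z)
    d_h = (d_y @ w_y.T + d_z @ w_z.T) * h * (1 - h)
    w_y += h.T @ d_y
    w_z += h.T @ d_z
    w_hidden += x.T @ d_h

print(np.round(sigmoid(sigmoid(x @ w_hidden) @ w_y), 2))   # should approach y
print(np.round(sigmoid(sigmoid(x @ w_hidden) @ w_z), 2))   # should approach z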

Matrix multiplication using hdf5

I'm trying to multiply 2 big matrices within a memory limit using hdf5 (pytables),
but the function numpy.dot gives me an error:

ValueError: array is too big

Do I need to do the matrix multiplication myself, maybe blockwise, or is there another Python function similar to numpy.dot?
import numpy as np
import time
import tables
import cProfile
import numexpr as ne

n_row = 10000
n_col = 100
n_batch = 10

rows = n_row
cols = n_col
batches = n_batch
atom = tables.UInt8Atom()  #?
filters = tables.Filters(complevel=9, complib='blosc')  # tune parameters
fileName_a = 'C:\carray_a.h5'
shape_a = (rows*batches, cols)  # predefined size
h5f_a = tables.open_file(fileName_a, 'w')
ca_a = h5f_a.create_carray(h5f_a.root, 'carray', atom, shape_a, filters=filters)
for i in range(batches):
    data = np.random.rand(rows, cols)
    ca_a[i*rows:(i+1)*rows] = data[:]
#h5f_0.close()

rows = n_col
cols = n_row
batches = n_batch
fileName_b = 'C:\carray_b.h5'
shape_b = (rows, cols*batches)  # predefined size
h5f_b = tables.open_file(fileName_b, 'w')
ca_b = h5f_b.create_carray(h5f_b.root, 'carray', atom, shape_b, filters=filters)
#need to batch by cols
sz = rows/batches
for i in range(batches):
    data = np.random.rand(sz, cols*batches)
    ca_b[i*sz:(i+1)*sz] = data[:]
#h5f_1.close()

rows = n_batch*n_row
cols = n_batch*n_row
fileName_c = 'C:\carray_c.h5'
shape_c = (rows, cols)  # predefined size
h5f_c = tables.open_file(fileName_c, 'w')
ca_c = h5f_c.create_carray(h5f_c.root, 'carray', atom, shape_c, filters=filters)

a = h5f_a.root.carray  #[:]
b = h5f_b.root.carray  #[:]
c = h5f_c.root.carray

t0 = time.time()
c = np.dot(a, b)  # error if array is big
print(time.time()-t0)
Update: so here is the code. It's interesting, but using hdf5 it works even faster.
import numpy as np
import tables
import time

sz = 100  # chunk size
n_row = 10000  #m
n_col = 1000  #n

#for arbitrary size
A = np.random.rand(n_row, n_col)
B = np.random.rand(n_col, n_row)
# A = np.random.randint(5, size=(n_row, n_col))
# B = np.random.randint(5, size=(n_col, n_row))

#using numpy array
#C = np.zeros((n_row, n_row))

#using hdf5
fileName_C = 'CArray_C.h5'
atom = tables.Float32Atom()
shape = (A.shape[0], B.shape[1])
Nchunk = 128  # ?
chunkshape = (Nchunk, Nchunk)
chunk_multiple = 1
block_size = chunk_multiple * Nchunk
h5f_C = tables.open_file(fileName_C, 'w')
C = h5f_C.create_carray(h5f_C.root, 'CArray', atom, shape, chunkshape=chunkshape)
sz = block_size

t0 = time.time()
for i in range(0, A.shape[0], sz):
    for j in range(0, B.shape[1], sz):
        for k in range(0, A.shape[1], sz):
            C[i:i+sz, j:j+sz] += np.dot(A[i:i+sz, k:k+sz], B[k:k+sz, j:j+sz])
print(time.time()-t0)

t0 = time.time()
res = np.dot(A, B)
print(time.time()-t0)

print(C == res)
h5f_C.close()
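One aside on the final check: C == res compares element-wise with exact float equality, which is fragile after blocked float32 accumulation. A tolerance-based comparison is the usual choice (note that C[:] reads the whole CArray into memory):

print(np.allclose(C[:], res, atol=1e-4))  # loose tolerance to allow for float32 rounding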
I don't know of an np.dot that works without loading the arrays into memory. I think blocking would work pretty well. Create an output array (called "c" below) as a pytables CArray and fill it in blocks. You should choose the chunkshape when you create it to match your blocking scheme. Something like
atom = tables.Float32Atom()  # you have UInt8Atom() above. do you mean that?
shape = (a.shape[0], b.shape[1])

# you can vary block_size and chunkshape independently, but I would
# aim to have block_size an integer multiple of chunkshape
# your mileage may vary and depends on the array size and how you'll
# access it in the future.
Nchunk = 128  # ?
chunkshape = (Nchunk, Nchunk)
chunk_multiple = 1
block_size = chunk_multiple * Nchunk

c = h5f.create_carray(h5.root, 'c', atom, shape, chunkshape=chunkshape)

for i_start in range(0, a.shape[0], block_size):
    for j_start in range(0, b.shape[1], block_size):
        for k_start in range(0, a.shape[1], block_size):
            c[i_start:i_start+block_size, j_start:j_start+block_size] += \
                np.dot(a[i_start:i_start+block_size, k_start:k_start+block_size],
                       b[k_start:k_start+block_size, j_start:j_start+block_size])
