Prediction result - TensorFlow - Python

I'm trying to print the prediction results and the labels, in addition to the accuracy of a model.
I'm not sure what I'm doing wrong here:
for mfcc, label in test_data:
    prediction = tflite_inference(mfcc, tflite_path)
    predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
    strlabel = "C:/tmp/speech_commands_train/conv_labels.txt"
    labels_list = [line.rstrip() for line in tf.io.gfile.GFile(strlabel)]
    top_k = prediction.argsort()[-5:][::-1]
    for node_id in top_k:
        human_string = labels_list[node_id]
        score = predicted_indices[node_id]
        print('%s (score = %.5f)' % (human_string, score))

test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
confusion_matrix = tf.math.confusion_matrix(expected_indices, predicted_indices,
                                            num_classes=model_settings['label_count'])
Error message:
human_string = labels_list[node_id]
TypeError: only integer scalar arrays can be converted to a scalar index
Thank you in advance for your help.

EDITED ANSWER (after some clarification regarding the problem):
Here I assume that the prediction variable is the output of your model for a single input. With this assumption, your top_k should contain the top 5 indices with the highest probability.
To do that you should do the following:
Reshape your prediction variable:
prediction = prediction.reshape(-1)  # this will make prediction a vector
Get the top_k:
# this step is the same, but this time the output will be a vector instead of a matrix
top_k = prediction.argsort()[-5:][::-1]
Use the loop:
# This is also the same, but as top_k is now a vector instead of a matrix
# there won't be any indexing issues/errors.
for node_id in top_k:
    human_string = labels_list[node_id]
    score = prediction[node_id]  # index the flattened prediction scores, not predicted_indices
    print('%s (score = %.5f)' % (human_string, score))
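For reference, a self-contained sketch of the same idea with a dummy prediction array (the labels below are placeholders, not the contents of conv_labels.txt):
import numpy as np

labels_list = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off']          # placeholder labels
prediction = np.array([[0.01, 0.02, 0.6, 0.05, 0.2, 0.02, 0.05, 0.05]])          # shape (1, num_classes)

prediction = prediction.reshape(-1)              # flatten to a vector of per-class scores
top_k = prediction.argsort()[-5:][::-1]          # indices of the 5 highest scores, best first

for node_id in top_k:
    print('%s (score = %.5f)' % (labels_list[node_id], prediction[node_id]))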

Related

How to calculate the F1 score during evaluation on a test set?

I am trying to calculate the F1 score during evaluation of my own test set, but I'm not able to solve it as I am very inexperienced. I've tried to use the F1 score from both scikit-learn and torchmetrics, but each gives me a different error.
This is my code:
# Function to test the model
import time
import torch
from sklearn.metrics import f1_score

since = time.time()
total = 0
correct = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
y_pred = []
y_true = []

# Iterate over data.
with torch.no_grad():
    for inputs, labels in dataloadersTest_dict['Test']:
        inputs = inputs.to(device)
        labels = labels.to(device)
        #outputs = model(inputs)
        predicted_outputs = model(inputs)
        _, predicted = torch.max(predicted_outputs, 1)
        total += labels.size(0)
        print(total)
        correct += (predicted == labels).sum().item()
        print(correct)
        #f1 score
        temp_true = labels.numpy()
        temp_pred = predicted.numpy()
        y_true.append(temp_true.tolist())
        y_pred.append(temp_pred.tolist())

time_elapsed = time.time() - since
test_acc = 100 * correct / total
print('Evaluation completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('Accuracy: %d %%' % (test_acc))
print('F1 Score:')
f1 = f1_score(y_true, y_pred, average='macro')
print(f1)
The error trace would make it easier to pinpoint the problem, but my guess is that it comes from passing a nested list (a list of per-batch lists) to f1_score instead of a single flat list. It can be fixed by changing how the final lists are collected:
# Iterate over data.
y_true, y_pred = [], []
with torch.no_grad():
    for inputs, labels in dataloadersTest_dict['Test']:
        inputs = inputs.to(device)
        labels = labels.to(device)
        #outputs = model(inputs)
        predicted_outputs = model(inputs)
        _, predicted = torch.max(predicted_outputs, 1)
        total += labels.size(0)
        print(total)
        correct += (predicted == labels).sum().item()
        print(correct)
        #f1 score
        temp_true = labels.cpu().numpy()      # move to CPU before converting to numpy
        temp_pred = predicted.cpu().numpy()
        y_true += temp_true.tolist()          # extend instead of append, so the lists stay flat
        y_pred += temp_pred.tolist()
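Since torchmetrics was also mentioned in the question, here is a minimal sketch of the same macro F1 computed incrementally on the device, assuming a reasonably recent torchmetrics version; num_classes is a placeholder you would set to your own label count.
import torch
from torchmetrics.classification import MulticlassF1Score

num_classes = 10  # placeholder: set to your actual number of classes
f1_metric = MulticlassF1Score(num_classes=num_classes, average='macro').to(device)

with torch.no_grad():
    for inputs, labels in dataloadersTest_dict['Test']:
        inputs, labels = inputs.to(device), labels.to(device)
        predicted_outputs = model(inputs)
        _, predicted = torch.max(predicted_outputs, 1)
        f1_metric.update(predicted, labels)   # accumulates per batch, no numpy conversion needed

print('F1 Score:', f1_metric.compute().item())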

Why does my confusion matrix report values of zero?

So I tried to do some speech-to-text recognition, and the code successfully predicted the speech data as text output. Then I wanted to use a confusion matrix to evaluate the accuracy of the classification, but the matrix values were 0, and the precision, recall, and F1 score were 0 too.
This is the code:
if types == 1:
    start_index = test_file.find('\\') + 1
    end_index = test_file.rfind('\\') + 1
    print('Predictions: ')
    print()
    print('Audio file:', test_file, start_index, end_index)
    original_label = test_file[end_index:]
    print('Original: {}'.format(original_label.split('/')[1]))
    print('Predicted: {}'.format(predicted_label))
    print()
    y_test.append(original_label.split('/')[1])
    predicted_labels.append(predicted_label)
else:
    start_index = test_file.find('\\') + 1
    end_index = test_file.rfind('\\') + 1
    original_label = test_file[end_index:]
    y_test.append(original_label.split('/')[1])
    predicted_labels.append(predicted_label)

cm = confusion_matrix(y_test, predicted_labels)
print(cm)
print(classification_report(y_test, predicted_labels))
And this is the result of the confusion matrix report:
The Results (screenshot: confusion matrix and classification report, with almost every value 0)
I just want to know why almost all of the values are 0.
thanks!
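For reference, confusion_matrix matches the label strings exactly, so any formatting mismatch between y_test and predicted_labels (a stray path separator, different case, extra whitespace) pushes every sample off the diagonal and drives precision, recall, and F1 to zero. A minimal sketch with made-up labels showing the effect:
from sklearn.metrics import confusion_matrix, classification_report

# Made-up labels: the true labels carry a trailing separator, the predictions do not.
y_test = ['yes\\', 'no\\', 'yes\\']
predicted_labels = ['yes', 'no', 'yes']

# The strings never match, so all counts land off the diagonal
# and every per-class precision/recall/F1 comes out as 0.
print(confusion_matrix(y_test, predicted_labels))
print(classification_report(y_test, predicted_labels))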

You must feed a value for placeholder tensor

I am trying to implement an LSTM network with TensorBoard summaries and I am receiving this error message: You must feed a value for placeholder tensor 'performance_1/loss_summary'.
I have already searched for an answer in many questions without result.
with tf.name_scope('performance'):
    loss = tf.placeholder(tf.float32, shape=None, name='loss_summary')
    tf_loss_summary = tf.summary.scalar('loss', loss)
    tf_accuracy_ph = tf.placeholder(tf.float32, shape=None, name='accuracy_summary')
    tf_accuracy_summary = tf.summary.scalar('accuracy', tf_accuracy_ph)

# Gradient norm summary
for g in gradients:
    for var in v:
        if 'hidden3' in var.name and 'w' in var.name:
            with tf.name_scope('Gradients'):
                tf_last_grad_norm = tf.sqrt(tf.reduce_mean(g**2))
                tf_gradnorm_summary = tf.summary.scalar('grad_norm', tf_last_grad_norm)
                break

# Merge all summaries together
performance_summaries = tf.summary.merge([tf_loss_summary, tf_accuracy_summary])
The other part of the code in which I get the error is:
for ep in range(epochs):
    for step in range(train_seq_length//batch_size):
        u_data, u_labels = data_gen.unroll_batches()
        feed_dict = {}
        for ui, (dat, lbl) in enumerate(zip(u_data, u_labels)):
            feed_dict[train_inputs[ui]] = dat.reshape(-1, 1)
            feed_dict[train_outputs[ui]] = lbl.reshape(-1, 1)
        feed_dict.update({tf_learning_rate: 0.0001, tf_min_learning_rate: 0.000001})
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
    if (ep+1) % valid_summary == 0:
        average_loss = average_loss / (valid_summary * (train_seq_length//batch_size))
        # The average loss
        if (ep+1) % valid_summary == 0:
            print('Average loss at step %d: %f' % (ep+1, average_loss))
        train_mse_ot.append(average_loss)
        average_loss = 0  # reset loss
        predictions_seq = []
        mse_test_loss_seq = []
Thank you in advance.
Initialize loss as a tensor in your model, not as a placeholder. When you define something as a placeholder, you have to feed its value whenever you run any part of the graph that depends on it.
Here loss is a placeholder, so session.run([optimizer, loss], ...) requires a value for it. You probably didn't notice that you overrode your actual loss function with this summary placeholder. Summary inputs are usually separate placeholders with their own names, so there is a mix-up between your variables and the flow in your code.
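A minimal sketch of one way to untangle the two, assuming loss_op is the model's actual loss tensor and summary_writer is an existing tf.summary.FileWriter (both names are placeholders here): the summary placeholder gets its own name and is only fed when the summary op is evaluated.
# Summary inputs live in their own placeholders, separate from the real loss tensor.
with tf.name_scope('performance'):
    tf_loss_ph = tf.placeholder(tf.float32, shape=None, name='loss_summary')
    tf_loss_summary = tf.summary.scalar('loss', tf_loss_ph)
    tf_accuracy_ph = tf.placeholder(tf.float32, shape=None, name='accuracy_summary')
    tf_accuracy_summary = tf.summary.scalar('accuracy', tf_accuracy_ph)
performance_summaries = tf.summary.merge([tf_loss_summary, tf_accuracy_summary])

# Training step: run the model's real loss tensor (loss_op), not the summary placeholder.
_, l = session.run([optimizer, loss_op], feed_dict=feed_dict)
average_loss += l

# Summary step: feed plain Python numbers into the summary placeholders.
summ = session.run(performance_summaries,
                   feed_dict={tf_loss_ph: average_loss,
                              tf_accuracy_ph: accuracy_value})  # accuracy_value: whatever accuracy you track
summary_writer.add_summary(summ, ep)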

TensorFlow using old graph instead of new one

I have retrained two different classification models using retrain.py.
For predicting labels for two images I have created a getLabel method from Label_image.py as follows:
def getLabel(localFile, graphKey, labelKey):
    image_data_str = tf.gfile.FastGFile(localFile, 'rb').read()
    # Loads label file, strips off carriage return
    label_lines = [line.rstrip() for line
                   in tf.gfile.GFile(labelKey)]
    # Unpersists graph from file
    with tf.gfile.FastGFile(graphKey, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')
    sess = tf.Session()
    with sess:
        # Feed the image_data as input to the graph and get first prediction
        softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
        predictions = sess.run(softmax_tensor, {'DecodeJpeg/contents:0': image_data_str})
        # Sort to show labels of first prediction in order of confidence
        top_k = predictions[0].argsort()[-len(predictions[0]):][::-1]
        series = []
        count = 1
        for node_id in top_k:
            human_string = label_lines[node_id]
            if count == 1:
                label = human_string
                count += 1
            score = predictions[0][node_id]
            print('%s (score = %.5f)' % (human_string, score))
            series.append({"name": human_string, "data": [score * 100]})
    sess.close()
    return label, series
And I am calling them as:
label, series = predict.getLabel(localFile, 'graph1.pb', 'labels1.txt')
label, series = predict.getLabel(localFile, 'graph2.pb', 'labels2.txt')
But for the second call it is still using the old graph, i.e. graph1.pb, and it gives the error below, since model 1 has more categories than model 2:
human_string = label_lines[node_id]
IndexError: list index out of range
I am not able to understand why this is happening. Can someone tell me how to load the second graph?
It looks like what is happening is that both calls import their graph into the same default TensorFlow graph, so the second call to predict.getLabel still resolves 'final_result:0' from the first model. What you should do is keep the two models separate, loading each into its own graph and session (e.g. have predict1.getLabel and predict2.getLabel). If you post more of your code, I can provide more detail and code.
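A minimal sketch of that isolation, assuming TF 1.x as in the question: each call builds its model inside its own tf.Graph, so the tensor names from graph1.pb can never shadow those from graph2.pb.
def getLabel(localFile, graphKey, labelKey):
    image_data_str = tf.gfile.FastGFile(localFile, 'rb').read()
    label_lines = [line.rstrip() for line in tf.gfile.GFile(labelKey)]

    # A fresh graph per call, so graph1.pb and graph2.pb never collide on tensor names.
    graph = tf.Graph()
    with graph.as_default():
        with tf.gfile.FastGFile(graphKey, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def, name='')

    # The session is bound to this graph only.
    with tf.Session(graph=graph) as sess:
        softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
        predictions = sess.run(softmax_tensor, {'DecodeJpeg/contents:0': image_data_str})

    top_k = predictions[0].argsort()[::-1]            # all classes, highest score first
    label = label_lines[top_k[0]]
    series = [{"name": label_lines[i], "data": [predictions[0][i] * 100]} for i in top_k]
    return label, series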

Python Random forest and machine learning - improvements

I am quite new to using Python for machine learning. I come from a background of programming in Fortran, so as you may imagine, Python is quite a leap. I work in chemistry and have become involved in cheminformatics (applying data science techniques to chemistry). As such, the application of Python's extensive machine learning libraries is important. I also need my code to be efficient. I have written a script which runs and seems to work OK. What I would like to know is:
1. How best to improve it/make it more efficient.
2. Any suggestions on alternative formulations to those I have used, and if possible a reason why another route may be superior?
I tend to work with continuous data and regression models.
Any suggestions would be great, and thank you in advance.
import scipy.stats   # needed for scipy.stats.linregress below
import math
import numpy as np
import pandas as pd
import plotly.plotly as py
import os.path
import sys
from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

fname = str(raw_input('Please enter the input file name containing total dataset and descriptors (assumes csv file, column headings and first column are labels\n'))
if os.path.isfile(fname):
    SubFeAll = pd.read_csv(fname, sep=",")
else:
    sys.exit("ERROR: input file does not exist")

#SubFeAll = pd.read_csv(fname, sep=",")
SubFeAll = SubFeAll.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor
header = SubFeAll.columns.values # Use the column headers as the descriptor labels
SubFeAll.head()

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

metcount = 0

# Give the array from pandas to numpy
npArray = np.array(SubFeAll)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape

# Print specific nparray values to check the data
print("The first element of the input data set, as a minimal check please ensure this is as expected = %s" % npArray[0,0])

# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1].astype(float)
X = preprocessing.scale(X)
print X.shape

# Open output files
train_name = "Training.csv"
test_name = "Predictions.csv"
fi_name = "Feature_importance.csv"
with open(train_name,'w') as ftrain, open(test_name,'w') as fpred, open(fi_name,'w') as ffeatimp:
    ftrain.write("This file contains the training information for the Random Forest models\n")
    ftrain.write("The code uses a ten fold cross validation, 90% training 10% test at each fold, so ten training sets are used here,\n")
    ftrain.write("Iteration %d ,\n" %(metcount+1))

    fpred.write("This file contains the prediction information for the Random Forest models\n")
    fpred.write("Predictions are made over a ten fold cross validation hence training on 90% test on 10%. The final predictions are returned iteratively over this ten fold cross validation once,\n")
    fpred.write("optimised parameters are located via a grid search at each fold,\n")
    fpred.write("Iteration %d ,\n" %(metcount+1))

    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
    ffeatimp.write("Iteration %d ,\n" %(metcount+1))

    # Begin the K-fold cross validation over ten folds
    kf = KFold(datax, n_folds=10, shuffle=True, random_state=0)
    print "------------------- Beginning Ten Fold Cross Validation -------------------"
    for train, test in kf:
        XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
        ytestdim = yTest.shape[0]
        print("The test set values are : ")
        i = 0
        if ytestdim%5 == 0:
            while i < ytestdim:
                print round(yTest[i],2),'\t', round(yTest[i+1],2),'\t', round(yTest[i+2],2),'\t', round(yTest[i+3],2),'\t', round(yTest[i+4],2)
                ftrain.write(str(round(yTest[i],2))+','+str(round(yTest[i+1],2))+','+str(round(yTest[i+2],2))+','+str(round(yTest[i+3],2))+','+str(round(yTest[i+4],2))+',\n')
                i += 5
        elif ytestdim%4 == 0:
            while i < ytestdim:
                print round(yTest[i],2),'\t', round(yTest[i+1],2),'\t', round(yTest[i+2],2),'\t', round(yTest[i+3],2)
                ftrain.write(str(round(yTest[i],2))+','+str(round(yTest[i+1],2))+','+str(round(yTest[i+2],2))+','+str(round(yTest[i+3],2))+',\n')
                i += 4
        elif ytestdim%3 == 0:
            while i < ytestdim:
                print round(yTest[i],2),'\t', round(yTest[i+1],2),'\t', round(yTest[i+2],2)
                ftrain.write(str(round(yTest[i],2))+','+str(round(yTest[i+1],2))+','+str(round(yTest[i+2],2))+',\n')
                i += 3
        elif ytestdim%2 == 0:
            while i < ytestdim:
                print round(yTest[i],2), '\t', round(yTest[i+1],2)
                ftrain.write(str(round(yTest[i],2))+','+str(round(yTest[i+1],2))+',\n')
                i += 2
        else:
            while i < ytestdim:
                print round(yTest[i],2)
                ftrain.write(str(round(yTest[i],2))+',\n')
                i += 1
        print "\n"
        # random forest grid search parameters
        print "------------------- Beginning Random Forest Grid Search -------------------"
        rfparamgrid = {"n_estimators": [10], "max_features": ["auto", "sqrt", "log2"], "max_depth": [5,7]}
        rf = RandomForestRegressor(random_state=0, n_jobs=2)
        RfGridSearch = GridSearchCV(rf, param_grid=rfparamgrid, scoring='mean_squared_error', cv=10)
        start = time()
        RfGridSearch.fit(XTrain, yTrain)

        # Get best random forest parameters
        print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start, len(RfGridSearch.grid_scores_)))
        RFtime = time() - start, len(RfGridSearch.grid_scores_)
        #print(RfGridSearch.grid_scores_) # Diagnostics
        print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
        ne = RfGridSearch.best_params_['n_estimators']
        print("max_features = %s " % RfGridSearch.best_params_['max_features'])
        mf = RfGridSearch.best_params_['max_features']
        print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
        md = RfGridSearch.best_params_['max_depth']

        ftrain.write("Random Forest")
        ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
        ftrain.write("Number of Trees, %s ,\n" % str(ne))
        ftrain.write("Number of features at split, %s ,\n" % str(mf))
        ftrain.write("Max depth of tree, %s ,\n" % str(md))

        # Train random forest and predict with optimised parameters
        print("\n\n------------------- Starting optimised RF training -------------------")
        optRF = RandomForestRegressor(n_estimators=ne, max_features=mf, max_depth=md, random_state=0)
        optRF.fit(XTrain, yTrain) # Train the model
        RFfeatimp = optRF.feature_importances_
        indices = np.argsort(RFfeatimp)[::-1]
        print("Training R2 = %5.2f" % optRF.score(XTrain, yTrain))
        print("Starting optimised RF prediction")
        RFpreds = optRF.predict(XTest)
        print("The predicted values now follow :")
        RFpredsdim = RFpreds.shape[0]
        i = 0
        if RFpredsdim%5 == 0:
            while i < RFpredsdim:
                print round(RFpreds[i],2),'\t', round(RFpreds[i+1],2),'\t', round(RFpreds[i+2],2),'\t', round(RFpreds[i+3],2),'\t', round(RFpreds[i+4],2)
                i += 5
        elif RFpredsdim%4 == 0:
            while i < RFpredsdim:
                print round(RFpreds[i],2),'\t', round(RFpreds[i+1],2),'\t', round(RFpreds[i+2],2),'\t', round(RFpreds[i+3],2)
                i += 4
        elif RFpredsdim%3 == 0:
            while i < RFpredsdim:
                print round(RFpreds[i],2),'\t', round(RFpreds[i+1],2),'\t', round(RFpreds[i+2],2)
                i += 3
        elif RFpredsdim%2 == 0:
            while i < RFpredsdim:
                print round(RFpreds[i],2), '\t', round(RFpreds[i+1],2)
                i += 2
        else:
            while i < RFpredsdim:
                print round(RFpreds[i],2)
                i += 1
        print "\n"
        RFr2.append(optRF.score(XTest, yTest))
        RFmse.append(metrics.mean_squared_error(yTest, RFpreds))
        RFrmse.append(math.sqrt(RFmse[metcount]))
        print("Random Forest prediction statistics for fold %d are; MSE = %5.2f RMSE = %5.2f R2 = %5.2f\n\n" % (metcount+1, RFmse[metcount], RFrmse[metcount], RFr2[metcount]))
        ftrain.write("Random Forest prediction statistics for fold %d are, MSE =, %5.2f, RMSE =, %5.2f, R2 =, %5.2f,\n\n" % (metcount+1, RFmse[metcount], RFrmse[metcount], RFr2[metcount]))

        ffeatimp.write("Feature importance rankings from random forest,\n")
        for i in range(RFfeatimp.shape[0]):
            ffeatimp.write("%d. , feature %d , %s, (%f),\n" % (i + 1, indices[i], npheader[indices[i]], RFfeatimp[indices[i]]))

        # Store prediction in original order of data (itest) whilst following through the current test set order (j)
        metcount += 1
        ftrain.write("Fold %d, \n" %(metcount))
        print "------------------- Next Fold %d -------------------" %(metcount+1)
        j = 0
        for itest in test:
            RFpredictions.append(RFpreds[j])
            j += 1

    lennames = names.shape[0]
    lenpredictions = len(RFpredictions)
    lentrue = y.shape[0]
    if lennames == lenpredictions == lentrue:
        fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
        for i in range(0, lennames):
            fpred.write(str(names[i])+",,"+str(RFpredictions[i])+",,"+str(y[i])+",\n")
    else:
        fpred.write("ERROR - names, prediction and true value array size mismatch. Dumping arrays for manual inspection in predictions.csv\n")
        fpred.write("Array printed in the order names/Labels, predictions RF and true values\n")
        fpred.write(names+"\n")
        fpred.write(RFpredictions+"\n")
        fpred.write(y+"\n")
        sys.exit("ERROR - names, prediction and true value array size mismatch. Dumping arrays for manual inspection in predictions.csv")

    print "Final averaged Random Forest metrics : "
    RFamse = sum(RFmse)/10
    RFmse_sd = np.std(RFmse)
    RFarmse = sum(RFrmse)/10
    RFrmse_sd = np.std(RFrmse)
    RFslope, RFintercept, RFr_value, RFp_value, RFstd_err = scipy.stats.linregress(RFpredictions, y)
    RFR2 = RFr_value**2
    print "Average Mean Squared Error = ", RFamse, " +/- ", RFmse_sd
    print "Average Root Mean Squared Error = ", RFarmse, " +/- ", RFrmse_sd
    print "R2 Final prediction against True values = ", RFR2
    fpred.write("\n")
    fpred.write("FINAL PREDICTION STATISTICS,\n")
    fpred.write("Random Forest average MSE, %s, +/-, %s,\n" %(str(RFamse), str(RFmse_sd)))
    fpred.write("Random Forest average RMSE, %s, +/-, %s,\n" %(str(RFarmse), str(RFrmse_sd)))
    fpred.write("Random Forest slope, %s, Random Forest intercept, %s,\n" %(str(RFslope), str(RFintercept)))
    fpred.write("Random Forest standard error, %s,\n" %(str(RFstd_err)))
    fpred.write("Random Forest R, %s,\n" %(str(RFr_value)))
    fpred.write("Random Forest R2, %s,\n" %(str(RFR2)))

    ftrain.close()
    fpred.close()
    ffeatimp.close()
You can also add feature selection to your pipeline:
scikit-learn feature selection
Several feature selection techniques are provided in scikit-learn, and you can use them to improve some aspects of your data mining project. A sketch is given below.
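As a minimal sketch of one such technique (reusing XTrain/yTrain/XTest from the question; an illustration only, not tied to the exact scikit-learn version used there), SelectFromModel can prune descriptors using the forest's own feature importances before the grid search:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Keep only descriptors whose importance is above the median importance.
selector = SelectFromModel(RandomForestRegressor(n_estimators=100, random_state=0),
                           threshold='median')
selector.fit(XTrain, yTrain)

XTrain_reduced = selector.transform(XTrain)   # reduced descriptor matrices
XTest_reduced = selector.transform(XTest)
print(XTrain_reduced.shape, XTest_reduced.shape)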
