How to import a Python script as a module on Colab - python

Good day everyone. I want to ask how I can import a Python script, configs.py, into another script in Colab. It is listed as an import in the other script, app.py. How can I achieve this?
Here is the configs.py file
from attrdict import AttrDict # type: ignore
config = {
"encoder_path": "models/encoder_model.bin",
"decoder_path": "models/decoder_model.bin",
"input_word_index": "pickles/input_word_index.pkl",
"target_word_index": "pickles/target_word_index.pkl",
"url": "https://api.mymemory.translated.net/get",
"max_length_src": 47,
"max_length_tar": 47,
}
config = AttrDict(config)
Meanwhile, this is the other script, app.py:
import functools
import math
import pickle
import re
import string

import numpy as np  # type: ignore
import requests
from tensorflow.keras.models import load_model  # type: ignore

from configs import config

# CONSTANTS
max_length_src = config["max_length_src"]
max_length_tar = config["max_length_tar"]

print("[INFO] Loading Word Indexes ...")
with open(config["input_word_index"], "rb") as file:
    input_token_index = pickle.load(file)
with open(config["target_word_index"], "rb") as file:
    target_token_index = pickle.load(file)

reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

print("[INFO] Loading Encoder & Decoder ...")
encoder_model = load_model(config["encoder_path"])
decoder_model = load_model(config["decoder_path"])


def clean(input_seq):
    input_seq = input_seq.lower()
    input_seq = re.sub("'", "", input_seq)
    input_seq = "".join(ch for ch in input_seq if ch not in set(string.punctuation))
    input_seq = input_seq.strip()
    return input_seq


def get_input_seq(input_seq):
    input_seq = clean(input_seq)
    encoder_input_data = np.zeros((1, max_length_src), dtype="float32")
    for t, word in enumerate(input_seq.split()):
        encoder_input_data[0, t] = input_token_index[word]
    return encoder_input_data

@functools.lru_cache(maxsize=128)
def decode_sequence(input_seq):
    input_seq = get_input_seq(input_seq)
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    # use `START_` as the first character
    target_seq[0, 0] = target_token_index["START_"]
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Sampling a token with max probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += " " + sampled_char
        # Exit condition: either hit max length or find stop character.
        if sampled_char == "_END" or len(decoded_sentence) > max_length_tar:
            stop_condition = True
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        # Update states
        states_value = [h, c]
    return decoded_sentence

def beam_search_decoder(predictions, top_k):
    # start with an empty sequence with zero score
    output_sequences = [([], 0)]
    # looping through all the predictions
    for token_probs in predictions:
        new_sequences = []
        # append new tokens to old sequences and re-score
        for old_seq, old_score in output_sequences:
            for char_index in range(len(token_probs)):
                new_seq = old_seq + [char_index]
                # considering log-likelihood for scoring
                new_score = old_score + math.log(token_probs[char_index])
                new_sequences.append((new_seq, new_score))
        # sort all new sequences in decreasing order of their score
        output_sequences = sorted(new_sequences, key=lambda val: val[1], reverse=True)
        # select the top-k sequences based on score
        # *Note - the best sequence is the one with the highest score
        output_sequences = output_sequences[:top_k]
    return output_sequences

@functools.lru_cache(maxsize=128)
def decode_sequence_beam_search(input_seq, beam_width=3):
    probabilities = []
    # Encode the input as state vectors.
    input_seq = get_input_seq(input_seq)
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index["START_"]
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Sampling a token with max probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        probabilities.append(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += " " + sampled_char
        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "_END" or len(decoded_sentence) > max_length_tar:
            stop_condition = True
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        # Update states
        states_value = [h, c]
    # storing multiple results
    outputs = []
    beam_search_preds = beam_search_decoder(probabilities, top_k=beam_width)
    for prob_indexes, score in beam_search_preds:
        decoded_sentence = ""
        for index in prob_indexes:
            sampled_char = reverse_target_char_index[index]
            decoded_sentence += " " + sampled_char
            if sampled_char == "_END" or len(decoded_sentence) > max_length_tar:
                break
        outputs.append(decoded_sentence)
    return outputs
Please help me resolve this issue with importing Python scripts on Google Colaboratory. Thank you.
This is the error I got:
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-5-2048e9210135> in <module>()
/content/app.py in <module>()
9 from tensorflow.keras.models import load_model # type: ignore
10
---> 11 from configs import config
12
13 # CONSTANTS
ModuleNotFoundError: No module named 'configs'
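For reference, a minimal sketch of the usual fix (added here as a hedged suggestion, not part of the original thread): configs.py has to live in a directory Python searches, normally the same folder as app.py. In Colab that means either creating the file in the working directory (for example with the %%writefile configs.py cell magic) or appending the folder that contains it to sys.path before the import runs.
import sys

# "/content" is Colab's default working directory; the exact folder that holds
# configs.py is an assumption here - point this at wherever you uploaded it.
sys.path.append("/content")

from configs import config          # resolves directly
import app                          # app.py's own "from configs import config" now works too
print(config["max_length_src"])     # 47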

Related

How to fix broken data in feature extraction/pre-processing in speech recognition?

I am very new to machine learning. I stumbled on this source code on GitHub that has no dataset, so I decided to use my own. The code recognizes speakers with MFCC features and a GMM-UBM. But when I try to run it, I get this error: "ValueError: Found array with 1 sample(s) (shape=(1, 13)) while a minimum of 2 is required". It seems that when the code tries to fit the GMM on the 68th entry of the dataset, the MFCC data for that entry is broken, so I assume something is wrong in the feature extraction process.
Please help me! Thank you very much.
Here's the code
import python_speech_features as psf
from sklearn.mixture import GaussianMixture
from sklearn.externals import joblib
from scipy.io import wavfile
from functools import reduce
import numpy as np
from os import listdir
from os.path import isfile, join
import os
import re
DATA_PATH = 'dataCoba'
# Make a list of speakers from the newdata/data folder. The format for the files in the folder is
# name_1,wav for training and name_2.wav for testing
substring = "_2"
onlyfiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]
onlyfiles.sort()
onlyones = []
for filename in onlyfiles:
dups = re.search('[\w]+_2.wav', filename)
#dups = re.search('[\w].wav', filename)
if dups is None:
onlyones.append(''.join(filename.split('_')[0]))
print(onlyones)
SPEAKERS = onlyones
TOTAL_SPEAKERS = len(SPEAKERS)
MODEL_SPEAKERS = len(SPEAKERS)
print(len(SPEAKERS))
class SpeakerRecognition:
# Create a GMM and UBM model for each speaker. The GMM is modelled after the speaker and UBM for each speaker
# is modelled after all the other speakers. Likelihood Ratio test is used to verify speaker
def setGMMUBM(self, no_components):
self.GMM = []
self.UBM = []
for i in range(MODEL_SPEAKERS):
self.GMM.append(GaussianMixture(n_components= no_components, covariance_type= 'diag'))
self.UBM.append(GaussianMixture(n_components= no_components, covariance_type= 'diag'))
# Load in data from .wav files in data/
# Extract mfcc (first 13 coefficients) from each audio sample
def load_data(self):
#training
self.spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav','')) + '_1.wav') for i in SPEAKERS]
self.spk_mfcc = [psf.mfcc(self.spk[i][1], self.spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
#testing
self.p_spk = [wavfile.read(DATA_PATH + '/' + (str(i).replace('.wav','')) + '_2.wav') for i in SPEAKERS]
self.p_spk_mfcc = [psf.mfcc(self.p_spk[i][1], self.p_spk[i][0]) for i in range(0, TOTAL_SPEAKERS)]
print(self.spk_mfcc)
for i in range(TOTAL_SPEAKERS):
self.spk_train_size.append(len(self.spk_mfcc[i]))
self.spk_start.append(len(self.total_mfcc))
print("Speaker Number(train) = ",i)
print ("self.spk_mfcc[i] = ", len(self.spk_mfcc[i]))
for mfcc in self.spk_mfcc[i]:
self.total_mfcc.append(mfcc)
self.speaker_label.append(i)
self.spk_end.append(len(self.total_mfcc))
print("self.total_mfcc = ", len(self.total_mfcc))
print("\n")
for i in range(TOTAL_SPEAKERS):
#print("self.p_spk_mfcc =", self.p_spk_mfcc)
self.spk_test_size.append(len(self.p_spk_mfcc[i]))
self.spk_start.append(len(self.p_total_mfcc))
print("Speaker Num(test) = ",i)
print("self.p_spk_mfcc = ",len(self.p_spk_mfcc[i]))
print("MFCC Shape = ",self.spk_mfcc[i].shape)
for mfcc in self.p_spk_mfcc[i]:
self.p_total_mfcc.append(mfcc)
self.p_speaker_label.append(i)
self.p_spk_end.append(len(self.p_total_mfcc))
print("self.total_mfcc = ", len(self.p_total_mfcc))
print("\n")
# Gaussian Mixture Model is made of a number of Gaussian distribution components.
# To model the data, a suitable number of Gaussian components has to be selected.
# There is no method for finding this. It is done by trial and error. This runs
# the program for different values of component and records accuracy for each one
def find_best_params(self):
best_no_components = 1
maxacc = 0
for i in range(100, 256):
self.setGMMUBM(i)
self.fit_model()
_, acc, _ = self.predict()
print("Accuracy for n = {} is {}".format(i, acc))
if acc > maxacc:
maxacc = acc
best_no_components = i
return best_no_components
# Fit the GMM UBM models with training data
# fit = N buah data * dimensi data
def fit_model(self):
for i in range(MODEL_SPEAKERS):
print("Fit start for {}".format(i))
self.GMM[i].fit(self.spk_mfcc[i])
print(self.spk_mfcc[i].shape)
self.UBM[i].fit(self.total_mfcc[:self.spk_start[i]] + self.total_mfcc[self.spk_end[i]:])
print("Fit end for {}".format(i))
joblib.dump(self.UBM[i], 'dumps/new/ubm' + str(i) + '.pkl')
joblib.dump(self.GMM[i], 'dumps/new/gmm' + str(i) + '.pkl')
def model(self, no_components = 244):
self.setGMMUBM(no_components)
self.fit_model()
# Predict the output for each model for each speaker and produce confusion matrix
def load_model(self):
for i in range(0, MODEL_SPEAKERS):
self.GMM.append(joblib.load('dumps/new/gmm' + str(i) + '.pkl'))
self.UBM.append(joblib.load('dumps/new/ubm' + str(i) + '.pkl'))
def predict(self):
avg_accuracy = 0
confusion = [[ 0 for y in range(MODEL_SPEAKERS) ] for x in range(TOTAL_SPEAKERS)]
for i in range(TOTAL_SPEAKERS):
for j in range(MODEL_SPEAKERS):
x = self.GMM[j].score_samples(self.p_spk_mfcc[i]) - self.UBM[j].score_samples(self.p_spk_mfcc[i])
for score in x :
if score > 0:
confusion[i][j] += 1
confusion_diag = [confusion[i][i] for i in range(MODEL_SPEAKERS)]
diag_sum = 0
for item in confusion_diag:
diag_sum += item
remain_sum = 0
for i in range(MODEL_SPEAKERS):
for j in range(MODEL_SPEAKERS):
if i != j:
remain_sum += confusion[i][j]
spk_accuracy = 0
for i in range(MODEL_SPEAKERS):
best_guess, _ = max(enumerate(confusion[i]), key=lambda p: p[1])
print("For Accent {}, best guess is {}".format(SPEAKERS[i], SPEAKERS[best_guess]))
if i == best_guess:
spk_accuracy += 1
#print(MODEL_SPEAKERS)
spk_accuracy /= MODEL_SPEAKERS
avg_accuracy = diag_sum/(remain_sum+diag_sum)
return confusion, avg_accuracy, spk_accuracy
def __init__(self):
self.test_spk = []
self.test_mfcc = []
# Speaker data and corresponding mfcc
self.spk = []
self.spk_mfcc = []
self.p_spk = []
self.p_spk_mfcc = []
# Holds all the training mfccs of all speakers and
# speaker_label is the speaker label for the corresponding mfcc
self.total_mfcc = []
self.speaker_label = []
self.spk_train_size = [] # Index upto which is training data for that speaker.
self.p_total_mfcc = []
self.p_speaker_label = []
#print(self.p_speaker_label)
self.spk_test_size = []
# Since the length of all the audio files are different, spk_start and spk_end hold
self.spk_start = []
self.spk_end = []
self.p_spk_start = []
self.p_spk_end = []
self.GMM = []
self.UBM = []
self.load_data()
self.cepstral_mean_subtraction()
# Cepstral Mean Subtraction (Feature Normalization step)
def cepstral_mean_subtraction(self):
for i, speaker_mfcc in enumerate(self.spk_mfcc):
average = reduce(lambda acc, ele: acc + ele, speaker_mfcc)
average = list(map(lambda x: x/len(speaker_mfcc), average))
for j, feature_vector in enumerate(speaker_mfcc):
for k, feature in enumerate(feature_vector):
self.spk_mfcc[i][j][k] -= average[k]
for i, speaker_mfcc in enumerate(self.p_spk_mfcc):
average = reduce(lambda acc, ele: acc + ele, speaker_mfcc)
average = list(map(lambda x: x / len(speaker_mfcc), average))
for j, feature_vector in enumerate(speaker_mfcc):
for k, feature in enumerate(feature_vector):
self.p_spk_mfcc[i][j][k] -= average[k]
#TBD : Ten fold validation
def ten_fold():
#fold_size = 0.1 * self.n
fold_offset = 0.0
accuracy_per_fold = 0
average_accuracy = 0
for i in range(0, 10):
print("Fold start is {} and fold end is {} ".format( fold_offset, fold_offset + fold_size))
#accuracy = self.execute(int(fold_offset), int(fold_offset + fold_size))
#print("Accuracy is of test {} is : {} ".format(i, accuracy))
#average_accuracy += accuracy
#fold_offset += fold_size
average_accuracy /= 10.0
print("Average accuracy " + str(100 * average_accuracy))
return average_accuracy
# Final result is a confusion matrix which represents the accuracy of the fit of the model
if __name__ == '__main__':
SR = SpeakerRecognition()
#SR.load_model()
SR.setGMMUBM(no_components=13)
#SR.find_best_params()
SR.fit_model()
confusion, mfcc_accuracy, spk_accuracy = SR.predict()
print("Confusion Matrix")
print(np.matrix(confusion))
print("Accuracy in predicting speakers : {}".format(spk_accuracy))
print("Accuracy in testing for MFCC : {}".format(mfcc_accuracy))

IndexError: index 82459 is out of bounds for axis 0 with size 82459

I am trying to run code (found here) for a visual question generation model. I am running the code using Windows Subsystem for Linux, in an Anaconda virtual environment for Python 2.7. I am using Tensorflow v1.3.0, as I experienced issues using more recent versions of Tensorflow -- the repository is relatively old.
I am receiving the following error (full traceback included):
Traceback (most recent call last):
File "main.py", line 70, in <module>
tf.app.run()
File "/home/username/anaconda2/envs/py27/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "main.py", line 64, in main
model.train()
File "/home/username/VQG-tensorflow/question_generator.py", line 124, in train
feats = self.img_feature[img_list,:]
IndexError: index 82459 is out of bounds for axis 0 with size 82459
I've included the source code for main.py and question_generator.py below. Obviously, the program is trying to access an index that doesn't exist. I can't figure out what would make it behave this way. Similar questions to this one (like this and this) were not helpful. I tried padding the array using the numpy.pad method, but that only led to a different and related error:
ValueError: Cannot feed value of shape (256, 4097) for Tensor u'Placeholder:0', which has shape '(256, 4096)'
Any and all help is greatly appreciated!
Source code for main.py:
#-*- coding: utf-8 -*-
import math
import os
import tensorflow as tf
import numpy as np
import cPickle
import skimage
import pprint
import tensorflow.python.platform
from keras.preprocessing import sequence
from data_loader import *
import vgg19
import question_generator
flags = tf.app.flags
pp = pprint.PrettyPrinter().pprint
tf.app.flags.DEFINE_string('input_img_h5', './data_img.h5', 'path to the h5file containing the image feature')
tf.app.flags.DEFINE_string('input_ques_h5', './data_prepro.h5', 'path to the h5file containing the preprocessed dataset')
tf.app.flags.DEFINE_string('input_json', './data_prepro.json', 'path to the json file containing additional info and vocab')
tf.app.flags.DEFINE_string('model_path', './models/', 'where should we save')
tf.app.flags.DEFINE_string('vgg_path', './vgg16.tfmodel', 'momentum for adam')
tf.app.flags.DEFINE_string('gpu_fraction', '2/3', 'define the gpu fraction used')
tf.app.flags.DEFINE_string('test_image_path', './assets/demo.jpg', 'the image you want to generate question')
tf.app.flags.DEFINE_string('test_model_path', './models/model-250', 'model we saved')
tf.app.flags.DEFINE_integer('batch_size', 256, 'tch_size for each iterations')
tf.app.flags.DEFINE_integer('dim_embed', 512, 'word embedding size')
tf.app.flags.DEFINE_integer('dim_hidden', 512, 'hidden size')
tf.app.flags.DEFINE_integer('dim_image', 4096, 'dimension of output from fc7')
tf.app.flags.DEFINE_integer('img_norm', 1, 'do normalization on image or not')
tf.app.flags.DEFINE_integer('maxlen', 26, 'max length of question')
tf.app.flags.DEFINE_integer('n_epochs', 250, 'how many epochs are we going to train')
tf.app.flags.DEFINE_float('learning_rate', '0.001', 'learning rate for adam')
tf.app.flags.DEFINE_float('momentum', 0.9, 'momentum for adam')
tf.app.flags.DEFINE_boolean('is_train', 'True', 'momentum for adam')
conf = flags.FLAGS
def calc_gpu_fraction(fraction_string):
idx, num = fraction_string.split('/')
idx, num = float(idx), float(num)
fraction = 1 / (num - idx + 1)
print " [*] GPU : %.4f" % fraction
return fraction
def main(_):
attrs = conf.__dict__['__flags']
pp(attrs)
dataset, img_feature, train_data = get_data(conf.input_json, conf.input_img_h5, conf.input_ques_h5, conf.img_norm)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=calc_gpu_fraction(conf.gpu_fraction))
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
model = question_generator.Question_Generator(sess, conf, dataset, img_feature, train_data)
if conf.is_train:
model.build_model()
model.train()
else:
model.build_generator()
model.test(test_image_path=conf.test_image_path, model_path=conf.test_model_path, maxlen=26)
if __name__ == '__main__':
tf.app.run()
Source code for question_generator.py:
import os
import tensorflow as tf
import numpy as np
import tensorflow.python.platform
from keras.preprocessing import sequence
from data_loader import *
import vgg19
tf.pack = tf.stack
tf.select = tf.where
tf.batch_matmul = tf.matmul
class Question_Generator():
def __init__(self, sess, conf, dataset, img_feature, train_data):
self.sess = sess
self.dataset = dataset
self.img_feature = img_feature
self.train_data = train_data
self.dim_image = conf.dim_image
self.dim_embed = conf.dim_embed
self.dim_hidden = conf.dim_hidden
self.batch_size = conf.batch_size
self.maxlen = conf.maxlen
self.n_lstm_steps = conf.maxlen+2
self.model_path = conf.model_path
if conf.is_train:
self.n_epochs = conf.n_epochs
self.learning_rate = conf.learning_rate
self.num_train = train_data['question'].shape[0] # total number of data
self.n_words = len(dataset['ix_to_word'].keys()) # vocabulary_size
# word embedding
self.Wemb = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='Wemb')
self.bemb = tf.Variable(tf.random_uniform([self.dim_embed], -0.1, 0.1), name='bemb')
# LSTM
self.lstm = tf.contrib.rnn.BasicLSTMCell(self.dim_hidden)
#self.lstm = tf.nn.rnn_cell.BasicLSTMCell(self.dim_hidden)
# fc7 encoder
self.encode_img_W = tf.Variable(tf.random_uniform([self.dim_image, self.dim_hidden], -0.1, 0.1), name='encode_img_W')
self.encode_img_b = tf.Variable(tf.random_uniform([self.dim_hidden], -0.1, 0.1), name='encode_img_b')
# feat -> word
self.embed_word_W = tf.Variable(tf.random_uniform([self.dim_hidden, self.n_words], -0.1, 0.1), name='embed_word_W')
self.embed_word_b = tf.Variable(tf.random_uniform([self.n_words], -0.1, 0.1), name='embed_word_b')
def build_model(self):
self.image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])
self.question = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
self.mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])
image_emb = tf.nn.xw_plus_b(self.image, self.encode_img_W, self.encode_img_b) # (batch_size, dim_hidden)
state = self.lstm.zero_state(self.batch_size,tf.float32)
loss = 0.0
with tf.variable_scope("RNN"):
for i in range(self.n_lstm_steps):
if i == 0:
current_emb = image_emb
else:
tf.get_variable_scope().reuse_variables()
current_emb = tf.nn.embedding_lookup(self.Wemb, self.question[:,i-1]) + self.bemb
# LSTM
output, state = self.lstm(current_emb, state)
if i > 0:
# ground truth
labels = tf.expand_dims(self.question[:, i], 1)
indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
concated = tf.concat([indices, labels], 1)
#concated = tf.concat(1, [indices, labels])
onehot_labels = tf.sparse_to_dense(
concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0)
# predict word
logit_words = tf.nn.xw_plus_b(output, self.embed_word_W, self.embed_word_b)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=onehot_labels)
cross_entropy = cross_entropy * self.mask[:,i]
current_loss = tf.reduce_sum(cross_entropy)
loss = loss + current_loss
self.loss = loss / tf.reduce_sum(self.mask[:,1:])
def build_generator(self):
self.image = tf.placeholder(tf.float32, [1, self.dim_image]) # only one image
image_emb = tf.nn.xw_plus_b(self.image, self.encode_img_W, self.encode_img_b)
state = tf.zeros([1, self.lstm.state_size])
self.generated_words = []
with tf.variable_scope("RNN"):
output, state = self.lstm(image_emb, state)
last_word = tf.nn.embedding_lookup(self.Wemb, [0]) + self.bemb
for i in range(self.maxlen):
tf.get_variable_scope().reuse_variables()
output, state = self.lstm(last_word, state)
logit_words = tf.nn.xw_plus_b(output, self.embed_word_W, self.embed_word_b)
max_prob_word = tf.argmax(logit_words, 1)
last_word = tf.nn.embedding_lookup(self.Wemb, max_prob_word)
last_word += self.bemb
self.generated_words.append(max_prob_word)
def train(self):
index = np.arange(self.num_train)
np.random.shuffle(index)
questions = self.train_data['question'][index,:]
img_list = self.train_data['img_list'][index]
print("img feature length: " + str(len(self.img_feature)))
print("img list: " + str(img_list))
#self.img_feature = np.pad(self.img_feature, (0,1),'constant', constant_values=(0,0)) #pad array to prevent bug
print("img feature length: " + str(len(self.img_feature)))
feats = self.img_feature[img_list,:]
self.saver = tf.train.Saver(max_to_keep=50)
train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
tf.initialize_all_variables().run()
for epoch in range(self.n_epochs):
counter = 0
for start, end in zip( \
range(0, len(feats), self.batch_size),
range(self.batch_size, len(feats), self.batch_size)
):
current_feats = feats[start:end]
current_questions = questions[start:end]
current_question_matrix = sequence.pad_sequences(current_questions, padding='post', maxlen=self.maxlen+1)
current_question_matrix = np.hstack( [np.full( (len(current_question_matrix),1), 0), current_question_matrix] ).astype(int)
current_mask_matrix = np.zeros((current_question_matrix.shape[0], current_question_matrix.shape[1]))
nonzeros = np.array( map(lambda x: (x != 0).sum()+2, current_question_matrix ))
# +2 -> #START# and '.'
for ind, row in enumerate(current_mask_matrix):
row[:nonzeros[ind]] = 1
_, loss_value = self.sess.run([train_op, self.loss], feed_dict={
self.image: current_feats,
self.question : current_question_matrix,
self.mask : current_mask_matrix
})
if np.mod(counter, 100) == 0:
print "Epoch: ", epoch, " batch: ", counter ," Current Cost: ", loss_value
counter = counter + 1
if np.mod(epoch, 25) == 0:
print "Epoch ", epoch, " is done. Saving the model ... "
self.save_model(epoch)
def test(self, test_image_path, model_path, maxlen):
ixtoword = self.dataset['ix_to_word']
images = tf.placeholder("float32", [1, 224, 224, 3])
image_val = read_image(test_image_path)
vgg = vgg19.Vgg19()
with tf.name_scope("content_vgg"):
vgg.build(images)
fc7 = self.sess.run(vgg.relu7, feed_dict={images:image_val})
saver = tf.train.Saver()
saver.restore(self.sess, model_path)
generated_word_index = self.sess.run(self.generated_words, feed_dict={self.image:fc7})
generated_word_index = np.hstack(generated_word_index)
generated_sentence = ''
for x in generated_word_index:
if x==0:
break
word = ixtoword[str(x)]
generated_sentence = generated_sentence + ' ' + word
print ' '
print '--------------------------------------------------------------------------------------------------------'
print generated_sentence
def save_model(self, epoch):
if not os.path.exists(self.model_path):
os.makedirs(self.model_path)
self.saver.save(self.sess, os.path.join(self.model_path, 'model'), global_step=epoch)
This is a really basic problem: arrays (Python lists and NumPy arrays) are 0-indexed. If you have an array of length n, the valid indices run from 0 to n-1, so trying to access index n raises an index error.
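For illustration, a tiny sketch with made-up data (not the poster's actual arrays) showing the same failure and a common remedy when the stored indices are 1-based:
import numpy as np

img_feature = np.zeros((82459, 4096))   # axis 0 has size 82459, valid indices 0..82458
img_list = np.array([1, 17, 82459])     # 82459 is one past the end

# feats = img_feature[img_list, :]      # IndexError: index 82459 is out of bounds for axis 0
feats = img_feature[img_list - 1, :]    # only correct if img_list really is 1-based
print(feats.shape)                      # (3, 4096)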

Windows/Python Error WindowsError: [Error 3] The system cannot find the path specified

Hi, I am new to Python and I need some help. I am trying to run a file on Windows 10 with Python 2.7.
import os
import re
import codecs
import numpy as np
import theano
models_path = "./models"
eval_path = "./evaluation"
eval_temp = os.path.join(eval_path, "temp")
eval_script = os.path.join(eval_path, "conlleval")
def get_name(parameters):
"""
Generate a model name from its parameters.
"""
l = []
for k, v in parameters.items():
if type(v) is str and "/" in v:
l.append((k, v[::-1][:v[::-1].index('/')][::-1]))
else:
l.append((k, v))
name = ",".join(["%s=%s" % (k, str(v).replace(',', '')) for k, v in l])
return "".join(i for i in name if i not in "\/:*?<>|")
def set_values(name, param, pretrained):
"""
Initialize a network parameter with pretrained values.
We check that sizes are compatible.
"""
param_value = param.get_value()
if pretrained.size != param_value.size:
raise Exception(
"Size mismatch for parameter %s. Expected %i, found %i."
% (name, param_value.size, pretrained.size)
)
param.set_value(np.reshape(
pretrained, param_value.shape
).astype(np.float32))
def shared(shape, name):
"""
Create a shared object of a numpy array.
"""
if len(shape) == 1:
value = np.zeros(shape) # bias are initialized with zeros
else:
drange = np.sqrt(6. / (np.sum(shape)))
value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
return theano.shared(value=value.astype(theano.config.floatX), name=name)
def create_dico(item_list):
"""
Create a dictionary of items from a list of list of items.
"""
assert type(item_list) is list
dico = {}
for items in item_list:
for item in items:
if item not in dico:
dico[item] = 1
else:
dico[item] += 1
return dico
def create_mapping(dico):
"""
Create a mapping (item to ID / ID to item) from a dictionary.
Items are ordered by decreasing frequency.
"""
sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
item_to_id = {v: k for k, v in id_to_item.items()}
return item_to_id, id_to_item
def zero_digits(s):
"""
Replace every digit in a string by a zero.
"""
return re.sub('\d', '0', s)
def iob2(tags):
"""
Check that tags have a valid IOB format.
Tags in IOB1 format are converted to IOB2.
"""
for i, tag in enumerate(tags):
if tag == 'O':
continue
split = tag.split('-')
if len(split) != 2 or split[0] not in ['I', 'B']:
return False
if split[0] == 'B':
continue
elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2
tags[i] = 'B' + tag[1:]
elif tags[i - 1][1:] == tag[1:]:
continue
else: # conversion IOB1 to IOB2
tags[i] = 'B' + tag[1:]
return True
def iob_iobes(tags):
"""
IOB -> IOBES
"""
new_tags = []
for i, tag in enumerate(tags):
if tag == 'O':
new_tags.append(tag)
elif tag.split('-')[0] == 'B':
if i + 1 != len(tags) and \
tags[i + 1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('B-', 'S-'))
elif tag.split('-')[0] == 'I':
if i + 1 < len(tags) and \
tags[i + 1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('I-', 'E-'))
else:
raise Exception('Invalid IOB format!')
return new_tags
def iobes_iob(tags):
"""
IOBES -> IOB
"""
new_tags = []
for i, tag in enumerate(tags):
if tag.split('-')[0] == 'B':
new_tags.append(tag)
elif tag.split('-')[0] == 'I':
new_tags.append(tag)
elif tag.split('-')[0] == 'S':
new_tags.append(tag.replace('S-', 'B-'))
elif tag.split('-')[0] == 'E':
new_tags.append(tag.replace('E-', 'I-'))
elif tag.split('-')[0] == 'O':
new_tags.append(tag)
else:
raise Exception('Invalid format!')
return new_tags
def insert_singletons(words, singletons, p=0.5):
"""
Replace singletons by the unknown word with a probability p.
"""
new_words = []
for word in words:
if word in singletons and np.random.uniform() < p:
new_words.append(0)
else:
new_words.append(word)
return new_words
def pad_word_chars(words):
"""
Pad the characters of the words in a sentence.
Input:
- list of lists of ints (list of words, a word being a list of char indexes)
Output:
- padded list of lists of ints
- padded list of lists of ints (where chars are reversed)
- list of ints corresponding to the index of the last character of each word
"""
max_length = max([len(word) for word in words])
char_for = []
char_rev = []
char_pos = []
for word in words:
padding = [0] * (max_length - len(word))
char_for.append(word + padding)
char_rev.append(word[::-1] + padding)
char_pos.append(len(word) - 1)
return char_for, char_rev, char_pos
def create_input(data, parameters, add_label, singletons=None):
"""
Take sentence data and return an input for
the training or the evaluation function.
"""
words = data['words']
chars = data['chars']
if singletons is not None:
words = insert_singletons(words, singletons)
if parameters['cap_dim']:
caps = data['caps']
char_for, char_rev, char_pos = pad_word_chars(chars)
input = []
if parameters['word_dim']:
input.append(words)
if parameters['char_dim']:
input.append(char_for)
if parameters['char_bidirect']:
input.append(char_rev)
input.append(char_pos)
if parameters['cap_dim']:
input.append(caps)
if add_label:
input.append(data['tags'])
return input
def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
id_to_tag, dictionary_tags, eval_id):
"""
Evaluate current model using CoNLL script.
"""
n_tags = len(id_to_tag)
predictions = []
count = np.zeros((n_tags, n_tags), dtype=np.int32)
for raw_sentence, data in zip(raw_sentences, parsed_sentences):
input = create_input(data, parameters, False)
if parameters['crf']:
y_preds = np.array(f_eval(*input))[1:-1]
else:
y_preds = f_eval(*input).argmax(axis=1)
y_reals = np.array(data['tags']).astype(np.int32)
assert len(y_preds) == len(y_reals)
p_tags = [id_to_tag[y_pred] for y_pred in y_preds]
r_tags = [id_to_tag[y_real] for y_real in y_reals]
if parameters['tag_scheme'] == 'iobes':
p_tags = iobes_iob(p_tags)
r_tags = iobes_iob(r_tags)
for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)):
new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]])
predictions.append(new_line)
count[y_real, y_pred] += 1
predictions.append("")
# Write predictions to disk and run CoNLL script externally
#eval_id = np.random.randint(1000000, 2000000)
output_path = os.path.join(eval_temp, "eval.%i.output" % eval_id)
scores_path = os.path.join(eval_temp, "eval.%i.scores" % eval_id)
with codecs.open(output_path, 'w', 'utf8') as f:
f.write("\n".join(predictions))
os.system("%s < %s > %s" % (eval_script, output_path, scores_path))
# CoNLL evaluation results
eval_lines = [l.rstrip() for l in codecs.open(scores_path, 'r', 'utf8')]
#trainLog = open('train.log', 'w')
for line in eval_lines:
print line
#trainLog.write("%s\n" % line)
# Remove temp files
# os.remove(output_path)
# os.remove(scores_path)
# Confusion matrix with accuracy for each tag
print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
"ID", "NE", "Total",
*([id_to_tag[i] for i in xrange(n_tags)] + ["Percent"])
)
for i in xrange(n_tags):
print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
str(i), id_to_tag[i], str(count[i].sum()),
*([count[i][j] for j in xrange(n_tags)] +
["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
)
# Global accuracy
print "%i/%i (%.5f%%)" % (
count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
)
# F1 on all entities
return float(eval_lines[1].strip().split()[-1])
When I run the code as it is, I always get the error below. I think it is either because of the restriction on path length in Windows or because it needs different slashes. I don't know what to add or subtract in order to resolve the problem.
run train.py --train lstm/fold1/train --dev lstm/fold1/dev --test lstm/fold1/test
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29
Using gpu device 0: GeForce GT 620M (CNMeM is enabled with initial size: 85.0% of memory, cuDNN not available)
Traceback (most recent call last):
File "E:\New-Code\tagger-master\tagger-master\train.py", line 135, in
model = Model(parameters=parameters, models_path=models_path)
File "model.py", line 36, in init
os.makedirs(self.model_path)
File "C:\Users\Acer\Anaconda2\envs\env_name27\lib\os.py", line 157, in makedirs
mkdir(name, mode)
WindowsError: [Error 3] The system cannot find the path specified: './models\tag_scheme=iob,lower=False,zeros=False,char_dim=25,char_lstm_dim=25,char_bidirect=True,word_dim=100,word_lstm_dim=100,word_bidirect=True,pre_emb=,all_emb=False,cap_dim=0,crf=True,dropout=0.3,lr_method=sgd-lr_.005'
On Windows, paths use the backslash \ instead of the forward slash / that is used on Linux/Unix.
Try it like below if the file is one folder back:
models_path = "..\models"
eval_path = "..\evaluation"

Zero padding not performed properly I think

I seem to have some issues using the implementation that is provided here.
I am in a similar situation to the original poster, in that I am trying to map an input to an output. The input is the samples of an audio file, and the output is a feature vector of length 14 (the length is static). The sequence length is variable, as the audio files have different lengths, which makes the vectors containing the samples have different lengths as well.
I am not solving a classification problem but a regression one, so it is a slightly different task.
My code looks like this:
import tensorflow as tf
from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn import rnn
import numpy as np
import librosa
import glob
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import os
from os import walk
from os.path import splitext
from os.path import join
import time
rng = np.random
np.set_printoptions(threshold=np.nan)
import functools
start_time = time.time()
print "Preprocessing"
def lazy_property(function):
attribute = '_' + function.__name__
#property
#functools.wraps(function)
def wrapper(self):
if not hasattr(self, attribute):
setattr(self, attribute, function(self))
return getattr(self, attribute)
return wrapper
## Class definition ##
class VariableSequenceLabelling:
def __init__(self, data, target, num_hidden=200, num_layers=3):
self.data = data
self.target = target
self._num_hidden = num_hidden
self._num_layers = num_layers
self.prediction
self.error
self.optimize
#lazy_property
def length(self):
used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
length = tf.reduce_sum(used, reduction_indices=1)
length = tf.cast(length, tf.int32)
return length
#lazy_property
def prediction(self):
# Recurrent network.
output, _ = tf.nn.dynamic_rnn(
rnn_cell.GRUCell(self._num_hidden),
self.data,
dtype=tf.float32,
sequence_length=self.length,
)
# Softmax layer.
max_length = int(self.target.get_shape()[1])
num_classes = int(self.target.get_shape()[2])
weight, bias = self._weight_and_bias(self._num_hidden, num_classes)
# Flatten to apply same weights to all time steps.
output = tf.reshape(output, [-1, self._num_hidden])
prediction = tf.nn.softmax(tf.matmul(output, weight) + bias)
prediction = tf.reshape(prediction, [-1, max_length, num_classes])
return prediction
#lazy_property
def cost(self):
# Compute cross entropy for each frame.
cross_entropy = self.target * tf.log(self.prediction)
cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
cross_entropy *= mask
# Average over actual sequence lengths.
cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
cross_entropy /= tf.cast(self.length, tf.float32)
return tf.reduce_mean(cross_entropy)
#lazy_property
def optimize(self):
learning_rate = 0.0003
optimizer = tf.train.AdamOptimizer(learning_rate)
return optimizer.minimize(self.cost)
#lazy_property
def error(self):
mistakes = tf.not_equal(
tf.argmax(self.target, 2), tf.argmax(self.prediction, 2))
mistakes = tf.cast(mistakes, tf.float32)
mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
mistakes *= mask
# Average over actual sequence lengths.
mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
mistakes /= tf.cast(self.length, tf.float32)
return tf.reduce_mean(mistakes)
#staticmethod
def _weight_and_bias(in_size, out_size):
weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
bias = tf.constant(0.1, shape=[out_size])
return tf.Variable(weight), tf.Variable(bias)
#######################
#Converting file to .wav from .sph file format... God dammit!!!
#with open(train_filelist, 'r') as train_filelist, open(test_filelist, 'r') as test_filelist:
#train_mylist = train_filelist.read().splitlines()
#test_mylist = test_filelist.read().splitlines()
#for line in train_mylist:
#new_line = ' '.join(reversed(line))
#index_start = new_line.find('h')
#index_end = new_line.find('/')
#edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
#new_file = edited_line + 'wav'
#os.system(line + ' >> ' + dnn_train + new_file)
#for line in test_mylist:
#new_line = ' '.join(reversed(line))
#index_start = new_line.find('h')
#index_end = new_line.find('/')
#edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
#new_file = edited_line + 'wav'
#os.system(line + ' >> ' + dnn_test + new_file)
path_train = "/home/JoeS/kaldi-trunk/egs/start/s5/data/train"
path_test = "/home/JoeS/kaldi-trunk/egs/start/s5/data/test"
dnn_train = "/home/JoeS/kaldi-trunk/dnn/train/"
dnn_test = "/home/JoeS/kaldi-trunk/dnn/test/"
dnn = "/home/JoeS/kaldi-trunk/dnn/"
path = "/home/JoeS/kaldi-trunk/egs/start/s5/data/"
MFCC_dir = "/home/JoeS/kaldi-trunk/egs/start/s5/mfcc/raw_mfcc_train.txt"
train_filelist = path_train+"/wav_train.txt"
test_filelist = path_test+"/wav_test.txt"
os.chdir(path)
def find_all(a_str, sub):
start = 0
while True:
start = a_str.find(sub, start)
if start == -1: return
yield start
start += len(sub) # use start += 1 to find overlapping matches
def load_sound_files(file_paths , names_input, data_input):
raw_sounds = []
names_output = []
data_output = []
class_output = []
for fp in file_paths:
X,sr = librosa.load(fp)
raw_sounds.append(X)
index = list(find_all(fp,'-'))
input_index = names_input.index(fp[index[1]+1:index[2]])
names_output.append(names_input[input_index])
data_output.append(data_input[input_index])
class_output.append(binify(data_input[input_index][0]))
return raw_sounds, names_output, data_output, class_output
def generate_list_of_names_data(file_path):
# Proprocess
# extract name and data
name = []
data = []
with open(MFCC_dir) as mfcc_feature_list:
content = [x.strip('\n') for x in mfcc_feature_list.readlines()] # remove endlines
start_index_data = 0
end_index_data = 2
for number in range(0,42):
start = list(find_all(content[start_index_data],'['))[0]
end = list(find_all(content[end_index_data],']'))[0]
end_name = list(find_all(content[start_index_data],' '))[0]
substring_data = content[start_index_data][start+1 :]+content[end_index_data][: end]
substring_name = content[start_index_data][:end_name]
arr = np.array(substring_data.split(), dtype = float)
data.append(arr)
name.append(substring_name)
start_index_data = start_index_data + +3
end_index_data = end_index_data +3
return name, data
files_train_path = [dnn_train+f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_path = [dnn_test+f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]
files_train_name = [f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_name = [f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]
os.chdir(dnn_train)
train_name,train_data = generate_list_of_names_data(files_train_path)
train_data, train_names, train_output_data, train_class_output = load_sound_files(files_train_path,train_name,train_data)
max_length = 0 ## Used for variable sequence input
for element in train_data:
if element.size > max_length:
max_length = element.size
NUM_EXAMPLES = len(train_data)/2
test_data = train_data[NUM_EXAMPLES:]
test_output = train_output_data[NUM_EXAMPLES:]
train_data = train_data[:NUM_EXAMPLES]
train_output = train_output_data[:NUM_EXAMPLES]
print("--- %s seconds ---" % (time.time() - start_time))
##-------------------MAIN----------------------------##
if __name__ == '__main__':
data = tf.placeholder(tf.float32, [None, max_length, 1])
target = tf.placeholder(tf.float32, [None, 14, 1])
model = VariableSequenceLabelling(data, target)
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for epoch in range(10):
for sample_set in range(100):
batch_train = train_data[sample_set]
batch_target = train_output[sample_set]
sess.run(model.optimize, {data: batch_train, target: batch_target})
test_set = test_data[epoch]
test_set_output = test_output[epoch]
error = sess.run(model.error, {data: test_set, target: test_set_output})
print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))
And the error message is
Traceback (most recent call last):
File "tensorflow_datapreprocess_mfcc_extraction_rnn.py", line 239, in <module>
sess.run(model.optimize, {data: batch_train, target: batch_target})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 553, in _run
% (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (63945,) for Tensor u'Placeholder:0', which has shape '(?, 138915, 1)'
The error message I receive is, as I understand it, due to the use of max_length and to feeding an input that does not have the proper size, meaning that the input isn't being zero-padded properly. Or am I wrong? If so, how do I fix it? The solution I seek doesn't seem to come natively from TensorFlow; do other frameworks do this natively, and would it be advisable to use a different one because of the missing functionality?
The shapes of Tensorflow placeholders must match the shapes of data that is fed to them. Here, you're trying to feed a [63945] tensor to a [?, 138915, 1] shaped placeholder. These are incompatible shapes and Tensorflow complains.
You must pad the input tensor using numpy to the desired shape before feeding it to Tensorflow. I suggest using numpy.pad. (Also note the number of dimensions must match --- you can use numpy.reshape to fix that, or change the placeholder shape and use a Tensorflow reshape.)
When working with long sequences, often padding to a common sequence length can cause memory problems. The standard workaround for that is to bucket sequences into buckets of similar lengths. The Tensorflow seq2seq example might be a useful source of inspiration: https://www.tensorflow.org/versions/r0.11/tutorials/seq2seq/index.html
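A minimal sketch of that padding step with numpy.pad (variable names and lengths are taken from the question's error message only, as an assumption):
import numpy as np

max_length = 138915                     # longest training sequence, per the placeholder shape
batch_train = np.random.randn(63945)    # one variable-length audio sample (example data only)

# Zero-pad on the right to the common length, then add batch and feature axes so the
# shape matches the [None, max_length, 1] placeholder.
padded = np.pad(batch_train, (0, max_length - len(batch_train)), mode="constant")
padded = padded.reshape(1, max_length, 1)
print(padded.shape)                     # (1, 138915, 1)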

error in Naive bayes classifier

I'm a beginner in machine learning and I'm trying to implement my first Naive Bayes classifier by myself for better understanding. I have a dataset from http://archive.ics.uci.edu/ml/datasets/Adult (American census data; the classes are '<=50K' and '>50K').
Here is my Python code:
#!/usr/bin/python
import sys
import csv

words_stats = {}   # {'word': {'class1': cnt, 'class2': cnt}}
words_cnt = 0

targets_stats = {} # {'class1': 3234, 'class2': 884} how many words in each class
class_stats = {}   # {'class1': 7896, 'class2': 3034} how many lines in each class
items_cnt = 0


def train(dataset, targets):
    global words_stats, words_cnt, targets_stats, items_cnt, class_stats
    num = len(dataset)
    for item in xrange(num):
        class_stats[targets[item]] = class_stats.get(targets[item], 0) + 1
        for i in xrange(len(dataset[item])):
            word = dataset[item][i]
            if not words_stats.has_key(word):
                words_stats[word] = {}
            tgt = targets[item]
            cnt = words_stats[word].get(tgt, 0)
            words_stats[word][tgt] = cnt + 1
            targets_stats[tgt] = targets_stats.get(tgt, 0) + 1
            words_cnt += 1
    items_cnt = num


def classify(doc, tgt_set):
    global words_stats, words_cnt, targets_stats, items_cnt
    probs = {}  # the probability itself P(c|W) = P(W|c) * P(c) / P(W)
    pc = {}     # probability of the class in the document set P(c)
    pwc = {}    # probability of the word set in a particular class P(W|c)
    pw = 1      # probability of the word set in the document set
    for word in doc:
        if word not in words_stats:
            continue  # dirty, very dirty
        pw = pw * float(sum(words_stats[word].values())) / words_cnt
    for tgt in tgt_set:
        pc[tgt] = class_stats[tgt] / float(items_cnt)
        for word in doc:
            if word not in words_stats:
                continue  # dirty, very dirty
            tgt_wrd_cnt = words_stats[word].get(tgt, 0)
            pwc[tgt] = pwc.get(tgt, 1) * float(tgt_wrd_cnt) / targets_stats[tgt]
        probs[tgt] = (pwc[tgt] * pc[tgt]) / pw
    l = sorted(probs.items(), key=lambda i: i[1], reverse=True)
    print probs
    return l[0][0]


def check_results(dataset, targets):
    num = len(dataset)
    tgt_set = set(targets)
    correct = 0
    incorrect = 0
    for item in xrange(num):
        res = classify(dataset[item], tgt_set)
        if res == targets[item]:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
    print 'correct:', float(correct) / num, ' incorrect:', float(incorrect) / num


def load_data(fil):
    data = []
    tgts = []
    reader = csv.reader(fil)
    for line in reader:
        d = [x.strip() for x in line]
        if '?' in d:
            continue
        if not len(d):
            continue
        data.append(d[:-1])
        tgts.append(d[-1:][0])
    return data, tgts


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print './program train_data.txt test_data.txt'
        sys.exit(1)
    filename = sys.argv[1]
    fil = open(filename, 'r')
    data, tgt = load_data(fil)
    train(data, tgt)
    test_file = open(sys.argv[2], 'r')
    test_data, test_tgt = load_data(test_file)
    check_results(test_data, tgt)
It gives ~61% correct results. When I print the probabilities I get the following:
{'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
but for a correct classifier I would expect the sum of both probabilities to be equal to 1.
At first I thought the problem was float underflow and tried to do all calculations in logarithms, but the results were similar.
I understand that omitting some words will affect accuracy, but the probabilities are very wrong.
What am I doing wrong or not understanding?
For your convenience I've uploaded the dataset and the Python script here:
https://dl.dropboxusercontent.com/u/36180992/adult.tar.gz
Thank you for your help.
Naive Bayes doesn't compute a probability directly; rather, it computes a "raw score" that is compared relative to the other labels' scores in order to classify an instance. This score can easily be converted to a "probability" in the range [0, 1]:
total = sum(probs.itervalues())
for label, score in probs.iteritems():
    probs[label] = score / total
However, keep in mind this still doesn't represent a true probability, as mentioned in this answer:
naive Bayes tends to predict probabilities that are almost always either very close to zero or very close to one.
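Since the question also mentions trying logarithms, here is a small sketch (an addition, not part of the original answer) of the same normalization done on log-scores, which is the usual way to keep long products of small probabilities from underflowing:
import math

# Hypothetical log-space raw scores for the two classes.
log_scores = {'<=50K': -2.6076, '>50K': 2.7295}

# Subtract the maximum before exponentiating (the log-sum-exp trick), then normalize.
m = max(log_scores.values())
exp_scores = dict((label, math.exp(s - m)) for label, s in log_scores.items())
total = sum(exp_scores.values())
probs = dict((label, v / total) for label, v in exp_scores.items())
print(probs)   # roughly {'<=50K': 0.0048, '>50K': 0.9952}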
