I am try to write prediction part of script for the tutorial: https://mxnet.incubator.apache.org/tutorials/nlp/cnn.html
import mxnet as mx
from collections import Counter
import os
import re
import threading
import sys
import itertools
import numpy as np
from collections import namedtuple
SENTENCES_DIR = 'C:/code/mxnet/sentences'
CURRENT_DIR = 'C:/code/mxnet'
def clean_str(string):
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip().lower()
def load_data_sentences(filename):
sentences_file = open( filename, "r")
# Tokenize
x_text = [line.decode('Latin1').strip() for line in sentences_file.readlines()]
x_text = [clean_str(sent).split(" ") for sent in x_text]
return x_text
def pad_sentences(sentences, padding_word=""):"
sequence_length = max(len(x) for x in sentences)
padded_sentences = []
for i in range(len(sentences)):
sentence = sentences[i]
num_padding = sequence_length - len(sentence)
new_sentence = sentence + [padding_word] * num_padding
padded_sentences.append(new_sentence)
return padded_sentences
def build_vocab(sentences):
word_counts = Counter(itertools.chain(*sentences))
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
return vocabulary, vocabulary_inv
def build_input_data(sentences, vocabulary):
x = np.array([
[vocabulary[word] for word in sentence]
for sentence in sentences])
return x
def predict(mod, sen):
mod.forward(Batch(data=[mx.nd.array(sen)]))
prob = mod.get_outputs()[0].asnumpy()
prob = np.squeeze(prob)
a = np.argsort(prob)[::-1]
for i in a[0:5]:
print('probability=%f' %(prob[i]))
sentences = load_data_sentences( os.path.join( SENTENCES_DIR, 'test-pos-1.txt') )
sentences_padded = pad_sentences(sentences)
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x = build_input_data(sentences_padded, vocabulary)
Batch = namedtuple('Batch', ['data'])
sym, arg_params, aux_params = mx.model.load_checkpoint( os.path.join( CURRENT_DIR, 'cnn'), 19)
mod = mx.mod.Module(symbol=sym, context=mx.cpu(), label_names = None)
mod.bind(for_training=False, data_shapes=[('data', (50,56))], label_shapes=mod._label_shapes)
mod.set_params(arg_params, aux_params, allow_missing=True)
predict(mod, x)
But I got the error:
infer_shape error. Arguments: data: (50, 26L)
Traceback (most recent call last):
File "C:\code\mxnet\test2.py", line 152, in predict(mod, x)
File "C:\code\mxnet\test2.py", line 123, in predict
mod.forward(Batch(data=[mx.nd.array(sen)]))
...
MXNetError: Error in operator reshape0: [16:20:21] c:\projects\mxnet-distro-win\mxnet-build\src\operator\tensor./matrix_op-inl.h:187:
Check failed: oshape.Size() == dshape.Size() (840000 vs. 390000)
Target shape size is different to source.
Target: [50,1,56,300]
Source: [50,26,300]
Source is text file with 50 strings of sentences
Unfortunately I didn't found any help in Internet. Please take a look.
OS: Windows 10. Python 2.7
Thank you.
I believe the error you're having is because the padding of your input sentences is different than what the model expects. The way pad_sentences works is to pad the sentences to the length of the longest sentence passed in, so if you're using a different data set, you'll almost certainly get a different padding than your model's padding (which is 56). In this case, it looks like you're getting a padding of 26 (From the error message 'Source: [50, 26, 300]').
I was able to get your code to run successfully by modifying pad_sentence as follows and running it with sequence_length=56 to match the model.
def pad_sentences(sentences, sequence_length, padding_word=""):
padded_sentences = []
for i in range(len(sentences)):
sentence = sentences[i]
num_padding = sequence_length - len(sentence)
new_sentence = sentence + [padding_word] * num_padding
padded_sentences.append(new_sentence)
return padded_sentences
N.B when you do get your successful run, you'll encounter an error because prob[i] is not a float.
def predict(mod, sen):
mod.forward(Batch(data=[mx.nd.array(sen)]))
prob = mod.get_outputs()[0].asnumpy()
prob = np.squeeze(prob)
a = np.argsort(prob)[::-1]
for i in a[0:5]:
print('probability=%f' %(prob[i])) << prob is a numpy.ndarray, not a float.
Vishaal
Related
Good day everyone. I want to ask how can I import a python script configs.py, to another script in colab. It is listed as an import in the other script, which is app.py, how can I achieve it?
Here is the configs.py file
from attrdict import AttrDict # type: ignore
config = {
"encoder_path": "models/encoder_model.bin",
"decoder_path": "models/decoder_model.bin",
"input_word_index": "pickles/input_word_index.pkl",
"target_word_index": "pickles/target_word_index.pkl",
"url": "https://api.mymemory.translated.net/get",
"max_length_src": 47,
"max_length_tar": 47,
}
config = AttrDict(config)
Meanwhile this is the other script app.py
import functools
import math
import pickle
import re
import string
import numpy as np # type: ignore
import requests
from tensorflow.keras.models import load_model # type: ignore
from configs import config
# CONSTANTS
max_length_src = config["max_length_src"]
max_length_tar = config["max_length_tar"]
print("[INFO] Loading Word Indexes ...")
with open(config["input_word_index"], "rb") as file:
input_token_index = pickle.load(file)
with open(config["target_word_index"], "rb") as file:
target_token_index = pickle.load(file)
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())
print("[INFO] Loading Encoder & Decoder ...")
encoder_model = load_model(config["encoder_path"])
decoder_model = load_model(config["decoder_path"])
def clean(input_seq):
input_seq = input_seq.lower()
input_seq = re.sub("'", "", input_seq)
input_seq = "".join(ch for ch in input_seq if ch not in set(string.punctuation))
input_seq = input_seq.strip()
return input_seq
def get_input_seq(input_seq):
input_seq = clean(input_seq)
encoder_input_data = np.zeros((1, max_length_src), dtype="float32")
for t, word in enumerate(input_seq.split()):
encoder_input_data[0, t] = input_token_index[word]
return encoder_input_data
#functools.lru_cache(maxsize=128)
def decode_sequence(input_seq):
input_seq = get_input_seq(input_seq)
states_value = encoder_model.predict(input_seq)
target_seq = np.zeros((1, 1))
# use `START_` as the first character
target_seq[0, 0] = target_token_index["START_"]
stop_condition = False
decoded_sentence = ""
while not stop_condition:
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
# Sampling a token with max probability
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += " " + sampled_char
# Exit condition: either hit max length or find stop character.
if sampled_char == "_END" or len(decoded_sentence) > max_length_tar:
stop_condition = True
# Update the target sequence (of length 1).
target_seq = np.zeros((1, 1))
target_seq[0, 0] = sampled_token_index
# Update states
states_value = [h, c]
return decoded_sentence
def beam_search_decoder(predictions, top_k):
# start with an empty sequence with zero score
output_sequences = [([], 0)]
# looping through all the predictions
for token_probs in predictions:
new_sequences = []
# append new tokens to old sequences and re-score
for old_seq, old_score in output_sequences:
for char_index in range(len(token_probs)):
new_seq = old_seq + [char_index]
# considering log-likelihood for scoring
new_score = old_score + math.log(token_probs[char_index])
new_sequences.append((new_seq, new_score))
# sort all new sequences in the de-creasing order of their score
output_sequences = sorted(new_sequences, key=lambda val: val[1], reverse=True)
# select top-k based on score
# *Note- best sequence is with the highest score
output_sequences = output_sequences[:top_k]
return output_sequences
#functools.lru_cache(maxsize=128)
def decode_sequence_beam_search(input_seq, beam_width=3):
probabilities = []
# Encode the input as state vectors.
input_seq = get_input_seq(input_seq)
states_value = encoder_model.predict(input_seq)
# Generate empty target sequence of length 1.
target_seq = np.zeros((1, 1))
# Populate the first character of target sequence with the start character.
target_seq[0, 0] = target_token_index["START_"]
# Sampling loop for a batch of sequences
# (to simplify, here we assume a batch of size 1).
stop_condition = False
decoded_sentence = ""
while not stop_condition:
output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
# Sampling a token with max probability
sampled_token_index = np.argmax(output_tokens[0, -1, :])
probabilities.append(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += " " + sampled_char
# Exit condition: either hit max length
# or find stop character.
if sampled_char == "_END" or len(decoded_sentence) > max_length_tar:
stop_condition = True
# Update the target sequence (of length 1).
target_seq = np.zeros((1, 1))
target_seq[0, 0] = sampled_token_index
# Update states
states_value = [h, c]
# storing multiple results
outputs = []
beam_search_preds = beam_search_decoder(probabilities, top_k=beam_width)
for prob_indexes, score in beam_search_preds:
decoded_sentence = ""
for index in prob_indexes:
sampled_char = reverse_target_char_index[index]
decoded_sentence += " " + sampled_char
if sampled_char == "_END" or len(decoded_sentence) > max_length_tar:
break
outputs.append(decoded_sentence)
return outputs
Please help to resolve the issue about the importation of python scripts on google colaboratory. Thank you
This is the error I got
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-5-2048e9210135> in <module>()
/content/app.py in <module>()
9 from tensorflow.keras.models import load_model # type: ignore
10
---> 11 from configs import config
12
13 # CONSTANTS
ModuleNotFoundError: No module named 'configs'
import os,re
import math
from math import log10
import nltk.corpus
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
python_file_root = './presidential_debates'
def getidf(token):
document_occurance = 0
for filename in os.listdir(python_file_root):
file = open(os.path.join(python_file_root, filename), "r")
for line in file:
if re.search(r'\b' +token+ r'\b', line):
document_occurance = document_occurance + 1
break
if (document_occurance != 0):
idf = log10(30 / document_occurance)
return idf
return -1
def normalize(filename,token):
file = open(os.path.join(python_file_root, filename), "r")
counts = dict()
square = []
count1 = 0
for line in file:
count1 = count1 + 1
if line in counts:
counts[line] += 1
else:
counts[line] = 1
for key,value in counts.items():
tf = 1 +log10(value)
idf = getidf(key.rstrip())
square.append((tf * idf)*(tf * idf))
summ = sum(square)
sqroot = math.sqrt(summ)
return sqroot
def getweight(filename,token):
hit_count1 = 0
final = 0
file = open(os.path.join(python_file_root, filename), "r")
idft = getidf(token)
for line in file:
if re.search(r'\b' +token+ r'\b', line):
hit_count1 = hit_count1 + 1
if (hit_count1 == 0):
return 0
else:
tf = 1 + log10(hit_count1)
initial = idft * tf
if(initial <= 0):
final = 0
return final
else:
normalize_fact = normalize(filename,token)
final = initial / normalize_fact
return final
for filename in os.listdir(python_file_root):
file = open(os.path.join(python_file_root, filename), "r")
doc = file.read()
doc = doc.lower()
stemmed = []
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
tokens = tokenizer.tokenize(doc)
stoplist = stopwords.words('english')
stop_removed = [word for word in tokens if word not in stoplist]
with open(os.path.join(python_file_root, filename), "w") as f:
for item in stop_removed:
stemmer = PorterStemmer()
stemmed = [stemmer.stem(item)]
for items in stemmed:
f.write("%s\n" % items)
print("\nIDF\n")
print("%.12f" % getidf("health"))
print("%.12f" % getidf("agenda"))
print("%.12f" % getidf("vector"))
print("%.12f" % getidf("reason"))
print("%.12f" % getidf("hispan"))
print("%.12f" % getidf("hispanic"))
print("\n")
print("%.12f" % getweight("2012-10-03.txt","health"))
print("%.12f" % getweight("1960-10-21.txt","reason"))
print("%.12f" % getweight("1976-10-22.txt","agenda"))
print("%.12f" % getweight("2012-10-16.txt","hispan"))
print("%.12f" % getweight("2012-10-16.txt","hispanic"))
I have 30 txt files and i have developed a program to find the idf and normalized tf-idf vectors. Im getting the correct values but the function getweight takes more than 15 minutes to generate the output. Can anyone suggest me a few methods for optimization.
I donot want to use any other non-standard Python package.
Why do you create a new PorterStemmer for every word?
Apart from this obvious thing, try profiling your code. NLTI has the reputation of being really slow - so it may well be not your fault. If you profile, then you'll know.
the program is when user input"8#15#23###23#1#19###9#20"
output should be "HOW WAS IT"
However,it could not work to show space(###).
enter code here
ABSTRACT ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
ABSTRACT_SHIFTED = {value:key for key,value in ABSTRACT.items()}
def from_abstract(s):
result = ''
for word in s.split('*'):
result = result +ABSTRACT_SHIFTED.get(word)
return result
This would do the trick:
#!/usr/bin/env python
InputString = "8#15#23###23#1#19###9#20"
InputString = InputString.replace("###", "##")
InputString = InputString.split("#")
DecodedMessage = ""
for NumericRepresentation in InputString:
if NumericRepresentation == "":
NumericRepresentation = " "
DecodedMessage += NumericRepresentation
continue
else:
DecodedMessage += chr(int(NumericRepresentation) + 64)
print(DecodedMessage)
Prints:
HOW WAS IT
you can also use a regex
import re
replacer ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
reversed = {value:key for key,value in replacer.items()}
# Reversed because regex is greedy and it will match 1 before 15
target = '8#15#23###23#1#19###9#20'
pattern = '|'.join(map(lambda x: x + '+', list(reversed.keys())[::-1]))
repl = lambda x: reversed[x.group(0)]
print(re.sub(pattern, string=target, repl=repl))
And prints:
HOW WAS IT
With a couple minimal changes to your code it works.
1) split on '#', not '*'
2) retrieve ' ' by default if a match isn't found
3) use '##' instead of '###'
def from_abstract(s):
result = ''
for word in s.replace('###','##').split('#'):
result = result +ABSTRACT_SHIFTED.get(word," ")
return result
Swap the key-value pairs of ABSTRACT and use simple split + join on input
ip = "8#15#23###23#1#19###9#20"
ABSTRACT = dict((v,k) for k,v in ABSTRACT.items())
''.join(ABSTRACT.get(i,' ') for i in ip.split('#')).replace(' ', ' ')
#'HOW WAS IT'
The biggest challenge here is that "#" is used as a token separator and as the space character, you have to know the context to tell which you've got at any given time, and that makes it difficult to simply split the string. So write a simple parser. This one will accept anything as the first character in a token and then grab everything until it sees the next "#".
ABSTRACT ={"A":"1","B":"2","C":"3","D":"4","E":"5","F":"6","G":"7","H":"8","I":"9", "J":"10","K":"11","L":"12","M":"13","N":"14","O":"15","P":"16","Q":"17","R":"18","S":"19","T":"20","U":"21","V":"22","W":"23", "X":"24","Y":"25","Z":"26",
" ":"###","":"#" }
ABSTRACT_SHIFTED = {value:key for key,value in ABSTRACT.items()}
user_input = "8#15#23###23#1#19###9#20"
def from_abstract(s):
result = []
while s:
print 'try', s
# tokens are terminated with #
idx = s.find("#")
# ...except at end of line
if idx == -1:
idx = len(s) - 1
token = s[:idx]
s = s[idx+1:]
result.append(ABSTRACT_SHIFTED.get(token, ' '))
return ''.join(result)
print from_abstract(user_input)
I want to create a very basic Q&A chatbot. Given a list of questions & answers that I use as my dataset, I want to train it in order to return relevant answers, depending on a hard-coded question (different every time). First I tokenize, cleaning up, then using cosine similarity, but it gives me an error, which is (I guess) a pickle issue.
UPDATED
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import pickle
import os.path
import re, math
tokenizer = RegexpTokenizer(r'\w+')
stopwords = stopwords.words('english')
extra_stopwords = stopwords + ['I', 'can']
WORD = re.compile(r'\w+')
def get_clean_data():
clean_data_set = {
'questions' : {},
'answers' : {}
}
reader = csv.reader(open('data.csv', 'r', encoding="utf-8"))
tags = []
counter = 0
for r in reader:
question = str(r[0].encode('utf-8'))
answer = str(r[1].encode('utf-8'))
_, tags_question = get_tags(question)
_, tags_answer = get_tags(answer)
clean_data_set['answers'][answer] = tags_answer + tags_question
clean_data_set['questions'][question] = text_to_vector(question)
counter += 1
# hardcode the number :)
print (counter, ' out of 746')
# pickle.dump(clean_data_set, open('dump.dict', 'wb'))
with open('dump.dict', 'wb') as my_dump_file:
pickle.dump(clean_data_set, my_dump_file)
def get_tags(text, use_set = True):
tokens = tokenizer.tokenize(text)
# remove stop words from tokens
# make it lower case
filtered_words = [word.lower() for word in tokens if word not in extra_stopwords]
# return non duplicate values by default
if use_set == True:
filterd_words = list(set(filtered_words))
return Counter(filtered_words), filtered_words
# simple cosine similarity measure
def get_cosine(vec1, vec2):
intersection = set(vec1.keys()) & set(vec2.keys())
numerator = sum([vec1[x] * vec2[x] for x in intersection])
sum1 = sum([vec1[x]**2 for x in vec1.keys()])
sum2 = sum([vec2[x]**2 for x in vec2.keys()])
denominator = math.sqrt(sum1) * math.sqrt(sum2)
if not denominator:
return 0.0
else:
return float(numerator) / denominator
def text_to_vector(text):
words = WORD.findall(text)
return Counter(words)
# question_set is the data we had
def get_cosine_value(question, question_set):
question_vector = text_to_vector(question)
cosine = get_cosine(question_vector, question_set)
return cosine
def answer_question(question, top = 5):
with open('dump.dict', 'rb') as my_dump_file:
data_set = pickle.load(my_dump_file)
# data_set = pickle.load(open('dump.dict', 'rb'))
pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
_, question_tags = get_tags(question)
ranking_dict = {}
similar_questions_rank = {}
for entry in data_set['answers']:
tags = data_set['answers'][entry]
# rank is the intersection between the list of tags from the question
# and the list of tags associated to answers
rank = len(set(question_tags).intersection(tags))
ranking_dict[entry] = rank
for entry in data_set['questions']:
cosine_similarity = get_cosine_value(question, data_set['questions'][entry])
similar_questions_rank[entry] = cosine_similarity
sorted_similarity_dict = sorted(similar_questions_rank.items(), key=lambda x: x[1], reverse=True)
sorted_ranking_dict = sorted(ranking_dict.items(), key=lambda x: x[1], reverse=True)
# sort them by rank
for item in sorted_ranking_dict[0:top-1]:
print ('Rank: ', item[1])
print ('Answer: ', item[0])
print ('\n\n')
# sort them by rank
for item in sorted_similarity_dict[0:top-1]:
print ('Rank: ', item[1])
print ('Question: ', item[0])
#get_clean_data()
question = 'why all these errors?'
answer_question(question)
This is the updated error message:
Traceback (most recent call last):
File "C:\Users\joasa\Desktop\si\main.py", line 133, in <module>
answer_question(question)
File "C:\Users\joasa\Desktop\si\main.py", line 94, in answer_question
data_set = pickle.load(my_dump_file)
EOFError: Ran out of input
[Finished in 1.4s]
Can someone help please? I have no idea what to do. Thanks in advance
I think it comes from this line in your get_clean_data function:
pickle.dump(clean_data_set, open('dump.dict', 'w'))
See here you open the file for writing, but you never close it, so when you try to read it there is nothing to signify that the end of file has been reached. To avoid stuff like this from happening, use a context manager block:
with open('dump.dict', 'wb') as my_dump_file:
pickle.dump(clean_data_set, my_dump_file)
That way, whichever way you exit the with block, you are guaranteed to close your file.
You should also do the same when loading your pickle dump in answer_question:
with open('dump.dict', 'rb') as my_dump_file:
data_set = pickle.load(my_dump_file)
import re
from sys import argv
def read_file(fname):
""" open and extract the text from the file """
txt_file = open(fname, 'r')
txt = txt_file.read()
txt_file.close()
return txt
def clean_space(files):
""" remove spaces from the file """
return files.replace('\n', '')
def filter_file(files):
""" remove punctuation and filter small words from the file """
split_words = map(lambda x: re.sub('[^A-Za-z0-9]+', '', x),
files.split())
filtered_txt = [x for x in split_words if len(x) > 1]
return filtered_txt
def dict_count(files):
""" for loop to return dict with word count and length keys """
lengths = {}
for word in filtered_text: # And this also
length = len(word)
if length not in lengths:
lengths[length] = 0
lengths[length] += 1
for length, counter in lengths.item():
return "Words of length %d: %d" % (length, counter)
def print_result(fname):
fi = dict_count(filter_file(clean_space(read_file(fname))))
print fi
if __name__ == '__main__':
script, fname = argv
print_result(fname)
In the function dict_count you have never created the filtered_text variable, and then you want to use it.
You must create the variable before using it with:
filtered_text = filter_file(files)
complete code:
def dict_count(files):
""" for loop to return dict with word count and length keys """
lengths = {}
filtered_text = filter_file(files)
for word in filtered_text: # And this also
length = len(word)
if length not in lengths:
lengths[length] = 0
lengths[length] += 1
for length, counter in lengths.item():
return "Words of length %d: %d" % (length, counter)