I am working on an information retrieval project where I have to process ~1.5 GB of text data and create a dictionary (word, document frequency) and a posting list (document ID, term frequency). According to the professor, it should take around 10-15 minutes, but my code has now been running for more than 8 hours! I tried a smaller dataset (~35 MB) and it took 5 hours to process.
I am a newbie in Python, and I think it is taking so long because I have created many Python dictionaries and lists in my code. I tried to use a generator, but I am not sure how to work with it.
import re, string, json
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

file = open(filename, 'rt')
text = file.read()
file.close()
p = r'<P ID=\d+>.*?</P>'
tag = RegexpTokenizer(p)
passage = tag.tokenize(text)
doc_re = re.compile(r"<P ID=(\d+)>")
def process_data(docu):
    tokens = RegexpTokenizer(r'\w+')
    lower_tokens = [word.lower() for word in tokens.tokenize(docu)]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in lower_tokens]
    alpha = [word for word in stripped if word.isalpha()]
    stopwordlist = stopwords.words('english')
    stopped = [w for w in alpha if w not in stopwordlist]
    return stopped
data = {}
for doc in passage:
    group_docID = doc_re.match(doc)
    docID = group_docID.group(1)
    tokens = process_data(doc)
    data[docID] = list(set(tokens))
vocab = [item for i in data.values() for item in i]
total_vocab = list(set(vocab))
total_vocab.sort()
print('Document Size = ', len(data))
print('Collection Size = ', len(vocab))
print('Vocabulary Size= ', len(total_vocab))
inv_index = {}
for x in total_vocab:
    for y, z in data.items():
        if x in z:
            wordfreq = z.count(x)
            inv_index.setdefault(x, []).append((int(y), wordfreq))
flattend = [item for tag in inv_index.values() for item in tag]
posting = [item for tag in flattend for item in tag]
doc_freq = []
for k, v in inv_index.items():
    freq1 = len([item for item in v if item])
    doc_freq.append(freq1)
# offset value of each vocabulary word
offset = []
offset1 = 0
for i in range(len(doc_freq)):
    if i > 0:
        offset1 = offset1 + (doc_freq[i-1] * 2)
    offset.append(offset1)
# create dictionary of word, document frequency and offset
dictionary = {}
for i in range(len(total_vocab)):
    dictionary[total_vocab[i]] = (doc_freq[i], offset[i])
# dictionary of word, inverse document frequency
idf = {}
for i in range(len(dictionary)):
    a = np.log2(len(data) / doc_freq[i])
    idf[total_vocab[i]] = a
with open('dictionary.json', 'w') as f:
    json.dump(dictionary, f)
with open('idf.json', 'w') as f:
    json.dump(idf, f)
binary_file = open('binary_file.txt', 'wb')
for i in range(0, len(posting)):
    binary_int = posting[i].to_bytes(4, byteorder='big')
    binary_file.write(binary_int)
binary_file.close()
Could someone please help me rewrite this code so that it becomes more computationally efficient?
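The dominant cost is almost certainly the nested loop over `total_vocab` × `data`, which rescans every document once per vocabulary word. A minimal sketch of a single-pass alternative, assuming the `passage`, `doc_re` and `process_data` definitions above (the `defaultdict`/`Counter` names here are illustrative, not from the original code):

from collections import Counter, defaultdict

inv_index = defaultdict(list)   # word -> [(doc_id, term_freq), ...]
doc_freq = defaultdict(int)     # word -> number of documents containing it

for doc in passage:
    doc_id = int(doc_re.match(doc).group(1))
    counts = Counter(process_data(doc))     # term frequencies for this document in one pass
    for word, tf in counts.items():
        inv_index[word].append((doc_id, tf))
        doc_freq[word] += 1

Two other costs worth removing: `stopwords.words('english')` is a list rebuilt on every call to `process_data` (build it once as a `set` outside the function so membership tests are O(1)), and `data[docID] = list(set(tokens))` throws away the term frequencies that `z.count(x)` later tries to recover.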
I am trying to apply TF-IDF to my data (using the code by Dr. W.J.B. Mattingly: https://github.com/wjbmattingly/topic_modeling_textbook/blob/main/lessons/02_tf_idf_official.py) - descriptions of startups from the Startup Blink website.
I cannot work out how to better handle the extraction of the words: right now the output is a string with all the words run together, like this (you will also notice lots of empty lists inside as well):
[['qualitygeotechnicalinvestigationtestinggeotechnicalreportspreconditiondevelopmentideasnewprojectimplementationintensivefieldlaboratorytestingsnecessaryobtaininputdatasoillayerscapacitysettlementcategorizationqualitymaterials']
import string, numpy, requests
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

s = requests.Session()
df = pd.DataFrame()
for p in tqdm(range(2000)):
    r = s.get(f'https://www.startupblink.com/api/entities?entity=startups&page={p}')
    d = pd.json_normalize(r.json()['page'])
    df = pd.concat([df, d], axis=0, ignore_index=True)
df.to_csv('World_startups.csv')
# selecting only ESG-related startups
esg = df[df['subindustry_name'].isin(['Energy', 'Energy & Environment-Other', 'Smart Cities', 'Smart Home',
                                      'Public Transportation', 'Sustainability', 'Transportation-Other',
                                      'Waste Management'])]
esg = esg[['title', 'description', 'subindustry_name']]
description = esg.description.tolist()
# description = description.remove(np.nan)
def remove_stopwords(text, stops):
    words = text.split()
    final = []
    for word in words:
        if word not in stops:
            final.append(word)
    final = "".join(final)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return final
def clean_docs(docs):
    stops = stopwords.words('english')
    final = []
    for doc in docs:
        clean_doc = remove_stopwords(doc, stops)
        final.append(clean_doc)
    return final

cleaned_docs = clean_docs(description)
vectorizer = TfidfVectorizer(lowercase=True,
                             max_features=100,
                             # max_df=.9,  # percentage
                             # min_df=2,   # number of documents
                             ngram_range=(1, 3),  # up to trigrams
                             stop_words='english')
vectors = vectorizer.fit_transform(cleaned_docs)
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
# Printing all unique dense values as a mid-way check
densearray = numpy.array(denselist)
print(numpy.unique(densearray))
all_keywords = []
for d in denselist:
    x = 0
    keywords = []
    for word in d:
        if word > 0:
            keywords.append(feature_names[x])
        x = x + 1
    all_keywords.append(keywords)
all_keywords[7]
print(len(all_keywords))
# the list contains lots of empty lists inside - will remove them
all_keywords = [ele for ele in all_keywords if ele != []]
print('')
print(len(all_keywords))
print(all_keywords[7])
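The words run together because `remove_stopwords` joins the kept tokens with an empty string. A minimal sketch of the same helper with only the join changed (an assumption about the intended behaviour, not part of the original code):

def remove_stopwords(text, stops):
    words = text.split()
    kept = [word for word in words if word not in stops]
    final = " ".join(kept)  # join with a space so the tokens stay separated
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return final.strip()

With the tokens kept apart, the TfidfVectorizer sees individual words and the empty keyword lists should become much rarer.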
I have used this code for category detection:
import numpy as np

# Words -> category
categories = {word: key for key, words in data.items() for word in words}

# Load the whole embedding matrix
embeddings_index = {}
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        embed = np.array(values[1:], dtype=np.float32)
        embeddings_index[word] = embed
print('Loaded %s word vectors.' % len(embeddings_index))
# Embeddings for available words
data_embeddings = {key: value for key, value in embeddings_index.items() if key in categories.keys()}

# Processing the query
def process(query):
    query_embed = embeddings_index[query]
    scores = {}
    for word, embed in data_embeddings.items():
        category = categories[word]
        dist = query_embed.dot(embed)
        dist /= len(data[category])
        scores[category] = scores.get(category, 0) + dist
    return scores
# Testing
print(process('pizza'))
OUTPUT
{'service': 6.385544379552205, 'ambiance': 3.5752111077308655, 'Food': 12.912149047851562}
Is there a way to get only the highest-scoring category, like Food?
def process(query):
    query_embed = embeddings_index[query]
    scores = {}
    for word, embed in data_embeddings.items():
        category = categories[word]
        dist = query_embed.dot(embed)
        dist /= len(data[category])
        scores[category] = scores.get(category, 0) + dist
    return max(scores, key=scores.get)
You can use max() for this. With key=scores.get it returns the key whose value is the maximum.
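Applied to the scores dictionary from the output above (values rounded), the same pattern looks like this:

scores = {'service': 6.39, 'ambiance': 3.58, 'Food': 12.91}
print(max(scores, key=scores.get))  # -> Food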
I am trying to solve the "Consensus and Profile" challenge on Rosalind.
The challenge instructions are as follows:
Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)
My code is as follows (I got most of it from another user on this website). My only issue is that some of the DNA strands are broken across multiple lines, so they are being appended to the "allstrings" list as separate strings. I am trying to figure out how to join each run of consecutive lines that do not contain ">" into a single string.
import numpy as np
seq = []
allstrings = []
temp_seq = []
matrix = []
C = []
G = []
T = []
A = []
P = []
consensus = []
position = 1
file = open("C:/Users/knigh/Documents/rosalind_cons (3).txt", "r")
conout = open("C:/Users/knigh/Documents/consensus.txt", "w")
# Right now, this is reading and writing each as an individual line. Thus, it
# is splitting each sequence into multiple small sequences. You need to figure
# out how to read this in FASTA format to prevent this from occurring
desc = file.readlines()
for line in desc:
    allstrings.append(line)
for string in range(1, len(allstrings)):
    if ">" not in allstrings[string]:
        temp_seq.append(allstrings[string])
    else:
        seq.insert(position, temp_seq[0])
        temp_seq = []
        position += 1
# This last insertion into the sequence must be performed after the loop to empty
# out the last remaining string from temp_seq
seq.insert(position, temp_seq[0])
for base in seq:
    matrix.append([pos for pos in base])
M = np.array(matrix).reshape(len(seq), len(seq[0]))
for base in range(len(seq[0])):
    A_count = 0
    C_count = 0
    G_count = 0
    T_count = 0
    for pos in M[:, base]:
        if pos == "A":
            A_count += 1
        elif pos == "C":
            C_count += 1
        elif pos == "G":
            G_count += 1
        elif pos == "T":
            T_count += 1
    A.append(A_count)
    C.append(C_count)
    G.append(G_count)
    T.append(T_count)
profile_matrix = {"A": A, "C": C, "G": G, "T": T}
P.append(A)
P.append(C)
P.append(G)
P.append(T)
profile = np.array(P).reshape(4, len(A))
for pos in range(len(A)):
    if max(profile[:, pos]) == profile[0, pos]:
        consensus.append("A")
    elif max(profile[:, pos]) == profile[1, pos]:
        consensus.append("C")
    elif max(profile[:, pos]) == profile[2, pos]:
        consensus.append("G")
    elif max(profile[:, pos]) == profile[3, pos]:
        consensus.append("T")
conout.write("".join(consensus) + "\n")
for k, v in profile_matrix.items():
    conout.write(k + ": " + " ".join(str(x) for x in v) + "\n")
conout.close()
There are a couple of ways you can iterate over a FASTA file as records: you can use a prebuilt library or write your own parser.
A widely used library for working with sequence data is biopython. This snippet builds a list of sequence strings:
from Bio import SeqIO

file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
    for record in SeqIO.parse(file_handle, "fasta"):
        sequences.append(str(record.seq))  # str() turns the Seq object into a plain string
Alternatively, you can write your own FASTA parser. Something like this should work:
def read_fasta(fh):
    # Iterate to get the first FASTA header
    for line in fh:
        if line.startswith(">"):
            name = line[1:].strip()
            break
    # This list will hold the sequence lines
    fa_lines = []
    # Now iterate to collect the (possibly multiline) sequence of each record
    for line in fh:
        if line.startswith(">"):
            # Reaching this block means we have hit the next FASTA record:
            # yield the previous record's name and sequence as a tuple
            # that we can unpack
            yield name, "".join(fa_lines)
            # Reset the sequence lines and save the
            # name of the next record
            fa_lines = []
            name = line[1:].strip()
            # skip to the next line
            continue
        fa_lines.append(line.strip())
    yield name, "".join(fa_lines)
You can use this function like so:
file = "path/to/your/file.fa"
sequences = []
with open(file, "r") as file_handle:
for name, seq in read_fasta(file_handle):
sequences.append(seq)
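Either way, the resulting `sequences` list can stand in for the question's manually built `seq` list, so the downstream profile code works unchanged; for example (a sketch, assuming the parsing above):

import numpy as np

seq = sequences  # one complete string per FASTA record
matrix = [[pos for pos in base] for base in seq]
M = np.array(matrix).reshape(len(seq), len(seq[0]))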
I want to create a very basic Q&A chatbot. Given a list of questions and answers that I use as my dataset, I want to train it to return relevant answers, depending on a hard-coded question (different every time). First I tokenize and clean up the text, then use cosine similarity, but it gives me an error, which is (I guess) a pickle issue.
UPDATED
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import pickle
import os.path
import re, math
tokenizer = RegexpTokenizer(r'\w+')
stopwords = stopwords.words('english')
extra_stopwords = stopwords + ['I', 'can']
WORD = re.compile(r'\w+')
def get_clean_data():
    clean_data_set = {
        'questions': {},
        'answers': {}
    }
    reader = csv.reader(open('data.csv', 'r', encoding="utf-8"))
    tags = []
    counter = 0
    for r in reader:
        question = str(r[0].encode('utf-8'))
        answer = str(r[1].encode('utf-8'))
        _, tags_question = get_tags(question)
        _, tags_answer = get_tags(answer)
        clean_data_set['answers'][answer] = tags_answer + tags_question
        clean_data_set['questions'][question] = text_to_vector(question)
        counter += 1
        # hardcode the number :)
        print(counter, ' out of 746')
    # pickle.dump(clean_data_set, open('dump.dict', 'wb'))
    with open('dump.dict', 'wb') as my_dump_file:
        pickle.dump(clean_data_set, my_dump_file)
def get_tags(text, use_set=True):
    tokens = tokenizer.tokenize(text)
    # remove stop words from tokens
    # make it lower case
    filtered_words = [word.lower() for word in tokens if word not in extra_stopwords]
    # return non-duplicate values by default
    if use_set == True:
        filtered_words = list(set(filtered_words))
    return Counter(filtered_words), filtered_words
# simple cosine similarity measure
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)
# question_set is the data we had
def get_cosine_value(question, question_set):
    question_vector = text_to_vector(question)
    cosine = get_cosine(question_vector, question_set)
    return cosine
def answer_question(question, top=5):
    with open('dump.dict', 'rb') as my_dump_file:
        data_set = pickle.load(my_dump_file)
    # data_set = pickle.load(open('dump.dict', 'rb'))
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
    _, question_tags = get_tags(question)
    ranking_dict = {}
    similar_questions_rank = {}
    for entry in data_set['answers']:
        tags = data_set['answers'][entry]
        # rank is the intersection between the list of tags from the question
        # and the list of tags associated with the answers
        rank = len(set(question_tags).intersection(tags))
        ranking_dict[entry] = rank
    for entry in data_set['questions']:
        cosine_similarity = get_cosine_value(question, data_set['questions'][entry])
        similar_questions_rank[entry] = cosine_similarity
    sorted_similarity_dict = sorted(similar_questions_rank.items(), key=lambda x: x[1], reverse=True)
    sorted_ranking_dict = sorted(ranking_dict.items(), key=lambda x: x[1], reverse=True)
    # answers sorted by tag-overlap rank
    for item in sorted_ranking_dict[0:top-1]:
        print('Rank: ', item[1])
        print('Answer: ', item[0])
        print('\n\n')
    # similar questions sorted by cosine similarity
    for item in sorted_similarity_dict[0:top-1]:
        print('Rank: ', item[1])
        print('Question: ', item[0])
#get_clean_data()
question = 'why all these errors?'
answer_question(question)
This is the updated error message:
Traceback (most recent call last):
  File "C:\Users\joasa\Desktop\si\main.py", line 133, in <module>
    answer_question(question)
  File "C:\Users\joasa\Desktop\si\main.py", line 94, in answer_question
    data_set = pickle.load(my_dump_file)
EOFError: Ran out of input
[Finished in 1.4s]
Can someone help please? I have no idea what to do. Thanks in advance
I think it comes from this line in your get_clean_data function:
pickle.dump(clean_data_set, open('dump.dict', 'w'))
Here you open the file for writing but never close it, so the pickled data may never be flushed to disk; when you later try to read the (empty or truncated) file, pickle.load runs out of input. To avoid this kind of problem, use a context manager block:
with open('dump.dict', 'wb') as my_dump_file:
    pickle.dump(clean_data_set, my_dump_file)
That way, whichever way you exit the with block, you are guaranteed to close your file.
You should also do the same when loading your pickle dump in answer_question:
with open('dump.dict', 'rb') as my_dump_file:
    data_set = pickle.load(my_dump_file)
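One more thing worth checking, since the updated code shows #get_clean_data() commented out: if dump.dict was only ever written by the old unclosed-file version, it may still be empty. Rebuilding it once before querying (a sketch reusing the functions already defined above) should rule that out:

get_clean_data()  # build and pickle the data set with the context-manager version
question = 'why all these errors?'
answer_question(question)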
I am looking to alter my MapReduce files to output the top bigrams in a chunk of text instead of the word count, i.e. the word pair together with its bigram count.
This is my current code and approach.
Map:
import sys

for line in sys.stdin:
    line = line.strip()
    words = line.split()  # bigrams = line.split()
    for word in words:  # for bigram in words
        print '%s\t%s' % (word, 1)  # print ... word pair???
Reduce:
mydict = dict()
for line in sys.stdin:
(word,cnt) = line.strip().split('\t') #bigram and bigram count
mydict[word] = mydict.get(word,0) 1
for word,cnt in mydict.items():
print word,cnt #print bigram and bigram count
Thank you.
I have seen nltk mentioned as a popular solution for computing bigrams; should I take that approach even within my MapReduce format?
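For reference, a minimal sketch of the direct adaptation the in-code comments hint at, staying with the streaming stdin/stdout style of the code above (untested, and the two-file split is assumed):

# mapper: emit "word1 word2" as the key with a count of 1
import sys

prev = None
for line in sys.stdin:
    for word in line.strip().split():
        if prev is not None:
            print '%s %s\t%s' % (prev, word, 1)
        prev = word

# reducer: same aggregation as before, keyed on the word pair
import sys

mydict = dict()
for line in sys.stdin:
    bigram, cnt = line.strip().split('\t')
    mydict[bigram] = mydict.get(bigram, 0) + int(cnt)
for bigram, cnt in mydict.items():
    print bigram, cnt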
I wouldn't do it with stdin and stdout. I'd rather throw multiprocessing at this and read from some saved file:
import multiprocessing as mp

def main(infilepath):
    # map phase: mappers consume bigrams from bgqIn and emit partial counts on bgqOut
    bgqIn, bgqOut = [mp.Queue() for _ in xrange(2)]
    procs = [mp.Process(target=mapper, args=(bgqIn, bgqOut)) for _ in xrange(mp.cpu_count())]
    for p in procs:
        p.start()
    with open(infilepath) as infile:
        first = ''
        second = ''
        for line in infile:
            line = line.lower()
            for word in line.split():
                first = second
                second = word
                if first:  # skip the initial ('', first_word) pair
                    bigram = (first, second)
                    bgqIn.put(bigram)
    for p in procs:
        bgqIn.put(None)  # one sentinel per mapper

    # reduce phase: one (input, output) queue pair per reducer
    rqs = [[mp.Queue() for _ in xrange(2)] for i in xrange(mp.cpu_count())]
    rprocs = [mp.Process(target=reducer, args=tuple(rqs[i])) for i in xrange(mp.cpu_count())]
    for p in rprocs:
        p.start()

    # route each bigram to a reducer by the first letter of its first word
    qmap = {}
    for char in xrange(97, 123):
        qmap[chr(char)] = rqs[(char - 97) * len(rqs) // 26][0]

    dones = 0
    while dones != len(procs):
        t = bgqOut.get()
        if t is None:
            dones += 1
        else:
            qmap.get(t[0][0], rqs[0][0]).put(t)  # non a-z first characters go to the first reducer

    for q in rqs:
        q[0].put(None)

    # merge the counts coming back from every reducer
    answer = {}
    for q in rqs:
        for bg, count in iter(q[1].get, None):
            if bg not in answer:
                answer[bg] = 0
            answer[bg] += count

    for bg, count in answer.iteritems():
        print "There are", count, "occurrences of", bg
def mapper(qIn, qOut):
    counts = {}
    for bg in iter(qIn.get, None):
        if bg not in counts:
            counts[bg] = 0
        counts[bg] += 1
    for k, v in counts.iteritems():
        qOut.put((k, v))
    qOut.put(None)
def reducer(qIn, qOut):
    counts = {}
    for bg, count in iter(qIn.get, None):
        if bg not in counts:
            counts[bg] = 0
        counts[bg] += count
    for bg, count in counts.iteritems():
        qOut.put((bg, count))
    qOut.put(None)
I haven't tested this, but it's a basic skeleton that should get you started.
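On platforms that spawn rather than fork worker processes (e.g. Windows), the call also needs to sit under a main guard; a usage sketch with a hypothetical input path:

if __name__ == '__main__':
    main('some_text_file.txt')  # hypothetical file name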