MapReduce & Python: Bigrams

I am looking to alter my MapReduce files to output the top bigrams in a chunk of text instead of the word count, i.e. both the word pair and the bigram count.
This is my current code and approach.
Map:
import sys

for line in sys.stdin:
    line = line.strip()
    words = line.split()              # bigrams = line.split()
    for word in words:                # for bigram in words
        print '%s\t%s' % (word, 1)    # print ... word pair???
Reduce:
mydict = dict()
for line in sys.stdin:
    (word, cnt) = line.strip().split('\t')   # bigram and bigram count
    mydict[word] = mydict.get(word, 0) + 1
for word, cnt in mydict.items():
    print word, cnt                          # print bigram and bigram count
Thank you.
I saw NLTK mentioned as a popular solution for computing bigrams; should I take that approach even within my MapReduce format?

I wouldn't do it with stdin and stdout. I'd rather throw multiprocessing at this and read from some saved file:
import multiprocessing as mp

def main(infilepath):
    bgqIn, bgqOut = [mp.Queue() for _ in xrange(2)]
    procs = [mp.Process(target=mapper, args=(bgqIn, bgqOut)) for _ in xrange(mp.cpu_count())]
    for p in procs:
        p.start()

    with open(infilepath) as infile:
        first = ''
        second = ''
        for line in infile:
            line = line.lower()
            for word in line.split():
                first = second
                second = word
                if first:  # skip the artificial ('', first_word) pair at the start
                    bigram = (first, second)
                    bgqIn.put(bigram)
    for p in procs:
        bgqIn.put(None)

    # one (input, output) queue pair per reducer process
    rqs = [[mp.Queue() for _ in xrange(2)] for i in xrange(mp.cpu_count())]
    rprocs = [mp.Process(target=reducer, args=tuple(rqs[i])) for i in xrange(mp.cpu_count())]
    for p in rprocs:
        p.start()

    # partition bigrams across reducers by the first letter of the first word
    qmap = {}
    for char in xrange(97, 123):
        qmap[chr(char)] = rqs[(char - 97) * len(rqs) // 26][0]

    dones = 0
    while dones != len(procs):
        t = bgqOut.get()
        if t is None:
            dones += 1
        else:
            # t is ((first, second), count); route by the first character,
            # falling back to the first reducer for non-alphabetic words
            qmap.get(t[0][0][0], rqs[0][0]).put(t)

    for qIn, qOut in rqs:
        qIn.put(None)

    answer = {}
    for qIn, qOut in rqs:
        for bg, count in iter(qOut.get, None):
            if bg not in answer:
                answer[bg] = 0
            answer[bg] += count

    for bg, count in answer.iteritems():
        print "There are", count, "occurrences of", bg

def mapper(qIn, qOut):
    counts = {}
    for bg in iter(qIn.get, None):
        if bg not in counts:
            counts[bg] = 0
        counts[bg] += 1
    for k, v in counts.iteritems():
        qOut.put((k, v))
    qOut.put(None)

def reducer(qIn, qOut):
    counts = {}
    for bg, count in iter(qIn.get, None):
        if bg not in counts:
            counts[bg] = 0
        counts[bg] += count
    for bg, count in counts.iteritems():
        qOut.put((bg, count))
    qOut.put(None)
I haven't tested this, but it's a basic skeleton that should get you started.
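If you do want to keep the Hadoop Streaming stdin/stdout layout from the question, here is a rough, untested sketch of what the bigram versions of the mapper and reducer could look like. The key is simply the space-joined word pair, and the reducer is essentially the original word-count reducer plus a sort at the end:

# mapper.py -- emit "word1 word2<TAB>1" for every adjacent pair in a line (sketch)
import sys

for line in sys.stdin:
    words = line.strip().split()
    for first, second in zip(words, words[1:]):
        print '%s %s\t%s' % (first, second, 1)

# reducer.py -- sum the counts per bigram and print them, most frequent first (sketch)
import sys

counts = {}
for line in sys.stdin:
    bigram, cnt = line.rstrip('\n').split('\t')
    counts[bigram] = counts.get(bigram, 0) + int(cnt)

for bigram, cnt in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
    print '%s\t%s' % (bigram, cnt)

Note that under Hadoop Streaming the reducer already receives its input grouped and sorted by key, so the in-memory dict is only needed if you run the scripts standalone through a plain shell pipe.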

Related

How do I count word occurrence in each line if the word is in a dictionary

I am trying to count the number of positive, negative, and neutral words in each line. I have a text file containing lines of reviews called reviews.txt.
My Code:
poswords = {} #contains positive words
negwords = {} #contains negative words
with open(path + "reviews.txt", 'r') as f:
    possum = 0
    negsum = 0
    neutsum = 0
    for line in f.readlines():
        lower = line.lower()
        for word in lower.split():
            if word in poswords:
                possum += 1
            elif word in negwords:
                negsum += 1
            else:
                neutsum += 1
    print(possum)
    print(negsum)
    print(neutsum)
Output:
1401
633
18351
Instead of counting positive, negative, and neutral words for the whole text file, how do I show the count for each line?
Put the last three print statements inside the for loop, like so:
poswords = {} #contains positive words
negwords = {} #contains negative words
with open(path + "reviews.txt", 'r') as f:
    for line in f.readlines():
        possum = 0
        negsum = 0
        neutsum = 0
        lower = line.lower()
        for word in lower.split():
            if word in poswords:
                possum += 1
            elif word in negwords:
                negsum += 1
            else:
                neutsum += 1
        print("Line: ", line)
        print(possum)
        print(negsum)
        print(neutsum)
Set your count variables back to zero for each line and then print the variables after going through the line.
poswords = {} #contains positive words
negwords = {} #contains negative words
with open(path + "reviews.txt", 'r') as f:
    for line in f.readlines():
        possum = 0
        negsum = 0
        neutsum = 0
        lower = line.lower()
        for word in lower.split():
            if word in poswords:
                possum += 1
            elif word in negwords:
                negsum += 1
            else:
                neutsum += 1
        print("\n", line)
        print(possum)
        print(negsum)
        print(neutsum)
This can be done with re as well:
import re

poswords = {...}
negwords = {...}
pos = '|'.join(poswords)
neg = '|'.join(negwords)
with open("reviews.txt", 'r') as f:
    matches = re.findall(rf'({pos})|({neg})|(\w+)', f.read())
    positive, negative, neutral = (sum(map(bool, g)) for g in zip(*matches))
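To get the counts per line with the same regex idea, you could compile the pattern once and apply findall to each line instead of the whole file. A rough sketch, with placeholder word sets standing in for your real poswords/negwords:

import re

poswords = {'good', 'great'}   # placeholder for the real positive word list
negwords = {'bad', 'awful'}    # placeholder for the real negative word list
pattern = re.compile(r'({})|({})|(\w+)'.format('|'.join(poswords), '|'.join(negwords)))

with open("reviews.txt") as f:
    for line in f:
        matches = pattern.findall(line.lower())
        if matches:
            positive, negative, neutral = (sum(map(bool, g)) for g in zip(*matches))
        else:
            positive = negative = neutral = 0
        print(line.rstrip(), positive, negative, neutral)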

Python code taking more than 15 minutes to generate output

import os, re
import math
from math import log10
import nltk.corpus
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import defaultdict

python_file_root = './presidential_debates'

def getidf(token):
    document_occurance = 0
    for filename in os.listdir(python_file_root):
        file = open(os.path.join(python_file_root, filename), "r")
        for line in file:
            if re.search(r'\b' + token + r'\b', line):
                document_occurance = document_occurance + 1
                break
    if (document_occurance != 0):
        idf = log10(30 / document_occurance)
        return idf
    return -1

def normalize(filename, token):
    file = open(os.path.join(python_file_root, filename), "r")
    counts = dict()
    square = []
    count1 = 0
    for line in file:
        count1 = count1 + 1
        if line in counts:
            counts[line] += 1
        else:
            counts[line] = 1
    for key, value in counts.items():
        tf = 1 + log10(value)
        idf = getidf(key.rstrip())
        square.append((tf * idf) * (tf * idf))
    summ = sum(square)
    sqroot = math.sqrt(summ)
    return sqroot

def getweight(filename, token):
    hit_count1 = 0
    final = 0
    file = open(os.path.join(python_file_root, filename), "r")
    idft = getidf(token)
    for line in file:
        if re.search(r'\b' + token + r'\b', line):
            hit_count1 = hit_count1 + 1
    if (hit_count1 == 0):
        return 0
    else:
        tf = 1 + log10(hit_count1)
        initial = idft * tf
        if (initial <= 0):
            final = 0
            return final
        else:
            normalize_fact = normalize(filename, token)
            final = initial / normalize_fact
            return final

for filename in os.listdir(python_file_root):
    file = open(os.path.join(python_file_root, filename), "r")
    doc = file.read()
    doc = doc.lower()
    stemmed = []
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    tokens = tokenizer.tokenize(doc)
    stoplist = stopwords.words('english')
    stop_removed = [word for word in tokens if word not in stoplist]
    with open(os.path.join(python_file_root, filename), "w") as f:
        for item in stop_removed:
            stemmer = PorterStemmer()
            stemmed = [stemmer.stem(item)]
            for items in stemmed:
                f.write("%s\n" % items)

print("\nIDF\n")
print("%.12f" % getidf("health"))
print("%.12f" % getidf("agenda"))
print("%.12f" % getidf("vector"))
print("%.12f" % getidf("reason"))
print("%.12f" % getidf("hispan"))
print("%.12f" % getidf("hispanic"))
print("\n")
print("%.12f" % getweight("2012-10-03.txt","health"))
print("%.12f" % getweight("1960-10-21.txt","reason"))
print("%.12f" % getweight("1976-10-22.txt","agenda"))
print("%.12f" % getweight("2012-10-16.txt","hispan"))
print("%.12f" % getweight("2012-10-16.txt","hispanic"))
I have 30 txt files and I have developed a program to find the idf and normalized tf-idf vectors. I'm getting the correct values, but the function getweight takes more than 15 minutes to generate the output. Can anyone suggest a few methods for optimization?
I do not want to use any other non-standard Python package.
Why do you create a new PorterStemmer for every word?
Apart from this obvious thing, try profiling your code. NLTK has a reputation of being really slow - so it may well not be your fault. If you profile, then you'll know.
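One concrete way to attack the biggest cost is to stop re-reading all 30 files inside getidf for every token. A rough, untested sketch of precomputing the document frequencies in a single pass (the names mirror the question's code, but the restructuring is my own):

import os
from math import log10
from collections import defaultdict

python_file_root = './presidential_debates'

# one pass over the corpus: token -> number of files containing it
doc_freq = defaultdict(int)
num_docs = 0
for filename in os.listdir(python_file_root):
    num_docs += 1
    with open(os.path.join(python_file_root, filename)) as f:
        for token in set(f.read().split()):   # one token per line after preprocessing
            doc_freq[token] += 1

def getidf(token):
    # a dictionary lookup instead of re-scanning every file; -1 for unseen tokens
    return log10(float(num_docs) / doc_freq[token]) if doc_freq[token] else -1

The same idea applies to the PorterStemmer: create it once before the loop rather than once per word.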

Invalid Output for Coursera Python Assignment

I'm doing the Coursera Python for Everybody stream and I'm stuck on Assignment 10.2. I'm getting invalid output for it. Here is what the assignment asks:
Write a program to read through the mbox-short.txt and figure out the
distribution by hour of the day for each of the messages. You can pull
the hour out from the 'From ' line by finding the time and then
splitting the string a second time using a colon.
From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008
Once you have accumulated the counts for each hour, print out the
counts, sorted by hour as shown below.
Here is my code:
name = raw_input("Enter file:")
if len(name) < 1 : name = "mbox-short.txt"
handle = open(name)
counts = dict()
lst = list()
for line in handle:
    line = line.rstrip()
    if not line.startswith('From '):
        continue
    words = line.split()
    words = words[5]
    words = words.split(":")
    for word in counts:
        counts[word] = counts.get(word, 0) + 1
lst = list()
for key, val in counts.items():
    lst.append((key, val))
lst.sort()
print lst
Let me know what I'm doing wrong. Any advice or hint is appreciated.
I think you are iterating through the wrong thing in the inner loop: it should be for word in words, not for word in counts.
name = raw_input("Enter file:")
if len(name) < 1 : name = "mbox-short.txt"
handle = open(name)
hours = dict()
for line in handle:
    if line.startswith("From "):
        hour = line.split()[5].split(':')[0]
        hours[hour] = hours.get(hour, 0) + 1
for key, value in sorted(hours.items(), None):
    print key, value
name = input("Enter file:")
if len(name) < 1 : name = "mbox-short.txt"
handle = open(name)
counts = {}
for line in handle:
    if not line.startswith("From "): continue
    time = line.split()
    time = time[5]
    hour = time.split(':')
    hour = hour[0]
    counts[hour] = counts.get(hour, 0) + 1
for k, v in sorted(counts.items()):
    print(k, v)
counts = dict()
for line in handle:
    if line.startswith('From '):
        words = line.split()
        hour = words[5].split(':')
        counts[hour[0]] = counts.get(hour[0], 0) + 1
lst = list()
for key, val in counts.items():
    lst.append((key, val))
lst = sorted(lst)
for a, b in lst:
    print(a, b)
name = input("Enter file:")
if len(name) < 1:
    name = "mbox-short.txt"
handle = open(name)
counts = dict()
for line in handle:
    if line.startswith("From "):
        time = line.split()[5].split(":")
        counts[time[0]] = counts.get(time[0], 0) + 1
#print sorted( [ (v,k) for k,v in counts.items()] )
lst = list()
for key, value in counts.items():
    lst.append((key, value))
lst.sort()
for hour, count in lst:
    print(hour, count)
file = open('words.txt')
dic = dict()
lst = list()
for line in file:
    line = line.rstrip()
    if not line.startswith('From '):
        continue
    words = line.split()
    words = words[5].split(':')
    words = words[0]
    dic[words] = dic.get(words, 0) + 1
for k, v in dic.items():
    lst.append((k, v))
lst.sort()
for k, v in lst:
    print(k, v)
name = input("Enter file:")
if len(name) < 1 : name = "mbox-short.txt"
handle = open(name)
g = {}
for line in handle:
    if not line.startswith('From'): continue
    werds = line.split()[5:6]
    for werd in werds:
        we = werd.split(':')[0]
        g[we] = g.get(we, 0) + 1
lst = list()
for v, k in g.items():
    new = (v, k)
    lst.append(new)
lst = sorted(lst)
for v, k in lst:
    print(v, k)
name = input("Enter file:")
if len(name) < 1 : name = "mbox-short.txt"
handle = open(name)
one = dict()
for line1 in handle:
    if line1.startswith("From "):
        lst1 = line1.split()
        lst2 = lst1[5].split(":")
        word = lst2[0]
        one[word] = one.get(word, 0) + 1
lst3 = list()
for k, v in one.items():
    tup = (k, v)
    lst3.append(tup)
lst3 = sorted(lst3)
for k, v in lst3:
    print(k, v)
name = input("Enter file:")
if len(name) < 1 : name = "mbox-short.txt"
handle = open(name)
h = dict()
for line in handle:
    if line.startswith('From '):
        l = line.split()[5].split(':')[0]
        h[l] = h.get(l, 0) + 1
for k, v in sorted(h.items()):
    print(k, v)
This worked for me:-
Opening the file
name = input("Enter file:")
if len(name) < 1 : name = "mbox-short.txt"
handle = open(name)
lst = list()
counts = dict()
Split each line of the file into words, take the word at index 5, split that word on ':', and append the first piece (the hour) to the list:
for lines in handle:
    if not lines.startswith('From '): continue
    words = lines.split()
    words = words[5]
    words = words.split(':')
    lst.append(words[0])
Now count how many times each hour occurred:
for i in lst:
    counts[i] = counts.get(i, 0) + 1
Finally, sort them by key (here, the hour):
for k, v in sorted(counts.items()):
    print(k, v)
name = input("Enter file: ")
if len(name) < 1:
    name = "mbox-short.txt"
handle = open(name)
hours = dict()
for line in handle:
    # Skipping lines we don't need
    if not line.startswith("From "):
        continue
    words = line.split()
    # Finding text in the line that we need
    time = words[5]
    time = time.split(":")
    hour_time = time[0]
    # Adding to the dictionary and checking if it is already there
    hours[hour_time] = hours.get(hour_time, 0) + 1
# Sorting the dictionary using the sorted() function
hours_sorted = sorted(hours.items())
for key, value in hours_sorted:
    print(key, value)

Can I use bisect to print the content of a line?

I have a file where the lines are ordered alphabetically. The file is 12 GB, so I can't simply scan it line by line for every query. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os

class Query(object):
    def __init__(self, query, index=5):
        self.query = query
        self.index = index

    def __lt__(self, comparable):
        return self.query < comparable[self.index:]

class FileSearcher(object):
    def __init__(self, file_pointer, record_size=35):
        self.file_pointer = file_pointer
        self.file_pointer.seek(0, os.SEEK_END)
        self.record_size = record_size + len(os.linesep)
        self.num_bytes = self.file_pointer.tell()
        self.file_size = (self.num_bytes // self.record_size)

    def __len__(self):
        return self.file_size

    def __getitem__(self, item):
        self.file_pointer.seek(item * self.record_size)
        return self.file_pointer.read(self.record_size)

with open('myfile') as file_to_search:
    query = 'fox\t'  # token to query
    wrapped_query = Query(query)
    searchable_file = FileSearcher(file_to_search)
    linepos = bisect.bisect(searchable_file, wrapped_query)
    print "Located # line: ", linepos
    # print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?
If you want to go with a Python solution, you can do the following:
Read the file in small chunks of MAX_LINE bytes, each time moving forward by a fixed offset.
That offset determines the block size.
For each such read, determine the key (the first word in a line).
These keys serve as delimiters of the blocks.
Construct the list of such keys. The list will already be sorted, since the keys are ordered.
You may persist such a list somewhere via pickle/json.dumps/...
When querying, find via bisect the index of the block where your key is located.
Read that block entirely and find the key with its data.
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect

MAX_LINE = 7
BLOCK_SIZE = 10

def parse_chunks(filename):
    size = os.path.getsize(filename)
    chunks = []
    with open(filename, 'rb') as file:
        block = file.read(MAX_LINE*2).decode()
        first_line = block[:block.find('\n') + 1]
        chunks.append(first_line.split()[0])
        pos = BLOCK_SIZE
        while pos < size:
            file.seek(pos)
            block = file.read(MAX_LINE*2).decode()
            first_eol = block.find('\n')
            second_eol = block.find('\n', first_eol + 1)
            if first_eol == -1 or second_eol == -1:
                break
            line = block[first_eol + 1:second_eol]
            key = line.split()[0]
            chunks.append(key)
            pos += BLOCK_SIZE
    return chunks

if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)
    query = 'abc'
    pos_before = bisect(chunks, query) - 1
    with open(filename, 'rb') as file:
        file.seek(pos_before*BLOCK_SIZE)
        block = file.read(BLOCK_SIZE + MAX_LINE).decode()
        line_start = block.find(query)
        line_end = block.find('\n', line_start + 1)
        line = block[line_start:line_end]
        print(line)
In this toy example I use a block size of 10 bytes; for your 12 GB file I'd suggest starting with 1 MB.
The following recursive function should be able to narrow the search interval. I'm not sure that you can modify it so that it returns a match or None for no match.
def bisearch(f, word, i, j):
    if (j - i) < 1E6:
        return i, j
    k = (i + j) // 2
    f.seek(k)
    while k < j:
        c = f.read(1)
        k = k + 1
        if c == '\n':
            break
    else:
        # ??? no match ??? I'm not sure
        pass
    w = []
    while 1:
        c = f.read(1)
        if c == '\t':
            break
        w.append(c)
    w = "".join(w)
    if w == word:
        return k, k
    if w < word:
        return bisearch(f, word, k, j)
    else:
        return bisearch(f, word, i, k)
and here an example of usage
word = ...
f = open(...)
i,j = bisearch(f, word, 0, len_f)
f.seek(i)
if i==j:
line = f.readline()
else:
#################### EDIT ################
# OLD
# buffer = f.read(1E6)
# NEW
buffer = f.read(j-i)
lenw = len(word)
for line in buffer.split('\n'):
if line[:lenw] == word: break
else:
# no matches, SOS
result = process(line)
Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()
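If the file really does consist of fixed-width records, as FileSearcher assumes, you can also reuse its __getitem__ to pull the record back out. Since bisect only returns an insertion point, a cautious (untested) sketch checks the record at that index and the one just before it, still inside the same with block as the original snippet:

for idx in (linepos, linepos - 1):
    if 0 <= idx < len(searchable_file):
        record = searchable_file[idx]
        if record.startswith(query):   # query is 'fox\t' from the snippet above
            print record.rstrip()
            break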

Word count with pattern in Python

So this is the question:
Write a program to read in multiple lines of text and count the number
of words in which the rule i before e, except after c is broken, and
number of words which contain either ei or ie and which don't break
the rule.
For this question, we only care about the c if it is the character
immediately before the ie or the ei. So science counts as breaking the
rule, but mischievous doesn't. If a word breaks the rule twice (like
obeisancies), then it should still only be counted once.
Example given:
Line: The science heist succeeded
Line: challenge accepted
Line:
Number of times the rule helped: 0
Number of times the rule was broken: 2
and my code:
rule = []
broken = []
line = None
while line != '':
    line = input('Line: ')
    line.replace('cie', 'broken')
    line.replace('cei', 'rule')
    line.replace('ie', 'rule')
    line.replace('ei', 'broken')
    a = line.count('rule')
    b = line.count('broken')
    rule.append(a)
    broken.append(b)
print(sum(a)); print(sum(b))
How do I fix my code, to work like the question wants it to?
I'm not going to write the code to your exact specification as it sounds like homework but this should help:
import pprint

words = ['science', 'believe', 'die', 'friend', 'ceiling',
         'receipt', 'seize', 'weird', 'vein', 'foreign']

rule = {}
rule['ie'] = []
rule['ei'] = []
rule['cei'] = []
rule['cie'] = []

for word in words:
    if 'ie' in word:
        if 'cie' in word:
            rule['cie'].append(word)
        else:
            rule['ie'].append(word)
    if 'ei' in word:
        if 'cei' in word:
            rule['cei'].append(word)
        else:
            rule['ei'].append(word)

pprint.pprint(rule)
Save it to a file like i_before_e.py and run python i_before_e.py:
{'cei': ['ceiling', 'receipt'],
'cie': ['science'],
'ei': ['seize', 'weird', 'vein', 'foreign'],
'ie': ['believe', 'die', 'friend']}
You can easily count the occurrences with:
for key in rule.keys():
    print "%s occurred %d times." % (key, len(rule[key]))
Output:
ei occurred 4 times.
ie occurred 3 times.
cie occurred 1 times.
cei occurred 2 times.
Firstly, replace does not change anything in place. What you need is the return value:
line = 'hello there' # line = 'hello there'
line.replace('there','bob') # line = 'hello there'
line = line.replace('there','bob') # line = 'hello bob'
Also I would assume you want actual totals so:
print('Number of times the rule helped: {0}'.format(sum(rule)))
print('Number of times the rule was broken: {0}'.format(sum(broken)))
You are printing a and b. These are the numbers of times the rule worked and was broken in the last line processed. You want totals.
As a sidenote: Regular expressions are good for things like this. re.findall would make this a lot more sturdy and pretty:
line = 'foo moo goo loo foobar cheese is great '
foo_matches = len(re.findall('foo', line)) # = 2
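Along those lines, a rough sketch of the whole exercise with one regex per case; the lookbehind patterns and the decision to count a word as broken when it contains both a broken and a valid occurrence are my own assumptions:

import re

helped = 0
broken = 0
for line in iter(lambda: input('Line: '), ''):
    for word in line.lower().split():
        # the rule is broken by "cie", or by "ei" not preceded by "c"
        if re.search(r'cie|(?<!c)ei', word):
            broken += 1          # each word is counted at most once
        # the rule holds for "cei", or for "ie" not preceded by "c"
        elif re.search(r'cei|(?<!c)ie', word):
            helped += 1
print('Number of times the rule helped:', helped)
print('Number of times the rule was broken:', broken)

On the question's own example ("The science heist succeeded" / "challenge accepted") this gives 0 helped and 2 broken, matching the expected output.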
Let's split the logic up into functions; that should help us reason about the code and get it right. To loop over the input lines, we can use the iter function:
def rule_applies(word):
    return 'ei' in word or 'ie' in word

def complies_with_rule(word):
    if 'cie' in word:
        return False
    if word.count('ei') > word.count('cei'):
        return False
    return True

helped_count = 0
broken_count = 0
lines = iter(lambda: input("Line: "), '')
for line in lines:
    for word in line.split():
        if rule_applies(word):
            if complies_with_rule(word):
                helped_count += 1
            else:
                broken_count += 1

print("Number of times the rule helped:", helped_count)
print("Number of times the rule was broken:", broken_count)
We can make the code more concise by shortening the complies_with_rule function and by using generator expressions and Counter:
from collections import Counter

def rule_applies(word):
    return 'ei' in word or 'ie' in word

def complies_with_rule(word):
    return 'cie' not in word and word.count('ei') == word.count('cei')

lines = iter(lambda: input("Line: "), '')
words = (word for line in lines for word in line.split())
words_considered = (word for word in words if rule_applies(word))
did_rule_help_count = Counter(complies_with_rule(word) for word in words_considered)
print("Number of times the rule helped:", did_rule_help_count[True])
print("Number of times the rule was broken:", did_rule_help_count[False])
If I understand correctly, your main problem is getting a unique result per word. Is this what you are trying to achieve?
rule_count = 0
break_count = 0
line = None
while line != '':
    line = input('Line: ')
    rule_found = False
    break_found = False
    for word in line.split():
        if 'cie' in line:
            line = line.replace('cie', '')
            break_found = True
        if 'cei' in line:
            line = line.replace('cei', '')
            rule_found = True
        if 'ie' in line:
            rule_found = True
        if 'ei' in line:
            break_found = True
    if rule_found:
        rule_count += 1
    if break_found:
        break_count += 1
print(rule_count); print(break_count)
rule = []
broken = []
tb = 0
tr = 0
line = ' '
while line:
    lines = input('Line: ')
    line = lines.split()
    for word in line:
        if 'ie' in word:
            if 'cie' in word:
                tb += 1
            elif word.count('cie') > 1:
                tb += 1
            elif word.count('ie') > 1:
                tr += 1
            elif 'ie' in word:
                tr += 1
        if 'ei' in word:
            if 'cei' in word:
                tr += 1
            elif word.count('cei') > 1:
                tr += 1
            elif word.count('ei') > 1:
                tb += 1
            elif 'ei' in word:
                tb += 1
print('Number of times the rule helped: {0}'.format(tr))
print('Number of times the rule was broken: {0}'.format(tb))
Done.
