ValueError with NLTK - python

Using NLTK, I'm trying to print a line of text if the last word of the line has an "NN" POS tag, but I'm getting: "ValueError: too many values to unpack" on the following code. Any ideas why? Thanks in advance.
import nltk
from nltk.tokenize import word_tokenize
def end_of_line():
filename = raw_input("Please enter a text file.> ")
with open(filename) as f:
for line in f:
linewords = nltk.tokenize.word_tokenize(line)
lw_tagged = nltk.tag.pos_tag(linewords)
last_lw_tagged = lw_tagged.pop()
for (word, tag) in last_lw_tagged:
if tag == "NN":
print line
end_of_line()
Traceback (most recent call last):
File "/private/var/folders/ly/n5ph6rcx47q8zz_j4pcj3b880000gn/T/Cleanup At Startup/endofline-477697124.590.py", line 15, in <module>
end_of_line()
File "/private/var/folders/ly/n5ph6rcx47q8zz_j4pcj3b880000gn/T/Cleanup At Startup/endofline-477697124.590.py", line 11, in end_of_line
for (word, tag) in last_lw_tagged:
ValueError: too many values to unpack
logout

Instead of this:
for (word, tag) in last_lw_tagged:
if tag == "NN":
Do this:
if last_lw_tagged[1] == "NN:

Related

word counting in python in file with spaces in name

Here is my code
fname = input("Enter file name: ")
word=input("Enter word to be searched:")
k = 0
with open(fname, 'r') as f:
for line in f:
words = line.split()
for i in words:
if(i==word):
k=k+1
print("Occurrences of the word:")
print(k)
I am running it on windows if the file name is having spaces in it
such as "some file xyz.txt"
then upon running the above code I am getting error
Enter file name: "some file xyz.txt"
Enter word to be searched:cs
Traceback (most recent call last):
File "D:/folder1/folder2/folder3/some file xyz.txt", line 5, in <module>
with open(fname, 'r') as f:
OSError: [Errno 22] Invalid argument: '"some file xyz.txt"'
>>>
How should I enter correct file name with spaces or the code itself is wrong?
Just enter the file name without quotes. Python treats your input as a whole string.

Issue with list indexing while converting letters (devnagari to english)

I am currently trying to map devnagari script with English alphabets. But once in a while I run into the error list index out of range . I don't want to miss out on any list . This is why I do not want to use error handling unless it is necessary. Could you please look into my script and help out why this error is occurring ?
In my word file I have located which word is causing the error but then If i use couple of sentence up and down from that word then the error is not there . i.e I think the error happens at a specific length of string.
clean=[]
dafuq=[]
clean_list = []
replacements = {'अ':'A','आ':'AA', 'इ':'I', 'ई':'II', 'उ':'U','ऊ':'UU', 'ए':'E', 'ऐ':'AI',
'ओ':'O','औ':'OU', 'क':'KA', 'ख':'KHA', 'ग':'GA', 'घ':'GHA', 'ङ':'NGA',
'च':'CA','छ':'CHHA', 'ज':'JA', 'झ':'JHA','ञ':'NIA', 'ट':'TA', 'ठ':'THA',
'ड':'DHA','ढ':'DHHA', 'ण':'NAE', 'त':'TA', 'थ':'THA','द':'DA', 'ध':'DHA',
'न':'NA','प':'PA', 'फ':'FA', 'ब':'B', 'भ':'BHA', 'म':'MA','य':'YA', 'र':'RA',
'ल':'L','व':'WA', 'स':'SA', 'ष':'SHHA', 'श':'SHA', 'ह':'HA', '्':'A',
'ऋ':'RI', 'ॠ':'RI','ऌ':'LI','ॐ':'OMS', 'ः':' ', 'ँ':'U',
'ं':'M', 'ृ':'RI', 'ा':'AA', 'ी':'II', 'ि':'I', 'े':'E', 'ै':'AI',
'ो':'O','ौ':'OU','ु' :'U','ू':'UU' }
import unicodedata
from functools import reduce
def reducer(r, v):
if unicodedata.category(v) in ('Mc', 'Mn'):
r[-1] = r[-1] + v
else:
r.append(v)
return r
with open('words_original.txt', mode='r',encoding="utf-8") as f:
with open ('alphabeths.txt', mode='w+', encoding='utf-8') as d:
with open('only_words.txt', mode='w+', encoding="utf-8") as e:
chunk_size = 4096
f_chunk = f.read(chunk_size)
while len(f_chunk)>0:
for word in f_chunk.split():
for char in ['।', ',', '’', '‘', '?','#','1','2','3','4','0','5','6','7','8','9',
'१','२','३','४','५','.''६','७','८','९','०', '5','6','7','8','9','0','\ufeff']:
if char in word:
word = word.replace(char, '')
if word.strip():
clean_list.append(word)
f_chunk = f.read(chunk_size)
for clean_word in clean_list:
test_word= reduce(reducer,clean_word,[])
final_word= (''.join(test_word))
dafuq.append(final_word)
print (final_word)
f_chunk = f.read(chunk_size)
This is the file I am testing it on
words_original.txt
words_original.txt
stacktrace error
Traceback (most recent call last):
File "C:\Users\KUSHAL\Desktop\EARTHQUAKE_PYTHON\test.py", line 82, in <module>
test_word= reduce(reducer,clean_word,[])
File "C:\Users\KUSHAL\Desktop\EARTHQUAKE_PYTHON\test.py", line 27, in reducer
r[-1] = r[-1] + v
IndexError: list index out of range
The problem lay with some unicode characters. It worked after removing them.

Python - U.S. ZipCode Matching

I'm working with Regex and I'm brand new to using python. I can't get the program to read from file and go through the match case properly. I'm getting a traceback error that looks like this:
Traceback (most recent call last):
File "C:\Users\Systematic\workspace\Project8\src\zipcode.py", line 18, in <module>
m = re.match(info, pattern)
File "C:\Python34\lib\re.py", line 160, in match
return _compile(pattern, flags).match(string)
File "C:\Python34\lib\re.py", line 282, in _compile
p, loc = _cache[type(pattern), pattern, flags]
TypeError: unhashable type: 'list'
zipin.txt:
3285
32816
32816-2362
32765-a234
32765-23
99999-9999
zipcode.py:
from pip._vendor.distlib.compat import raw_input
import re
userinput = raw_input('Please enter the name of the file containing the input zipcodes: ')
myfile = open(userinput)
info = myfile.readlines()
pattern = '^[0-9]{5}(?:-[0-9]{4})?$'
m = re.match(info, pattern)
if m is not None:
print("Match found - valid U.S. zipcode: " , info, "\n")
else: print("Error - no match - invalid U.S. zipcode: ", info, "\n")
myfile.close()
The problem is that readlines() returns a list, and re operates on stuff that is string like. Here is one way it could work:
import re
zip_re = re.compile('^[0-9]{5}(?:-[0-9]{4})?$')
for l in open('zipin.txt', 'r'):
m = zip_re.match(l.strip())
if m:
print l
break
if m is None:
print("Error - no match")
The code now operates in a loop over the file lines, and attempts to match the re on a stripped version of each line.
Edit:
It's actually possible to write this in a much shorter, albeit less clear way:
next((l for l in open('zipin.txt', 'r') if zip_re.match(l.strip())), None)

open: invalid mode or filename

This is the word count program. how could it be made more simple?
import re
from collections import Counter
with open('C:\Data\test.txt') as f:
passage = f.read()
words = re.findall(r'\w+', passage)
cap_words = [word.upper() for word in words]
word_counts = Counter(cap_words)
keep getting this error message:
Traceback (most recent call last):
File "C:/Python27/wordcount", line 4, in <module>
with open('C:\Data\test.txt') as f:
IOError: [Errno 22] invalid mode ('r') or filename: 'C:\\Data\test.txt'
Use a raw string or escape each \ with a \. That is required because without it '\t' will be converted to a tab space:
r'C:\Data\test.txt'
Example:
>>> print 'C:\Data\test.txt'
C:\Data est.txt #\t is converted to tab
>>> print r'C:\Data\test.txt'
C:\Data\test.txt #here it is fine
>>> print 'C:\\Data\\test.txt' #same as raw string, but manual escaping
C:\Data\test.txt

extract tweets from a text file (python)

Sorry, I am just trying to store 'id_str' from each tweet to a new list called ids[]..
but getting the following error:
Traceback (most recent call last):
File "extract_tweet.py", line 17, in
print tweet['id_str']
KeyError: 'id_str'
My code is:
import json
import sys
if __name__ == '__main__':
tweets = []
for line in open (sys.argv[1]):
try:
tweets.append(json.loads(line))
except:
pass
ids = []
for tweet in tweets:
ids.append(tweet['id_str'])
The json data from tweets are sometimes missing fields. Try something like this,
ids = []
for tweet in tweets:
if 'id_str' in tweet:
ids.append(tweet['id_str'])
or equivalently,
ids = [tweet['id_str'] for tweet in tweets if 'id_str' in tweet]
import json
tweets = []
tweets.append(
json.loads('{"a": 1}')
)
tweet = tweets[0]
print(tweet)
print( tweet['id_str'] )
--output:--
{'a': 1}
Traceback (most recent call last):
File "1.py", line 9, in <module>
print( tweet['id_str'] )
KeyError: 'id_str'
And:
my_dict = {u"id_str": 1}
print my_dict["id_str"]
--output:--
1

Categories

Resources