Issue with list indexing while converting letters (devnagari to english) - python

I am currently trying to map devnagari script with English alphabets. But once in a while I run into the error list index out of range . I don't want to miss out on any list . This is why I do not want to use error handling unless it is necessary. Could you please look into my script and help out why this error is occurring ?
In my word file I have located which word is causing the error but then If i use couple of sentence up and down from that word then the error is not there . i.e I think the error happens at a specific length of string.
clean=[]
dafuq=[]
clean_list = []
replacements = {'अ':'A','आ':'AA', 'इ':'I', 'ई':'II', 'उ':'U','ऊ':'UU', 'ए':'E', 'ऐ':'AI',
'ओ':'O','औ':'OU', 'क':'KA', 'ख':'KHA', 'ग':'GA', 'घ':'GHA', 'ङ':'NGA',
'च':'CA','छ':'CHHA', 'ज':'JA', 'झ':'JHA','ञ':'NIA', 'ट':'TA', 'ठ':'THA',
'ड':'DHA','ढ':'DHHA', 'ण':'NAE', 'त':'TA', 'थ':'THA','द':'DA', 'ध':'DHA',
'न':'NA','प':'PA', 'फ':'FA', 'ब':'B', 'भ':'BHA', 'म':'MA','य':'YA', 'र':'RA',
'ल':'L','व':'WA', 'स':'SA', 'ष':'SHHA', 'श':'SHA', 'ह':'HA', '्':'A',
'ऋ':'RI', 'ॠ':'RI','ऌ':'LI','ॐ':'OMS', 'ः':' ', 'ँ':'U',
'ं':'M', 'ृ':'RI', 'ा':'AA', 'ी':'II', 'ि':'I', 'े':'E', 'ै':'AI',
'ो':'O','ौ':'OU','ु' :'U','ू':'UU' }
import unicodedata
from functools import reduce
def reducer(r, v):
if unicodedata.category(v) in ('Mc', 'Mn'):
r[-1] = r[-1] + v
else:
r.append(v)
return r
with open('words_original.txt', mode='r',encoding="utf-8") as f:
with open ('alphabeths.txt', mode='w+', encoding='utf-8') as d:
with open('only_words.txt', mode='w+', encoding="utf-8") as e:
chunk_size = 4096
f_chunk = f.read(chunk_size)
while len(f_chunk)>0:
for word in f_chunk.split():
for char in ['।', ',', '’', '‘', '?','#','1','2','3','4','0','5','6','7','8','9',
'१','२','३','४','५','.''६','७','८','९','०', '5','6','7','8','9','0','\ufeff']:
if char in word:
word = word.replace(char, '')
if word.strip():
clean_list.append(word)
f_chunk = f.read(chunk_size)
for clean_word in clean_list:
test_word= reduce(reducer,clean_word,[])
final_word= (''.join(test_word))
dafuq.append(final_word)
print (final_word)
f_chunk = f.read(chunk_size)
This is the file I am testing it on
words_original.txt
words_original.txt
stacktrace error
Traceback (most recent call last):
File "C:\Users\KUSHAL\Desktop\EARTHQUAKE_PYTHON\test.py", line 82, in <module>
test_word= reduce(reducer,clean_word,[])
File "C:\Users\KUSHAL\Desktop\EARTHQUAKE_PYTHON\test.py", line 27, in reducer
r[-1] = r[-1] + v
IndexError: list index out of range

The problem lay with some unicode characters. It worked after removing them.

Related

Unhashable type: list

I am working on a program that parses through log files and returns the top hits for IP addresses and a couple other things. Currently I am having trouble and I cannot interpret any of the answers to this problem to what I have going on right now. This is all of my code:
import gzip
from collections import Counter
logFileName = open('C:\\Users\\Pawlaczykm\\Desktop\\fileNames.txt', 'r')
ipAdd = []
landingPages = []
ALL_ipAdd = []
ALL_landingPages = []
# everything after this line gets done to all files
for line in logFileName.readlines():
# rstrip removes a blank line from output
# print 'Summary of: ' + line.rstrip()
# use gzip to decompress the file
with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
# we extract the ip addresses in lines 15-18
for eachLine in f:
parts = eachLine.split('\t')
if len(parts) > 1:
ipAdd.append(parts[2])
ALL_ipAdd.append(ipAdd)
# use gzip to decompress the file
with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
# we extract the landing pages
for eachLine in f:
parts = eachLine.split('\t')
if len(parts) > 1:
variable = parts[8].split('?')[0]
landingPages.append(variable)
v): (-v, k))[:10]
ALL_landingPages.append(landingPages)
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
sortedALL_ipAdd = sorted(ALL_ipAddDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top IPs of all files'
print(sortedALL_ipAdd)
ALL_LandingPageDict = dict(Counter(ALL_landingPages).most_common())
sortedALL_LandingPage = sorted(ALL_LandingPageDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top landing pages of all files'
print (sortedALL_LandingPage)
Now where I am having trouble is in the following line:
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
The output when I run the whole program is this:
Traceback (most recent call last):
File "C:/Users/Pawlaczykm/PycharmProjects/LogParse/parseText.py", line 35, in <module>
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
File "C:\Python27\lib\collections.py", line 477, in __init__
self.update(*args, **kwds)
File "C:\Python27\lib\collections.py", line 567, in update
self[elem] = self_get(elem, 0) + 1
TypeError: unhashable type: 'list'
Can somebody help me? This is frustrating.
From your code ALL_ipAdd = [] and ipAdd = [] and ALL_ipAdd.append(ipAdd) we can conclude that ALL_ipAdd is a list of list. Counter is a subtype of dict, which hashes its items before it counts them. Lists cannot be hashed because they are mutable (if the list changed the hash would change) and thus lists can't be counted by Counter objects.
To solve this you can convert the inner lists to tuples before counting them:
ALL_ipAddDict = dict(Counter(map(tuple, ALL_ipAdd)).most_common())
That's normal. ALL_ipAdd is a list of lists. Counter needs a list, a string or any other hashable type :)

Python - How do i get filename in wordcount mapreduce job

My task is to get wordcount from txt file using mapreduce job. However, im getting key error when i try to print filename with wordcount. Please help me out.
#!/usr/bin/env python
import sys
import os
import re
# input comes from STDIN (standard input)
for line in sys.stdin:
stopwords = ['a','able','about','across','after','all','almost','also','am','among','an','and','any','are','as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','like','likely','may','me','might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','why','will','with','would','yet','you','your']
# remove leading and trailing whitespace
line = line.strip()
# split the line into words
fname = os.environ['map_input_file']
words = re.findall(r"[A-Za-z]+", line)
words = line.split()
words = [word for word in words if word not in stopwords]
# increase counters
for word in words:
# write the results to STDOUT (standard output);
# what we output here will be the input for the
# Reduce step, i.e. the input for reducer.py
#
# tab-delimited; the trivial word count is 1
print '%s\t%s' % (word + ' ' + fname, 1)
I have to pass the reducer (word&filename, 1). With above code I gets key error.
File "/home/s/ss/ssa8455/mapper.py", line 12, in ?
fname = os.environ['map_input_file']
File "/usr/lib64/python2.4/UserDict.py", line 17, in __getitem__
def __getitem__(self, key): return self.data[key]
KeyError: 'map_input_file'

Python - U.S. ZipCode Matching

I'm working with Regex and I'm brand new to using python. I can't get the program to read from file and go through the match case properly. I'm getting a traceback error that looks like this:
Traceback (most recent call last):
File "C:\Users\Systematic\workspace\Project8\src\zipcode.py", line 18, in <module>
m = re.match(info, pattern)
File "C:\Python34\lib\re.py", line 160, in match
return _compile(pattern, flags).match(string)
File "C:\Python34\lib\re.py", line 282, in _compile
p, loc = _cache[type(pattern), pattern, flags]
TypeError: unhashable type: 'list'
zipin.txt:
3285
32816
32816-2362
32765-a234
32765-23
99999-9999
zipcode.py:
from pip._vendor.distlib.compat import raw_input
import re
userinput = raw_input('Please enter the name of the file containing the input zipcodes: ')
myfile = open(userinput)
info = myfile.readlines()
pattern = '^[0-9]{5}(?:-[0-9]{4})?$'
m = re.match(info, pattern)
if m is not None:
print("Match found - valid U.S. zipcode: " , info, "\n")
else: print("Error - no match - invalid U.S. zipcode: ", info, "\n")
myfile.close()
The problem is that readlines() returns a list, and re operates on stuff that is string like. Here is one way it could work:
import re
zip_re = re.compile('^[0-9]{5}(?:-[0-9]{4})?$')
for l in open('zipin.txt', 'r'):
m = zip_re.match(l.strip())
if m:
print l
break
if m is None:
print("Error - no match")
The code now operates in a loop over the file lines, and attempts to match the re on a stripped version of each line.
Edit:
It's actually possible to write this in a much shorter, albeit less clear way:
next((l for l in open('zipin.txt', 'r') if zip_re.match(l.strip())), None)

Read special characters from .txt file in python

The goal of this code is to find the frequency of words used in a book.
I am tying to read in the text of a book but the following line keeps throwing my code off:
precious protégés. No, gentlemen; he'll always show 'em a clean pair
specifically the é character
I have looked at the following documentation, but I don't quite understand it: https://docs.python.org/3.4/howto/unicode.html
Heres my code:
import string
# Create word dictionary from the comprehensive word list
word_dict = {}
def create_word_dict ():
# open words.txt and populate dictionary
word_file = open ("./words.txt", "r")
for line in word_file:
line = line.strip()
word_dict[line] = 1
# Removes punctuation marks from a string
def parseString (st):
st = st.encode("ascii", "replace")
new_line = ""
st = st.strip()
for ch in st:
ch = str(ch)
if (n for n in (1,2,3,4,5,6,7,8,9,0)) in ch or ' ' in ch or ch.isspace() or ch == u'\xe9':
print (ch)
new_line += ch
else:
new_line += ""
# now remove all instances of 's or ' at end of line
new_line = new_line.strip()
print (new_line)
if (new_line[-1] == "'"):
new_line = new_line[:-1]
new_line.replace("'s", "")
# Conversion from ASCII codes back to useable text
message = new_line
decodedMessage = ""
for item in message.split():
decodedMessage += chr(int(item))
print (decodedMessage)
return new_line
# Returns a dictionary of words and their frequencies
def getWordFreq (file):
# Open file for reading the book.txt
book = open (file, "r")
# create an empty set for all Capitalized words
cap_words = set()
# create a dictionary for words
book_dict = {}
total_words = 0
# remove all punctuation marks other than '[not s]
for line in book:
line = line.strip()
if (len(line) > 0):
line = parseString (line)
word_list = line.split()
# add words to the book dictionary
for word in word_list:
total_words += 1
if (word in book_dict):
book_dict[word] = book_dict[word] + 1
else:
book_dict[word] = 1
print (book_dict)
# close the file
book.close()
def main():
wordFreq1 = getWordFreq ("./Tale.txt")
print (wordFreq1)
main()
The error that I received is as follows:
Traceback (most recent call last):
File "Books.py", line 80, in <module>
main()
File "Books.py", line 77, in main
wordFreq1 = getWordFreq ("./Tale.txt")
File "Books.py", line 60, in getWordFreq
line = parseString (line)
File "Books.py", line 36, in parseString
decodedMessage += chr(int(item))
OverflowError: Python int too large to convert to C long
When you open a text file in python, the encoding is ANSI by default, so it doesn't contain your é chartecter. Try
word_file = open ("./words.txt", "r", encoding='utf-8')
The best way I could think of is to read each character as an ASCII value, into an array, and then take the char value. For example, 97 is ASCII for "a" and if you do char(97) it will output "a". Check out some online ASCII tables that provide values for special characters also.
Try:
def parseString(st):
st = st.encode("ascii", "replace")
# rest of code here
The new error you are getting is because you are calling isalpha on an int (i.e. a number)
Try this:
for ch in st:
ch = str(ch)
if (n for n in (1,2,3,4,5,6,7,8,9,0) if n in ch) or ' ' in ch or ch.isspace() or ch == u'\xe9':
print (ch)

open: invalid mode or filename

This is the word count program. how could it be made more simple?
import re
from collections import Counter
with open('C:\Data\test.txt') as f:
passage = f.read()
words = re.findall(r'\w+', passage)
cap_words = [word.upper() for word in words]
word_counts = Counter(cap_words)
keep getting this error message:
Traceback (most recent call last):
File "C:/Python27/wordcount", line 4, in <module>
with open('C:\Data\test.txt') as f:
IOError: [Errno 22] invalid mode ('r') or filename: 'C:\\Data\test.txt'
Use a raw string or escape each \ with a \. That is required because without it '\t' will be converted to a tab space:
r'C:\Data\test.txt'
Example:
>>> print 'C:\Data\test.txt'
C:\Data est.txt #\t is converted to tab
>>> print r'C:\Data\test.txt'
C:\Data\test.txt #here it is fine
>>> print 'C:\\Data\\test.txt' #same as raw string, but manual escaping
C:\Data\test.txt

Categories

Resources