unicode error displayed on the server on running app (django) - python

my views.py code:
#!/usr/bin/python
from django.template import loader, RequestContext
from django.http import HttpResponse
#from skey import find_root_tags, count, sorting_list
from search.models import Keywords
from django.shortcuts import render_to_response as rr
def front_page(request):
    """Show the search form, or on POST run the search and list the results.

    For a POST, the submitted ``word`` is looked up in each file named in
    ``/home/pooja/Desktop/xml.txt`` (one path per line), every
    ``(file name, count)`` pair is saved as a ``Keywords`` row, and all
    stored rows are rendered with ``search/results.html``.  Any other
    request method just renders ``search/front_page.html``.
    """
    if request.method != 'POST':
        template = loader.get_template('search/front_page.html')
        return HttpResponse(template.render(RequestContext(request)))

    # Imported here, as in the original view; the module-level import of
    # these names is commented out at the top of the file.
    from skey import find_root_tags, count, sorting_list

    str1 = request.POST['word']
    # Local accumulator.  The original appended to the name ``list``, which
    # was only assigned in the non-POST branch.
    results = []
    with open("/home/pooja/Desktop/xml.txt", "r") as fo:
        # One path is read per entry in ``count`` -- presumably skey keeps
        # one counter per listed file; confirm against skey.py.
        for i in range(len(count)):
            file_name = fo.readline().rstrip('\n')
            find_root_tags(file_name, str1, i)
            results.append((file_name, count[i]))

    # The return value was never used, so this presumably sorts in place.
    sorting_list(results)
    for name, count1 in results:
        Keywords(file_name=name, frequency_count=count1).save()

    list1 = Keywords.objects.all()
    t = loader.get_template('search/results.html')
    # RequestContext takes the request first and the template variables
    # second; the original passed the dict in the request position.
    c = RequestContext(request, {'list1': list1})
    return HttpResponse(t.render(c))
skey.py has another function, called from within find_root_tags():
def find_text(file, str1, i):
    """Add the number of occurrences of ``str1`` in ``file`` to ``count[i]``.

    Everything matching the non-greedy pattern ``<.*?>`` is removed from the
    file's contents first, and both the remaining text and the search term
    are lower-cased before counting.
    """
    needle = str1.lower()
    tag_pattern = re.compile(r'<.*?>')
    with open(file) as handle:
        raw = handle.read()
    stripped = tag_pattern.sub('', raw).strip().lower()
    # This is the call the traceback below points at ("line 34").  The text
    # read from the file is a byte string, so if ``str1`` is unicode Python
    # has to decode the file text implicitly as ASCII here.
    hits = stripped.count(needle)
    count[i] = count[i] + hits
When I ran my app on the server, it gave me this error:
UnicodeDecodeError at /search/
'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
Request Method: POST
Request URL: http://127.0.0.1:8000/search/
Django Version: 1.4
Exception Type: UnicodeDecodeError
Exception Value:
'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
Exception Location: /home/pooja/Desktop/mysite/search/skey.py in find_text, line 34
Python Executable: /usr/bin/python
Python Version: 2.6.5
Python Path: ['/home/pooja/Desktop/mysite',
'/usr/lib/python2.6',
'/usr/lib/python2.6/plat-linux2',
'/usr/lib/python2.6/lib-tk',
'/usr/lib/python2.6/lib-old',
'/usr/lib/python2.6/lib-dynload',
'/usr/lib/python2.6/dist-packages',
'/usr/lib/python2.6/dist-packages/PIL',
'/usr/lib/python2.6/dist-packages/gst-0.10',
'/usr/lib/pymodules/python2.6',
'/usr/lib/python2.6/dist-packages/gtk-2.0',
'/usr/lib/pymodules/python2.6/gtk-2.0',
'/usr/local/lib/python2.6/dist-packages'] error :
Can anyone tell me why it is showing this error? How can I remove this error?
Please help.

You're mixing Unicode strings and bytestrings. str1 = request.POST['word'] is probably a Unicode string and text_only is a bytestring. Python fails to convert the latter to Unicode. You could use codecs.open() to specify the character encoding of the file. See Pragmatic Unicode and The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!).

Probably your str1 is unicode, but text_only is not (on line 34). The following is not a panacea, but if it corrects your problem then I am right.
k = u"{0}".format( text_only ).count(str1)

Related

Udacity Fundamental of programming with Python:

import urllib.request
def read_text():
    """Print the cover-letter file and pass its text to ``check_profanity``."""
    letter = open(r"C:\Users\Intel\Google Drive\Udacity\Full Stack\AbdoulCoverLetter.txt")
    letter_text = letter.read()
    print(letter_text)
    # The file is closed before the text is handed to the checker.
    letter.close()
    check_profanity(letter_text)
My code is showing the following error:
UnicodeEncodeError: 'ascii' codec can't encode character '\u2022' in position 154: ordinal not in range(128)
I am not sure what is the issue. Please help..
def check_profanity(text_to_check):
    """Request the wdylike service with ``text_to_check`` as the ``q`` value."""
    # NOTE(review): the text is concatenated into the URL without any
    # escaping -- presumably the source of the UnicodeEncodeError quoted
    # above when the text contains a character such as '\u2022'.
    with urllib.request.urlopen("http://www.wdylike.appspot.com/?q="+text_to_check) as response:
        connection = response.read()
        # NOTE(review): ``connection`` already holds what response.read()
        # returned; calling read()/close() on that value looks unintended --
        # confirm.
        output = connection.read()
        connection.close()
read_text()
I removed the bullet list and I changed the code. It is working fine now. Thanks..
import urllib.parse
import urllib.request


def read_text():
    """Print the cover letter and submit its text to the profanity checker."""
    path = r"C:\Users\Intel\Google Drive\Udacity\Full Stack\AbdoulCoverLetter.txt"
    # The context manager closes the file even if reading fails.
    with open(path) as quotes:
        contents_of_files = quotes.read()
    print(contents_of_files)
    check_profanity(contents_of_files)


def check_profanity(text_to_check):
    """Send ``text_to_check`` to the wdylike service and return its reply.

    The text is URL-encoded as the ``q`` query parameter, so spaces and
    non-ASCII characters are escaped before the request is made.

    Returns:
        The response body exactly as read from the connection.
    """
    # urllib.parse is imported explicitly above instead of relying on
    # urllib.request having imported it as a side effect.
    query = urllib.parse.urlencode([('q', text_to_check)])
    with urllib.request.urlopen("http://www.wdylike.appspot.com/?" + query) as connection:
        return connection.read()


read_text()

Python.- fuzzy.DMetaphone 'ascii' error

How is it possible that, with the same input, I sometimes get an ascii codec error and sometimes it works just fine? The code cleans the name and builds its Soundex and DMetaphone values. It works in ~1 out of 5 runs, sometimes more often :)
UPD: Looks like that's an issue of fuzzy.DMetaphone, at least on Python2.7 with Unicode. Plan to integrate Metaphone instead, for now. All solutions for fuzzy.DMetaphone problem are very welcome :)
UPD 2: Problem is gone after fuzzy update to 1.2.2. The same code works fine.
import re
import fuzzy
import sys
def make_namecard(full_name):
soundex = fuzzy.Soundex(4)
dmeta = fuzzy.DMetaphone(4)
names = process_name(full_name)
print names
soundexes = map(soundex, names)
dmetas = []
for name in names:
print name
dmetas.extend(list(dmeta(name)))
dmetas = filter(bool, dmetas)
return {
"full_name": full_name,
"soundex": soundexes,
"dmeta": dmetas,
"names": names,
}
def process_name(full_name):
    """Split ``full_name`` into cleaned tokens accepted by ``valid_name``.

    Underscores and hyphens become spaces, then every character other than
    ASCII letters, digits and spaces is removed before splitting on
    whitespace.
    """
    spaced = re.sub("[_-]", " ", full_name)
    cleaned = re.sub(r'[^A-Za-z0-9 ]', "", spaced)
    return [token for token in cleaned.split() if valid_name(token)]
def valid_name(name):
    """Return True if ``name`` has at least two characters and is not a common word.

    The common words are "the" and "of", compared case-insensitively.
    """
    if len(name) < 2:
        return False
    return name.lower() not in ("the", "of")
print make_namecard('Jerusalem Warriors')
Output:
➜ python2.7 make_namecard.py
['Jerusalem', 'Warriors']
Jerusalem
Warriors
{'soundex': [u'J624', u'W624'], 'dmeta': [u'\x00\x00\x00\x00', u'ARSL', u'ARRS', u'FRRS'], 'full_name': 'Jerusalem Warriors', 'names': ['Jerusalem', 'Warriors']}
➜ python2.7 make_namecard.py
['Jerusalem', 'Warriors']
Jerusalem
Traceback (most recent call last):
File "make_namecard.py", line 38, in <module>
print make_namecard('Jerusalem Warriors')
File "make_namecard.py", line 16, in make_namecard
dmetas.extend(list(dmeta(name)))
File "src/fuzzy.pyx", line 258, in fuzzy.DMetaphone.__call__
UnicodeDecodeError: 'ascii' codec can't decode byte 0xab in position 0: ordinal not in range(128)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xaa in position 2370: ordinal not in range(128) [duplicate]

This question already has an answer here:
how to interpret this error "UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 164: ordinal not in range(128)"
(1 answer)
Closed 5 years ago.
I'm writing a script in Python 3.5.3 that takes username/password combos from a file and writes them to another file. The script was written on a machine with Windows 10 and worked. However, when I tried to run the script on a MacBook running Yosemite, I got an error that has something to do with ASCII encoding.
The relevant function is this:
def buildDatabase():
    """Read username/password combos and derive a hash-table slot for each.

    Only the part of the function quoted in the question is shown here; the
    code that goes on to use ``successful``, ``collision``, ``hashWords``
    and ``hashTable`` is not part of this excerpt.
    """
    print("Building database, this may take some time...")
    # NOTE(review): no encoding is passed, so the platform default is used
    # to decode the file; the traceback below shows that decoding failing
    # inside the ``for line in passwords`` loop.
    passwords = open("10-million-combos.txt", "r") #File with user/pword combos.
    hashWords = open("Hashed Combos.txt", "a") #File where user/SHA-256 encrypted pwords will be stored.
    j = 0
    hashTable = [[ None ] for x in range(60001)] #A hashtable with 30,000 elements, quadratic probing means size must = 2 x the final size + 1
    for line in passwords:
        toSearch = line
        i = q = toSearch.find("\t") #The username/pword combos are formatted: username\tpassword\n.
        n = toSearch.find("\n")
        # NOTE(review): i is the index of the tab itself and the slice stops
        # at n-1, so as written the slice begins on the tab and leaves out
        # the character just before "\n" -- confirm this is intended.
        password = line[i:n-1] #i is the start of the password, n is the end of it
        username = toSearch[ :q] + ":" #q is the end of the username
        byteWord = password.encode('UTF-8')
        # NOTE(review): ``sha`` is not defined in this excerpt -- presumably
        # a hashlib object created elsewhere.  If one object is shared by
        # all iterations, update() also includes earlier passwords; confirm.
        sha.update(byteWord)
        toWrite = sha.hexdigest() #password is encoded to UTF-8, run thru SHA-256, and stored in toWrite
        skip = False
        if len(password) == 0: #if len(password) is 0, just skip it
            skip = True
        if len(password) == 1:
            doModulo = ord(password[0]) ** 4
        if len(password) == 2:
            doModulo = ord(password[0]) * ord(password[0]) * ord(password[1]) * ord(password[1])
        if len(password) == 3:
            doModulo = ord(password[0]) * ord(password[0]) * ord(password[1]) * ord(password[2])
        if len(password) > 3:
            doModulo = ord(password[0]) * ord(password[1]) * ord(password[2]) * ord(password[3])
        # NOTE(review): none of the branches above assigns doModulo when the
        # password is empty (skip is True) -- check how that case is handled.
        assignment = doModulo % 60001
        #The if block above gives each combo an assignment number for a hash table, indexed by password because they're more unique than usernames
        successful = False
        collision = 0
The error is as follows:
Traceback (most recent call last):
File "/Users/connerboehm/Documents/Conner B/PythonFinalProject.py", line 104, in <module>
buildDatabase()
File "/Users/connerboehm/Documents/Conner B/PythonFinalProject.py", line 12, in buildDatabase
for line in passwords:
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/encodings/ascii.py", line 26, in decode
return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xaa in position 2370: ordinal not in range(128)
What's happening here? I haven't gotten this error before on Windows, and I can't see any problem with my attempt to encode into UTF-8.
Edit: Notepad encodes in ANSI. Changing the encoding (just copying and pasting the data to a new .txt file) to UTF-8 solved the problem.
Your program doesn't say what codec is used in the file "10-million-combos.txt", so Python is in this case trying to decode it as ASCII. 0xaa isn't an ASCII ordinal so that failed. Identify what codec is used in your file and pass that in the encoding parameter for open.

YouTube API search list_next() throws UnicodeEncodeError

When I feed a non-English string into the YouTube API library's
search, it only works during the initial search. If I call list_next(),
it throws a UnicodeEncodeError.
When I use a simple ascii string, everything works correctly.
Any suggestions about what I should do?
Here's a simplified code of what I'm doing:
# -*- coding: utf-8 -*-
import apiclient.discovery
def test(query):
    """Print the video id of every search hit for ``query``, page by page."""
    youtube = apiclient.discovery.build('youtube', 'v3', developerKey='xxx')
    ys = youtube.search()
    # The query is handed to the client as UTF-8 encoded bytes.
    req = ys.list(
        q=query.encode('utf-8'),
        type='video',
        part='id,snippet',
        maxResults=50
    )
    # Keep going while list_next() returns a truthy request -- presumably it
    # returns None once there are no more pages; confirm in the client docs.
    while (req):
        res = req.execute()
        for i in res['items']:
            print(i['id']['videoId'])
        # This is the call that raises UnicodeEncodeError in the traceback
        # quoted below.
        req = ys.list_next(req, res)
test(u'한글')
test(u'日本語')
test(u'\uD55C\uAE00')
test(u'\u65E5\u672C\u8A9E')
Error message:
Traceback (most recent call last):
File "E:\prj\scripts\yt\search.py", line 316, in _search
req = ys.list_next(req, res)
File "D:\Apps\Python\lib\site-packages\googleapiclient\discovery.py", line 966, in methodNext
parsed[4] = urlencode(newq)
File "D:\Apps\Python\lib\urllib.py", line 1343, in urlencode
v = quote_plus(str(v))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-9: ordinal not in range(128)
Versions:
google-api-python-client (1.6.2)
Python 2.7.13 (Win32)
EDIT: I posted a workaround below.
If anyone else is interested, here's one workaround that works for me:
googleapiclient/discovery.py:
(old) q = parse_qsl(parsed[4])
(new) q = parse_qsl(parsed[4].encode('ascii'))
Explanation
In discovery.py, list_next() parses and unescapes the previous url, then makes a new url from it:
pageToken = previous_response['nextPageToken']
parsed = list(urlparse(request.uri))
q = parse_qsl(parsed[4])
# Find and remove old 'pageToken' value from URI
newq = [(key, value) for (key, value) in q if key != 'pageToken']
newq.append(('pageToken', pageToken))
parsed[4] = urlencode(newq)
uri = urlunparse(parsed)
It seems the problem is when parse_qsl unescapes the unicode parsed[4], it
returns the utf-8 encoded value in a unicode type. urlencode does not like
this:
q = urlparse.parse_qsl(u'q=%ED%95%9C%EA%B8%80')
[(u'q', u'\xed\x95\x9c\xea\xb8\x80')]
urllib.urlencode(q)
UnicodeEncodeError
If parse_qsl is given a plain ascii string, it returns a plain utf-8 encoded string which urlencode likes:
q = urlparse.parse_qsl(u'q=%ED%95%9C%EA%B8%80'.encode('ascii'))
[('q', '\xed\x95\x9c\xea\xb8\x80')]
urllib.urlencode(q)
'q=%ED%95%9C%EA%B8%80'

Python Unicode Error With % Operator

Not sure what I'm doing wrong here, but with this:
# -*- coding: utf-8 -*-
class Foo(object):
    # Maps a currency code to its symbol.  The symbols are plain string
    # literals (no ``u`` prefix); the "£" entry is the non-ASCII one.
    CURRENCY_SYMBOL_MAP = {"CAD":'$', "USD":'$', "GBP" : "£"}
    def bar(self, value, symbol="GBP"):
        """Return ``value`` prefixed with the currency symbol for ``symbol``."""
        # The format string is unicode while the looked-up symbol is not;
        # this is the line the traceback below points at.
        result = u"%s%s" % (self.CURRENCY_SYMBOL_MAP[symbol], value)
        return result
if __name__ == "__main__":
f = Foo()
print f.bar(unicode("19.00"))
I get:
Traceback (most recent call last):
File "test.py", line 11, in <module>
print f.bar(unicode("19.00"))
File "test.py", line 7, in bar
result = u"%s%s" % (self.CURRENCY_SYMBOL_MAP[symbol], value)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 0: ordinal not in range(128)
This is with Python 2.7.6
PS - I get that there are libraries like Babel for formatting things as currency, my question is more with respect to unicode strings and the % operator.
Make sure the strings you're inserting are Unicode too.
CURRENCY_SYMBOL_MAP = {"CAD":u'$', "USD":u'$', "GBP" : u"£"}
You are attempting to insert a non-unicode string into a unicode string. You just have to make the values in CURRENCY_SYMBOL_MAP unicode objects.
# -*- coding: utf-8 -*-
class Foo(object):
    """Formats a value with the symbol of a given currency."""

    # Every symbol is a unicode literal so that it can be interpolated into
    # the unicode format string used by bar().
    CURRENCY_SYMBOL_MAP = {
        "CAD": u'$',
        "USD": u'$',
        "GBP": u"£",
    } # this line is the difference

    def bar(self, value, symbol="GBP"):
        """Return ``value`` prefixed with the currency symbol for ``symbol``.

        Raises:
            KeyError: if ``symbol`` is not in ``CURRENCY_SYMBOL_MAP``.
        """
        prefix = self.CURRENCY_SYMBOL_MAP[symbol]
        return u"%s%s" % (prefix, value)
if __name__ == "__main__":
f = Foo()
print f.bar(unicode("19.00"))

Categories

Resources