Python: 'ascii' codec can't encode character u'\u2026'

I am trying to use the Bing api in python with the following code:
#!/usr/bin/python
from bingapi import bingapi
import re
import json
import urllib
import cgi
import cgitb
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def strip_tags2(data):
    p = re.compile(r'<[^<]*?>')
    q = re.compile(r'[&;!##$%^*()]*')
    data = p.sub('', data)
    return q.sub('', data)

def getUrl(item):
    return item['Url']

def getContent(item):
    return item['Description']

def getTitle(item):
    return item['Title']

def getInfo(qry, siteStr):
    qryStr = qry + "+" + siteStr
    #qryStr = u"%s" % qryStr.encode('UTF-8')
    query = urllib.urlencode({'q' : qryStr})
    url = 'http://api.bing.net/json.aspx?Appid=<myappid>&Version=2.2&Market=en-US&Query=%s&Sources=web&Web.Count=10&JsonType=raw' % (query)
    search_results = urllib.urlopen(url)
    j = json.loads(search_results.read())
    results = j['SearchResponse']['Web']['Results']
    return results

def updateRecent(qry):
    f = open("recent.txt", "r")
    lines = f.readlines()
    f.close()
    lines = lines[1:]
    if len(qry) > 50: #truncate if string too long
        qry = (qry[:50] + '...')
    qry = strip_tags2(qry) #strip out the html if injection try
    lines.append("\n%s" % qry)
    f = open("recent.txt", "w")
    f.writelines(lines)
    f.close()

if __name__ == '__main__':
    form = cgi.FieldStorage()
    qry = form["qry"].value
    qry = r'%s' % qry
    updateRecent(qry)
    siteStr = "(site:answers.yahoo.com OR site:chacha.com OR site:blurtit.com OR site:answers.com OR site:question.com OR site:answerbag.com OR site:stackexchange.com)"
    print "Content-type: text/html"
    print
    header = open("header.html", "r")
    contents = header.readlines()
    header.close()
    for item in contents:
        print item
    print """
<div id="results">
<center><h1>Results:</h1></center>
"""
    for item in getInfo(siteStr, qry):
        print "<h3>%s</h3>" % getTitle(item)
        print "<br />"
        print "%s" % getUrl(item)
        print "<br />"
        print "<p style=\"color:gray\">%s</p>" % getContent(item)
        print "<br />"
    print "</div>"
    footer = open("footer.html", "r")
    contents = footer.readlines()
    footer.close()
    for thing in contents:
        print thing
It prints a few results and then gives me the following error:
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2026' in position 72: ordinal not in range(128)
Can someone explain why this is happening? It clearly has something to do with how the url is getting encoded, but what exactly is wrong? Thanks in advance!

That particular Unicode character is "HORIZONTAL ELLIPSIS". One or more of your getXXXXX() functions are returning Unicode strings, one of which contains a non-ASCII character. I suggest declaring the encoding of your output, for example:
Content-Type: text/html; charset=utf-8
and explicitly encoding your output in that encoding.
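A minimal sketch of what that could look like in the question's main loop, assuming UTF-8 as the output encoding; only the header and the .encode() calls change:

print "Content-type: text/html; charset=utf-8"
print

for item in getInfo(siteStr, qry):
    # encode each possibly-unicode field explicitly instead of letting
    # print fall back to the implicit ASCII codec
    print ("<h3>%s</h3>" % getTitle(item)).encode('utf-8')
    print "<br />"
    print ("%s" % getUrl(item)).encode('utf-8')
    print "<br />"
    print ("<p style=\"color:gray\">%s</p>" % getContent(item)).encode('utf-8')
    print "<br />"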

We need to know the line number where the exception was thrown; it will be in the traceback. Anyway, the problem is that you are reading unicode from the files/URLs and then implicitly converting it to US-ASCII, probably in one of the concatenation operations. You should prefix all constant strings with u to indicate that they are unicode strings, like in
u"\n%s" % qry

Related

Python- encode base64 print new line

I want to try to encode my Python code with base64.
But when I use \n I get an error:
print("hello
IndentationError: unexpected indent
This is the code I used:
import base64

def encode(data):
    try:
        # Standard Base64 Encoding
        encodedBytes = base64.b64encode(data.encode("utf-8"))
        return str(encodedBytes, "utf-8")
    except:
        return ""

def decode(data):
    try:
        message_bytes = base64.b64decode(data)
        return message_bytes.decode('utf-8')
    except:
        return ""

your_code = encode("""
    print("hello \n world")
""")

exec(decode(your_code))
I could use the print() function twice instead of \n, but is there a way to use \n?
I hope you can help me out.
First, you have to remove the indent in the your_code section. Second, you have to replace \n with \\n:
import base64

def encode(data):
    try:
        # Standard Base64 Encoding
        encodedBytes = base64.b64encode(data.encode("utf-8"))
        return str(encodedBytes, "utf-8")
    except:
        return ""

def decode(data):
    try:
        message_bytes = base64.b64decode(data)
        return message_bytes.decode('utf-8')
    except:
        return ""

your_code = encode("""
print("hello \\n world")
""")

exec(decode(your_code))
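An alternative sketch: a raw triple-quoted string leaves \n untouched, so no double escaping is needed (assuming the embedded snippet contains nothing else the outer literal would interpret):

# raw string: the \n stays as the two characters backslash + n
your_code = encode(r"""
print("hello \n world")
""")

exec(decode(your_code))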

Printing unicode string not correct

I use PyPDF2 to read a pdf file but get a unicode string.
I don't know what the encoding is, so I tried dumping the first 8 chars to hex:
0000 005b 00d7 00c1 00e8 00d4 00c5 00d5 [......
What do these bytes mean? Is it utf-16be/le?
I tried the code below, but the output is wrong:
print outStr.encode('utf-16be').decode('utf-16')
嬀휀섀퐀씀픀
If I print it directly, Python reports an error:
UnicodeEncodeError: 'ascii' codec can't encode characters in position 1-7: ordinal not in range(128)
I am following the instructions from How To Extract Text From Pdf In Python.
The code section is below:
import PyPDF2
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

FILTER = ''.join([(len(repr(chr(x))) == 3) and chr(x) or '.' for x in range(256)])

def dumpUnicodeString(src, length=8):
    result = []
    for i in xrange(0, len(src), length):
        unichars = src[i:i+length]
        hex = ' '.join(["%04x" % ord(x) for x in unichars])
        printable = ''.join(["%s" % ((ord(x) <= 127 and FILTER[ord(x)]) or '.') for x in unichars])
        result.append("%04x %-*s %s\n" % (i*2, length*5, hex, printable))
    return ''.join(result)

def extractPdfText(filePath=''):
    fileObject = open(filePath, 'rb')
    pdfFileReader = PyPDF2.PdfFileReader(fileObject)
    totalPageNumber = pdfFileReader.numPages
    currentPageNumber = 0
    text = ''
    while(currentPageNumber < totalPageNumber):
        pdfPage = pdfFileReader.getPage(currentPageNumber)
        text = text + pdfPage.extractText()
        currentPageNumber += 1
    if(text == ''):
        text = textract.process(filePath, method='tesseract', encoding='utf-8')
    return text

if __name__ == '__main__':
    pdfFilePath = 'a.pdf'
    pdfText = extractPdfText(pdfFilePath)
    #pdfText = pdfText[:7]
    print dumpUnicodeString(pdfText)
    print pdfText

Is tokenize.detect_encoding(readline) only in python3?

In python2.7
AttributeError: 'module' object has no attribute 'detect_encoding'
For python2 & 3 compatibility, you can use:
from lib2to3.pgen2 import tokenize
tokenize.detect_encoding(f.readline)[0] # 'utf-8'
This function isn't available in python2.7; you can see it isn't listed at https://docs.python.org/2.7/library/tokenize.html. That said, I don't see any reason why the python3.6 version wouldn't work on python2.7, i.e.:
import re
from codecs import lookup, BOM_UTF8

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]
    return default, [first, second]

coding, lines = detect_encoding(open("out.txt", 'rb').readline)
print(coding, lines)
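One caveat for python2.7: re.ASCII only exists on Python 3, so the two re.compile calls at the top would need a fallback along these lines:

# re.ASCII is Python 3 only; on Python 2, str patterns are ASCII-only by default,
# so compiling with flags=0 there gives the same behaviour for these patterns.
ASCII = getattr(re, 'ASCII', 0)
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', ASCII)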

getting text length from lxml.html nodes and truncate

In parse_html() I am trying to parse out the text of each element and get the len() of that text. I want to generate a script that tracks the length of the text in each element and, when the cumulative text length reaches a set size parameter, truncates the rest of the text in the document. My problem is with child.text/tag1.text/tag2.text/tag3.text: len() doesn't seem to work on these. Is there a way I can pull the numerical length of these text strings?
import sys
import imaplib
import getpass
import email
import email.header
import datetime
import os
import chardet
from bs4 import BeautifulSoup
import re
from lxml import etree, html
from io import StringIO, BytesIO
from lxml.html.clean import clean_html, Cleaner, word_break
from lxml.etree import HTML
from lxml.html import HtmlMixin

EMAIL_ACCOUNT = "sample#gmai.com"
EMAIL_FOLDER = "INBOX"

def process_mailbox(M):
    """
    Do something with email messages in the folder.
    For the sake of this example, print some headers.
    """
    rv, data = M.search(None, "ALL")
    if rv != 'OK':
        print "No messages found!"
        return
    for num in data[0].split():
        rv, data = M.fetch(num, '(RFC822)')
        if rv != 'OK':
            print "ERROR getting message", num
            return
        msg = email.message_from_string(data[0][1])
        decode = email.header.decode_header(msg['Subject'])[0]
        subject = unicode(decode[0])
        body = msg.get_payload(decode=True)
        print 'Message %s: %s' % (num, subject)
        print 'Raw Date:', msg['Date']
        print 'Body:', body
        if msg.is_multipart():
            html = None
            print "Checking for html or text"
            for part in msg.get_payload():
                if part.get_content_charset() is None:
                    charset = chardet.detect(str(part))['encoding']
                else:
                    charset = part.get_content_charset()
                if part.get_content_type() == 'text/plain':
                    text = unicode(part.get_payload(decode=True), str(charset), "ignore").encode('utf8', 'replace')
                    f = open('email.txt', 'w')
                    f.write(text)
                    f.close()
                if part.get_content_type() == 'text/html':
                    html = unicode(part.get_payload(decode=True), str(charset), "ignore").encode('utf8', 'replace')
                    f = open('email.html', 'w')
                    f.write(html)
                    f.close()
                if part.get('Content-Disposition') is None:
                    continue
                filename = part.get_filename()
                if not os.path.isfile(filename):
                    fp = open(filename, 'wb')
                    fp.write(part.get_payload(decode=True))
                    fp.close()
                return 0
            if html is None:
                return text.strip()
            else:
                return html.strip()
        # Now convert to local date-time
        date_tuple = email.utils.parsedate_tz(msg['Date'])
        if date_tuple:
            local_date = datetime.datetime.fromtimestamp(
                email.utils.mktime_tz(date_tuple))
            print "Local Date:", \
                local_date.strftime("%a, %d %b %Y %H:%M:%S")

def parse_html():
    #htmldoc = open('email.html', 'r+')
    #doc = htmldoc.read()
    VALID_TAGS = ['iframe', 'video', 'o>', 'li', 'sub', 'sup', 'source', 'br', 'h3', 'h4', 'h6', 'hr', 'q', 'mark', 'wbr', 'audio', 'strong', 'em', 'p', 'ul', 'li', 'br', 'blockquote', 'pre', 'del', 'h3', 'body', 'header', 'html', 'title', 'div', 'img', 'a']
    parser = etree.HTMLParser()
    tree = etree.parse("email.html", parser)
    #results = etree.tostring(tree.getroot(), pretty_print=True, method="html")
    page = html.tostring(tree)
    cleaner = Cleaner(page_structure=False, add_nofollow=True, style=True, links=True, safe_attrs_only=True)
    clean_page = cleaner.clean_html(page)
    root = tree.getroot()
    child = root[0]
    print len(root)
    children = list(root)
    for child in root:
        print child.tag
        print child.attrib
        print child.text
        for tag1 in child:
            print tag1.tag
            print tag1.attrib
            print tag1.text
            for tag2 in tag1:
                print tag2.tag
                print tag2.attrib
                print tag2.text
                for tag3 in tag2:
                    print tag3.tag
                    print tag3.attrib
                    print tag3.text

M = imaplib.IMAP4_SSL('imap.gmail.com')

try:
    rv, data = M.login(EMAIL_ACCOUNT, getpass.getpass())
except imaplib.IMAP4.error:
    print "LOGIN FAILED!!! "
    sys.exit(1)

print rv, data

rv, mailboxes = M.list()
if rv == 'OK':
    print "Mailboxes:"
    print mailboxes

rv, data = M.select(EMAIL_FOLDER)
if rv == 'OK':
    print "Processing mailbox...\n"
    process_mailbox(M)
    parse_html()
    M.close()
else:
    print "ERROR: Unable to open mailbox ", rv

M.logout()
This is the error I get when I try to use len():
TypeError: object of type 'NoneType' has no len()
Also, if you know anything about how to do that truncating with lxml.html, I'd appreciate being pointed in the right direction.
Thanks.
There may be a more efficient way to do this, but I was able to get functioning results. I had to turn each child into a string and then strip the html tags from each child string. I used the functions stringify_children() and strip_tags(), found at these links: Get all text inside a tag in lxml and Strip HTML from strings in Python.
def stringify_children(node):
    from lxml.etree import tostring
    from itertools import chain
    parts = ([node.text] +
             list(chain(*([c.text, tostring(c), c.tail] for c in node.getchildren()))) +
             [node.tail])
    # filter removes possible Nones in texts and tails
    return ''.join(filter(None, parts))

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

tree = etree.fromstring(docstring)
walkAll = tree.iterchildren()
for elt in walkAll:
    child = stringify_children(elt)
    childtext = strip_tags(child)
    print len(childtext)
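For the truncation part the question asks about, a rough sketch building on the same helpers; size_limit is a hypothetical cap on the total number of characters:

size_limit = 500  # hypothetical size parameter
total = 0
for elt in tree.iterchildren():
    childtext = strip_tags(stringify_children(elt))
    if total + len(childtext) >= size_limit:
        # keep only as much of this element's text as still fits, then stop
        print childtext[:size_limit - total]
        break
    print childtext
    total += len(childtext)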

Unicode Decode Error in Python with files

So I'm having this trouble with the decode. I found in other threads how to do it for simple strings, with u'string'.encode, but I can't find a way to make it work with files.
Any help would be appreciated!
Here's the code.
text = file.read()
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
file.seek(0) # rewind
file.write(text.encode('utf-8'))
And here's the whole code, should it help:
#!/usr/bin/env python
# coding: utf-8
"""
Script to help translate some code's methods from
Portuguese to English.
"""
from multiprocessing import Pool
from mock import MagicMock
from goslate import Goslate
import fnmatch
import logging
import os
import re
import urllib2

_MAX_PEERS = 1

try:
    os.remove('traducoes.log')
except OSError:
    pass

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('traducoes.log')
logger.addHandler(handler)

def fileWalker(ext, dirname, names):
    """
    Find the files with the correct extension
    """
    pat = "*" + ext[0]
    for f in names:
        if fnmatch.fnmatch(f, pat):
            ext[1].append(os.path.join(dirname, f))

def encontre_text(file):
    """
    find in the string the words which have '_' in them
    """
    text = file.read().decode('utf-8')
    return re.findall(r"\w+(?<=_)\w+", text)
    #return re.findall(r"\"\w+\"", text)

def traduza_palavra(txt):
    """
    Translate the word/phrase to english
    """
    try:
        # try to connect to google
        response = urllib2.urlopen('http://google.com', timeout=2)
        pass
    except urllib2.URLError as err:
        print "No network connection "
        exit(-1)
    if txt[0] != '_':
        txt = txt.replace('_', ' ')
    txt = txt.replace('media'.decode('utf-8'), 'média'.decode('utf-8'))
    gs = Goslate()
    #txt = gs.translate(txt, 'en', gs.detect(txt))
    txt = gs.translate(txt, 'en', 'pt-br')  # force Brazilian Portuguese as the source language
    txt = txt.replace(' en ', ' br ')
    return txt.replace(' ', '_')  # .lower()

def subistitua(file, txt, novo_txt):
    """
    should rewrite the file with the new text in the future
    """
    text = file.read()
    text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
    file.seek(0)  # rewind
    file.write(text.encode('utf-8'))

def magica(File):
    """
    Thread Pool. Every single thread should play around here with
    one element from the list of files
    """
    global _DONE
    if _MAX_PEERS == 1:  # not viable with multiple threads
        logger.info('\n---- File %s' % File)
        with open(File, "r+") as file:
            list_txt = encontre_text(file)
            for txt in list_txt:
                novo_txt = traduza_palavra(txt)
                if txt != novo_txt:
                    logger.info('%s -> %s [%s]' % (txt, novo_txt, File))
                    subistitua(file, txt, novo_txt)
            file.close()
    print File.ljust(70) + '[OK]'.rjust(5)

if __name__ == '__main__':
    try:
        response = urllib2.urlopen('http://www.google.com.br', timeout=1)
    except urllib2.URLError as err:
        print "No network connection "
        exit(-1)

    root = './app'
    ex = ".py"
    files = []
    os.path.walk(root, fileWalker, [ex, files])
    print '%d files found to be translated' % len(files)
    try:
        if _MAX_PEERS > 1:
            _pool = Pool(processes=_MAX_PEERS)
            result = _pool.map_async(magica, files)
            result.wait()
        else:
            result = MagicMock()
            result.successful.return_value = False
            for f in files:
                pass
                magica(f)
            result.successful.return_value = True
    except AssertionError, e:
        print e
    else:
        pass
    finally:
        if result.successful():
            print 'Translated all files'
        else:
            print 'Some files were not translated'
Thank you all for the help!
In Python 2, reading from files produces regular (byte) string objects, not unicode objects. There is no need to call .encode() on these; in fact, that'll only trigger an automatic decode to Unicode first, which can fail.
Rule of thumb: use a unicode sandwich. Whenever you read data, you decode to unicode at that stage. Use unicode values throughout your code. Whenever you write data, encode at that point. You can use io.open() to open file objects that encode and decode automatically for you.
That also means you can use unicode literals everywhere; for your regular expressions, for your string literals. So use:
def encontre_text(file):
    text = file.read()  # assume `io.open()` was used
    return re.findall(ur"\w+(?<=_)\w+", text)  # use a unicode pattern
and
def subistitua(file, txt, novo_txt):
    text = file.read()  # assume `io.open()` was used
    text = text.replace(txt, novo_txt)
    file.seek(0)  # rewind
    file.write(text)
as all string values in the program are already unicode, and
txt = txt.replace(u'media', u'média')
as u'..' unicode string literals don't need decoding anymore.
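A minimal sketch of that io.open() approach, assuming the files are UTF-8 encoded; the path and the two replacement values below are placeholders:

# -*- coding: utf-8 -*-
import io

path = 'example.py'                      # placeholder file name
txt, novo_txt = u'media', u'média'       # placeholder replacement pair

with io.open(path, 'r+', encoding='utf-8') as f:
    text = f.read()                      # decoded to unicode on read
    text = text.replace(txt, novo_txt)   # unicode throughout
    f.seek(0)                            # rewind
    f.truncate()
    f.write(text)                        # encoded back to UTF-8 on write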
