HTMLParser or urllib2 unicode issue

HTMLParser or urllib2 unicode issue - python

I am trying to use HTMLParser and urllib2 to get to an image file
content = urllib2.urlopen( imgurl.encode('utf-8') ).read()
try:
p = MyHTMLParser( )
p.feed( content )
p.download_file( )
p.close()
except Exception,e:
print e
MyHTMLParser:
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.url=""
self.outfile = "some.png"
def download_file(self):
urllib.urlretrieve( self.url, self.outfile )
def handle_starttag(self, tag, attrs):
if tag == "a":
# after some manipulation here, self.url will have a img url
self.url = "http://somewhere.com/Fondue%C3%A0.png"
When I run the script, I get:
Traceback (most recent call last):
File "test.py", line 59, in <module>
p.feed( data )
File "/usr/lib/python2.7/HTMLParser.py", line 114, in feed
self.goahead(0)
File "/usr/lib/python2.7/HTMLParser.py", line 158, in goahead
k = self.parse_starttag(i)
File "/usr/lib/python2.7/HTMLParser.py", line 305, in parse_starttag
attrvalue = self.unescape(attrvalue)
File "/usr/lib/python2.7/HTMLParser.py", line 472, in unescape
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
File "/usr/lib/python2.7/re.py", line 151, in sub
return _compile(pattern, flags).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 56: ordinal not in range(128)
Following the suggestions I found on the forum, I tried the .encode('utf-8') method, but it still gives me an error. How can I fix this? Thanks.

Replace
content = urllib2.urlopen( url.encode('utf-8') ).read()
with
content = urllib2.urlopen(url).read().decode('utf-8')
This decodes the response into unicode before it is fed to the parser.

Related

ignore encoding error when parsing pdf with pdfminer

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
fn='test.pdf'
with open(fn, mode='rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog['AcroForm'])['Fields']
item = {}
for i in fields:
field = resolve1(i)
name, value = field.get('T'), field.get('V')
item[name]=value
Hello, I need help with this code, as it is giving me a Unicode error on some characters:
Traceback (most recent call last):
File "<stdin>", line 7, in <module>
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 80, in resolve1
x = x.resolve(default=default)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 67, in resolve
return self.doc.getobj(self.objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 673, in getobj
stream = stream_value(self.getobj(strmid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 676, in getobj
obj = self._getobj_parse(index, objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 648, in _getobj_parse
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py", line 85, in __repr__
return self.name.decode('ascii')
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
Is there anything I can add so that it "ignores" the characters that it is not able to decode, or at least returns the name with the value left blank in name, value = field.get('T'), field.get('V')?
Any help is appreciated.

Here is one way you can fix it. Open the library file in an editor:
nano "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py"
Then, at line 85, change __repr__ to:
def __repr__(self):
return self.name.decode('ascii', 'ignore') # this fixes it
Editing installed library sources is generally not recommended, so you should also report this as an issue on GitHub.

UnicodeDecodeError: 'utf8' codec can't decode byte 0x89 in position 51: invalid start byte

An error occurred when compiling "QRCODE.py"
from pyshorteners import Shortener
class Shortening():
def __init__(self):
self.shortener=Shortener('Tinyurl')
fo = open('/home/jayu/Desktop/qr.png','r+')
apiKey = fo.read()
self.shortener = Shortener('Google',api_key = apiKey)
def shortenURL(self):
self.url = raw_input("Enter The Url to shortener : ");
shortener = self.shortener.short(self.url)
print ("the short url : " +shortenURL)
def decodeURL(self):
self.url = raw_input("Enter The Url to expand: ");
expandURL = self.shortener.expand(self.url)
print ("the short url : " +expandURL);
def generateQRcode(self):
self.url = raw_input("Enter the URL to get QR code :")
self.shortener.short(self.url)
print (self.shortener.qrcode(150,150))
app = Shortening()
option = int (input("Enter ur choice : "))
if option==1:
app.shortenURL()
elif option==2:
decodeURL()
elif option==3:
app.generateQRcode()
else:
print ("wrong ")
Traceback (most recent call last):
jayu@jayu:~/Desktop$ python QRCODE.py
Enter ur choice : 3
Enter the URL to get QR code :http://www.google.com
Traceback (most recent call last):
File "QRCODE.py", line 29, in <module>
app.generateQRcode()
File "QRCODE.py", line 19, in generateQRcode
self.shortener.short(self.url)
File "/home/jayu/.local/lib/python2.7/site-packages/pyshorteners/shorteners/__init__.py", line 115, in short
self.shorten = self._class(**self.kwargs).short(url)
File "/home/jayu/.local/lib/python2.7/site-packages/pyshorteners/shorteners/googl.py", line 25, in short
response = self._post(url, data=params, headers=headers)
File "/home/jayu/.local/lib/python2.7/site-packages/pyshorteners/shorteners/base.py", line 32, in _post
timeout=self.kwargs['timeout'])
File "/home/jayu/.local/lib/python2.7/site-packages/requests/api.py", line 112, in post
return request('post', url, data=data, json=json, **kwargs)
File "/home/jayu/.local/lib/python2.7/site-packages/requests/api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "/home/jayu/.local/lib/python2.7/site-packages/requests/sessions.py", line 498, in request
prep = self.prepare_request(req)
File "/home/jayu/.local/lib/python2.7/site-packages/requests/sessions.py", line 441, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/home/jayu/.local/lib/python2.7/site-packages/requests/models.py", line 309, in prepare
self.prepare_url(url, params)
File "/home/jayu/.local/lib/python2.7/site-packages/requests/models.py", line 359, in prepare_url
url = url.decode('utf8')
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0x89 in position 51: invalid start byte
What is the cause of the error? The Python version is 2.7.15rc1.
Each time I run python QRCODE.py, I get the same byte position in the traceback.
Can anyone correct me?

If the problem comes from the open(...) call, you need to set the encoding in that call:
fo = open(filename, something_else, encoding = 'UTF-8')
However, this works only in Python 3; in Python 2 you need to use io.open:
fo = io.open(filename, something_else, encoding = 'UTF-8')
I don't remember the full syntax, so look it up, but I have already answered a similar question here: unable to decode this string using python

Scrapy response.replace encoding error

I am trying to replace the response body of a search result block of a search result page of google using response.replace() and I face some encoding issues.
scrapy shell "http://www.google.de/search?q=Zuckerccc"
>>> srb = hxs.select("//li[@class='g']").extract()
>>> body = '<html><body>' + srb[0] + '</body></html>' # get only 1st search result block
>>> b = response.replace(body = body)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 54, in replace
return Response.replace(self, *args, **kwargs)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/__init__.py", line 77, in replace
return cls(*args, **kwargs)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 31, in __init__
super(TextResponse, self).__init__(*args, **kwargs)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/__init__.py", line 19, in __init__
self._set_body(body)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 48, in _set_body
self._body = body.encode(self._encoding)
File "../local_1/Linux-2.6c2.5-x86_64/Python/Python-147.0-0/lib/python2.6/encodings/cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
UnicodeEncodeError: 'charmap' codec can't encode character u'\u0131' in position 529: character maps to <undefined>
I tried to create my own response as well,
>>> x = HtmlResponse("http://www.google.de/search?q=Zuckerccc", body = body, encoding = response.encoding)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 31, in __init__
super(TextResponse, self).__init__(*args, **kwargs)
self._set_body(body)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 48, in _set_body
self._body = body.encode(self._encoding)
File "../local_1/Linux-2.6c2.5-x86_64/Python/Python-147.0-0/lib/python2.6/encodings/cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
UnicodeEncodeError: 'charmap' codec can't encode character u'\u0131' in position 529: character maps to <undefined>
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/__init__.py", line 19, in __init__
Also, when I use _body_declared_encoding() for encoding in replace() function, it works.
replace(body = body, encoding = response._body_declared_encoding())
I don't understand why response._body_declared_encoding() and response.encoding are different. Can anybody please shed some light on this?
So, what would be a good way to fix this?

I successfully replaced the response body with these lines of code:
scrapy shell "http://www.google.de/search?q=Zuckerccc"
>>> google_result = response.xpath('//li[@class="g"]').extract()[0]
>>> body = '<html><body>' + google_result + '</body></html>'
>>> b = response.replace(body = body)

I checked the source code of scrapy.http.response.text: when we use TextResponse, we need to set self._encoding first. So we can do it like this:
>>>response._encoding='utf8'
>>>response._set_body("aaaaaa")
>>>response.body
>>>'aaaaaa'

Trouble using gdata and Unicode Cyrillic in Python

I have this code
# -*- coding: utf8 -*-
__author__ = 'user'
import gdata.youtube.service
yt_service = gdata.youtube.service.YouTubeService()
query = gdata.youtube.service.YouTubeVideoQuery()
query.vq = u"не"
feed = yt_service.YouTubeQuery(query)
for yt_item in feed.entry:
print yt_item.GetSwfUrl()
And I am getting this error:
Traceback (most recent call last):
File "cyr_search.py", line 7, in
feed = yt_service.YouTubeQuery(query)
File "/Users/user/Documents/GrabaHeroku/graba_h_ve/lib/python2.7/site-packages/gdata/youtube/service.py", line 1346, in YouTubeQuery
result = self.Query(query.ToUri())
File "/Users/user/Documents/GrabaHeroku/graba_h_ve/lib/python2.7/site-packages/gdata/service.py", line 1715, in ToUri
return atom.service.BuildUri(q_feed, self)
File "/Users/user/Documents/GrabaHeroku/graba_h_ve/lib/python2.7/site-packages/atom/service.py", line 584, in BuildUri
parameter_list = DictionaryToParamList(url_params, escape_params)
File "/Users/user/Documents/GrabaHeroku/graba_h_ve/lib/python2.7/site-packages/atom/service.py", line 551, in DictionaryToParamList
for param, value in (url_parameters or {}).items()]
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 1275, in quote_plus
return quote(s, safe)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 1268, in quote
return ''.join(map(quoter, s))
KeyError: u'\u043d'
How do I search for non-ASCII text? Do I need to URL-encode the query? I thought the library would do that on its own.

Change to:
query.vq = u"не".encode('utf8')
The string needs to be encoded before being sent.

How to fix encoding in Python Mechanize?

here is the sample code:
from mechanize import Browser
br = Browser()
page = br.open('http://hunters.tclans.ru/news.php?readmore=2')
br.form = br.forms().next()
print br.form
The problem is that the server returns an incorrect encoding name (windows-cp1251). How can I manually set the encoding of the current page in mechanize?
Error:
Traceback (most recent call last):
File "/tmp/stackoverflow.py", line 5, in <module>
br.form = br.forms().next()
File "/usr/local/lib/python2.6/dist-packages/mechanize/_mechanize.py", line 426, in forms
return self._factory.forms()
File "/usr/local/lib/python2.6/dist-packages/mechanize/_html.py", line 559, in forms
self._forms_factory.forms())
File "/usr/local/lib/python2.6/dist-packages/mechanize/_html.py", line 225, in forms
_urlunparse=_rfc3986.urlunsplit,
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 967, in ParseResponseEx
_urlunparse=_urlunparse,
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 1104, in _ParseFileEx
fp.feed(data)
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 870, in feed
sgmllib.SGMLParser.feed(self, data)
File "/usr/lib/python2.6/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.6/sgmllib.py", line 193, in goahead
self.handle_entityref(name)
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 751, in handle_entityref
'&%s;' % name, self._entitydefs, self._encoding))
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 238, in unescape
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
File "/usr/lib/python2.6/re.py", line 151, in sub
return _compile(pattern, 0).sub(repl, string, count)
File "/usr/local/lib/python2.6/dist-packages/ClientForm.py", line 230, in replace_entities
repl = repl.encode(encoding)
LookupError: unknown encoding: windows-cp1251

I don't know about Mechanize, but you could hack codecs to accept wrong encoding names that have both ‘windows’ and ‘cp’:
>>> def fixcp(name):
... if name.lower().startswith('windows-cp'):
... try:
... return codecs.lookup(name[:8]+name[10:])
... except LookupError:
... pass
... return None
...
>>> codecs.register(fixcp)
>>> '\xcd\xe0\xef\xee\xec\xe8\xed\xe0\xe5\xec'.decode('windows-cp1251')
u'Напоминаем'

Fixed by setting
br._factory.encoding = enc
br._factory._forms_factory.encoding = enc
br._factory._links_factory._encoding = enc
(note the underscores) after br.open()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

HTMLParser or urllib2 unicode issue - python

Replace content = urllib2.urlopen( url.encode('utf-8') ).read() with content = urllib2.urlopen(url).read().decode('utf-8') to decode the response into unicode.

Related

ignore encoding error when parsing pdf with pdfminer

UnicodeDecodeError: 'utf8' codec can't decode byte 0x89 in position 51: invalid start byte

Scrapy response.replace encoding error

Trouble using gdata and Unicode Cyrillic in Python

How to fix encoding in Python Mechanize?

Categories

Resources