python exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xa7 in - python

I am using Scrapy with Python and I have this code in a Python item pipeline:
def process_item(self, item, spider):
import pdb; pdb.set_trace()
ID = str(uuid.uuid5(uuid.NAMESPACE_DNS, item['link']))
I got this error :
Traceback (most recent call last):
File "C:\Python27\lib\site-packages\scrapy-0.20.2-py2.7.egg\scrapy\mid
dleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "C:\Python27\lib\site-packages\scrapy-0.20.2-py2.7.egg\scrapy\uti
ls\defer.py", line 65, in process_chain
d.callback(input)
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 3
82, in callback
self._startRunCallbacks(result)
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 4
90, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 5
77, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "General_Spider_code_version_2\pipelines.py", line 7, in process_
item
ID = str(uuid.uuid5(uuid.NAMESPACE_DNS, item['link']))
File "C:\Python27\lib\uuid.py", line 549, in uuid5
hash = sha1(namespace.bytes + name).digest()
exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xa7 in p
osition 1: ordinal not in range(128)
I tried to debug the item['link']
and this is the result
-> ID = str(uuid.uuid5(uuid.NAMESPACE_DNS, item['link']))
(Pdb) item['link']
u'http://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/2014/4/6
/palm-jumeirah-abu-keibal-3-br-maid-partial-2/?back=ZHViYWkuZHViaXp6bGUuY29tL3By
b3BlcnR5LWZvci1yZW50L3Jlc2lkZW50aWFsL2FwYXJ0bWVudGZsYXQv&pos=1'
(Pdb)
as you see the item['link'] is unicode
Edit1
when I change the item['link'] to any other attribute like item['date'] the code works perfectly

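uuid5 concatenates namespace.bytes (a byte string) with the name; when the name is unicode, Python 2 implicitly decodes the bytes as ASCII, which fails on any byte above 0x7f. Encode the unicode string into a byte string with .encode('utf-8') and it should work: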
str(uuid.uuid5(uuid.NAMESPACE_DNS, item['link'].encode('utf-8')))

Related

UnicodeDecodeError: 'utf8' codec can't decode byte 0xbb in position 5: invalid start byte

I am using Python 2.7 and had this error that I can't fix. I am trying to download HTMLs from a page and the next button looks like this : Next »
Traceback (most recent call last):
File "C:\Users\Said&Nour\Desktop\Documents\PythonFiles\LebanonParsing\Al Rifai\alrifai.py", line 109, in <module>
if PageP.find('a',attrs={'title':'Next »'}) is None:
File "C:\Python27\lib\site-packages\bs4\element.py", line 1300, in find
l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
File "C:\Python27\lib\site-packages\bs4\element.py", line 1321, in find_all
return self._find_all(name, attrs, text, limit, generator, **kwargs)
File "C:\Python27\lib\site-packages\bs4\element.py", line 602, in _find_all
strainer = SoupStrainer(name, attrs, text, **kwargs)
File "C:\Python27\lib\site-packages\bs4\element.py", line 1420, in __init__
normalized_attrs[key] = self._normalize_search_value(value)
File "C:\Python27\lib\site-packages\bs4\element.py", line 1434, in _normalize_search_value
return value.decode("utf8")
File "C:\Python27\lib\encodings\utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xbb in position 5: invalid start byte

How to avoid scrapy UnicodeEncodeError

I have the following code in my parse_item callback:
sel = Selector(response)
item['name'] = sel.xpath('//div[@class="productDescriptionBlock"]/h2/text()').extract()[0]
return item
But I get UnicodeEncodeError:
exceptions.UnicodeEncodeError: 'charmap' codec can't encode character u'\uff01' in position 271761: character maps to <undefined>
I also tried adding .encode('utf-8') but still get the same error.
Traceback (most recent call last):
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/twisted/internet/task.py", line 638, in _tick
taskObj._oneWorkUnit()
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/linkextractors/sgml.py", line 124, in extract_links
).encode(response.encoding)
File "/home/scraper/.fakeroot/lib/python2.7/encodings/cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
exceptions.UnicodeEncodeError: 'charmap' codec can't encode character u'\x99' in position 349751: character maps to <undefined>
I've seen this before. If I'm not wrong, you are using the restrict_xpaths parameter in your rule's link extractor.
Possible solutions are:
Avoid using restrict_xpaths for that particular site. This happens because the page content contains characters not defined in the declared encoding.
Identify the invalid characters and replace them before the rule acts on it. This can be tricky, though.
Use the middleware in this answer to re-encode the response into its declared encoding: UnicodeEncodeError after setting restrict_xpaths settings

HTMLParser or urllib2 unicode issue

I am trying to use HTMLParser and urllib2 to get to an image file
content = urllib2.urlopen( imgurl.encode('utf-8') ).read()
try:
p = MyHTMLParser( )
p.feed( content )
p.download_file( )
p.close()
except Exception,e:
print e
MyHTMLParser:
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.url=""
self.outfile = "some.png"
def download_file(self):
urllib.urlretrieve( self.url, self.outfile )
def handle_starttag(self, tag, attrs):
if tag == "a":
# after some manipulation here, self.url will have a img url
self.url = "http://somewhere.com/Fondue%C3%A0.png"
when i run the script, i get
Traceback (most recent call last):
File "test.py", line 59, in <module>
p.feed( data )
File "/usr/lib/python2.7/HTMLParser.py", line 114, in feed
self.goahead(0)
File "/usr/lib/python2.7/HTMLParser.py", line 158, in goahead
k = self.parse_starttag(i)
File "/usr/lib/python2.7/HTMLParser.py", line 305, in parse_starttag
attrvalue = self.unescape(attrvalue)
File "/usr/lib/python2.7/HTMLParser.py", line 472, in unescape
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
File "/usr/lib/python2.7/re.py", line 151, in sub
return _compile(pattern, flags).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 56: ordinal not in range(128)
Using the suggestions I found in the forum, I tried the .encode('utf-8') method, but it still gives me an error. How can I fix this? Thanks.
Replace
content = urllib2.urlopen( url.encode('utf-8') ).read()
with
content = urllib2.urlopen(url).read().decode('utf-8')
This decodes the response into unicode before it is fed to the parser.

UnicodeEncodeError after setting restrict_xpaths settings

I'm new to Python and Scrapy. After setting restrict_xpaths to "//table[@class="lista"]" I received the following traceback. What's strange is that with other XPath rules the crawler works properly.
Traceback (most recent call last):
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 602, in _tick
taskObj._oneWorkUnit()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 479, in _oneWorkUnit
result = self._iterator.next()
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield it.next()
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/sgml.py", line 124, in extract_links
).encode(response.encoding)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/iso8859_2.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
exceptions.UnicodeEncodeError: 'charmap' codec can't encode character u'\xbb' in position 686: character maps to <undefined>
Here is MySpider Class.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ds_crawl.items import DsCrawlItem
class MySpider(CrawlSpider):
name = 'inside'
allowed_domains = ['wroclaw.dlastudenta.pl']
start_urls = ['http://wroclaw.dlastudenta.pl/stancje/']
rules = (
Rule(SgmlLinkExtractor(allow=('show_stancja'), restrict_xpaths=('//table[@class="lista"]')), callback='parse_item', follow= True),)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//p[@class='bbtext intextAd']")
for titles in titles:
item = DsCrawlItem()
item['content'] = titles.select("text()").extract()
print item
Any explanation of this error and help will be appreciated. Thank you.
That's a bug caused by the web page using the &raquo; entity, which lxml translates to the unicode character \xbb. When you use the restrict_xpaths argument, the link extractor encodes the content back to the original encoding, iso8859-2, which fails because \xbb is not a valid character in that encoding.
This simple line reproduces the exception:
>>> u'\xbb'.encode('iso8859-2')
...
UnicodeEncodeError: 'charmap' codec can't encode character u'\xbb' in position 0: character maps to <undefined>
A workaround is to force UTF-8 for all responses. This can be done with a simple downloader middleware:
# file: myproject/middlewares.py
class ForceUTF8Response(object):
    """A downloader middleware to force UTF-8 encoding for all responses.

    Re-encoding each textual response as UTF-8 means that later stages which
    encode content back to ``response.encoding`` (for example a link
    extractor used with ``restrict_xpaths``) never meet a character that the
    page's declared charset cannot represent.
    """

    encoding = 'utf-8'

    def process_response(self, request, response, spider):
        """Return ``response`` with its body re-encoded as ``self.encoding``.

        A response that exposes no decoded text (e.g. a non-text download)
        is returned unchanged instead of raising ``AttributeError``.
        """
        # Scrapy >= 1.0 exposes the decoded body as ``response.text``.
        text = getattr(response, 'text', None)
        if text is None:
            # Scrapy < 1.0 only offers ``body_as_unicode()``.
            legacy_decode = getattr(response, 'body_as_unicode', None)
            if legacy_decode is None:
                # Nothing to decode: leave the response untouched.
                return response
            text = legacy_decode()
        new_body = text.encode(self.encoding)
        return response.replace(body=new_body, encoding=self.encoding)
In your settings:
DOWNLOADER_MIDDLEWARES = {
'myproject.middlewares.ForceUTF8Response': 100,
}

DjangoUnicodeDecodeError: 'utf-8' codec can't decode bytes in position 4-5: invalid data

In Jython and Django I'm trying to call a view function located in a different view "normalize_name" with this param name="##~½½¬}0.jpg"
my call is:
documentName = str(normalize_name(name))
What I cannot understand is why it works if I have the function in the same view where it is called.
the function called is:
def normalize_name(value):
ext = value.split('.')[-1]
value = join(value.split('.')[0:-1], '.')
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
value = unicode(re.sub('[^\w\s-]', '', value).strip().lower())
value = unicode(re.sub('[-\s]+', '-', value))
return value + '.' + ext
The error I get:
Traceback (most recent call last):
File "/usr/share/jython-env-django/Lib/wsgiref/handlers.py", line 92, in run
self.result = application(self.environ, self.start_response)
File "/usr/share/jython-env-django/Lib/site-packages/django/contrib/staticfiles/handlers.py", line 67, in __call__
return self.application(environ, start_response)
File "/usr/share/jython-env-django/Lib/site-packages/django/core/handlers/wsgi.py", line 241, in __call__
response = self.get_response(request)
File "/usr/share/jython-env-django/Lib/site-packages/django/core/handlers/base.py", line 179, in get_response
response = self.handle_uncaught_exception(request, resolver, sys.exc_info())
File "/usr/share/jython-env-django/Lib/site-packages/django/core/handlers/base.py", line 221, in handle_uncaught_exception
return debug.technical_500_response(request, *exc_info)
File "/usr/share/jython-env-django/Lib/site-packages/django/views/debug.py", line 63, in technical_500_response
text = reporter.get_traceback_text()
File "/usr/share/jython-env-django/Lib/site-packages/django/views/debug.py", line 281, in get_traceback_text
c = Context(self.get_traceback_data(), autoescape=False)
File "/usr/share/jython-env-django/Lib/site-packages/django/views/debug.py", line 236, in get_traceback_data
frame['vars'] = [(k, force_escape(pprint(v))) for k, v in frame['vars']]
File "/usr/share/jython-env-django/Lib/site-packages/django/template/defaultfilters.py", line 39, in _dec
args[0] = force_unicode(args[0])
File "/usr/share/jython-env-django/Lib/site-packages/django/utils/encoding.py", line 93, in force_unicode
raise DjangoUnicodeDecodeError(s, *e.args)
DjangoUnicodeDecodeError: 'utf-8' codec can't decode bytes in position 4-5: invalid data. You passed in "'##~\xc2\\xbd\xc2\\xbd\xc2\\xac}0.jpg'" (<type 'str'>)
Thx in advance

Categories

Resources