Urllib Unicode Error, no unicode involved - python

EDIT: I've majorly edited the content of this post since the original to specify my problem:
I am writing a program to download webcomics, and I'm getting this weird error when downloading a page of the comic. The code I am running essentially boils down to the following line followed by the error. I do not know what is causing this error, and it is confusing me greatly.
>>> urllib.request.urlopen("http://abominable.cc/post/47699281401")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 470, in open
response = meth(req, response)
File "/usr/lib/python3.4/urllib/request.py", line 580, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.4/urllib/request.py", line 502, in error
result = self._call_chain(*args)
File "/usr/lib/python3.4/urllib/request.py", line 442, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 685, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.4/urllib/request.py", line 464, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 482, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 442, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1211, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.4/urllib/request.py", line 1183, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1137, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1172, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib/python3.4/http/client.py", line 1014, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 37-38: ordinal not in range(128)
The entirety of my program can be found here: https://github.com/nstephenh/pycomic

I was having the same problem. The root cause is that the remote server isn't playing by the rules. HTTP headers are supposed to be US-ASCII only, but apparently the leading HTTP web servers (apache2, nginx) don't care and send raw UTF-8 encoded strings.
However, in http.client the parse_headers() function decodes the headers as ISO-8859-1, and the default HTTPRedirectHandler in urllib doesn't bother to quote the Location or URI header, resulting in the aforementioned error.
I was able to work around both things by overriding the default HTTPRedirectHandler and adding three lines to counter the latin-1 decoding and quote the path:
import urllib.request
from urllib.error import HTTPError
from urllib.parse import (
urlparse, quote, urljoin, urlunparse)
class UniRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that tolerates non-ASCII bytes in redirect targets.

    HTTP headers should be US-ASCII, but some servers send raw UTF-8 in
    the ``Location``/``URI`` header.  http.client decodes headers as
    ISO-8859-1, so the redirect target must be turned back into bytes
    and percent-encoded before it is requested; otherwise the request
    line cannot be encoded as ASCII and a UnicodeEncodeError is raised.
    """

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a redirect, percent-encoding the target URL first.

        Returns the response of opening the redirect target, or None
        when there is no Location/URI header or redirect_request()
        declines to follow.  Raises HTTPError for a disallowed scheme
        or when a redirect loop is detected.
        """
        import string  # local import: only needed for the safe-character set

        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        # Only an absolute URL with an empty path gets "/"; a relative
        # redirect such as "?page=2" must keep the current path.
        if not urlparts.path and urlparts.netloc:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlunparse(urlparts)

        # Headers should only contain US-ASCII chars, but some servers do
        # send raw bytes.  http.client decoded them as ISO-8859-1, so
        # encode with the same charset to recover the original bytes and
        # percent-encode everything that is not ASCII.  All punctuation
        # (including "%") is kept safe so that existing escapes and URL
        # delimiters are preserved instead of being double-encoded.
        newurl = quote(newurl, encoding="iso-8859-1", safe=string.punctuation)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                    len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # 308 is included so that Python versions whose base handler follows
    # permanent redirects (3.11+) also go through the quoting above.
    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_308 = http_error_302
[...]
# Change default Redirect Handler in urllib, should be done once at the beginning of the program
opener = urllib.request.build_opener(UniRedirectHandler())
urllib.request.install_opener(opener)
This is python3 code but should be easily adapted for python2 if need be.

Related

how do I pass the √ untouched

is it possible to pass the √ through this untouched or am i asking too much
import urllib.request
path = 'html'
links = 'links'
with open(links, 'r', encoding='UTF-8') as links:
for link in links: #for each link in the file
print(link)
with urllib.request.urlopen(link) as linker: #get the html
print(linker)
with open(path, 'ab') as f: #append the html to html
f.write(linker.read())
links
https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A
output
File "PYdown.py", line 7, in <module>
with urllib.request.urlopen(link) as linker:
File "/usr/lib64/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib64/python3.6/urllib/request.py", line 526, in open
response = self._open(req, data)
File "/usr/lib64/python3.6/urllib/request.py", line 544, in _open
'_open', req)
File "/usr/lib64/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/usr/lib64/python3.6/urllib/request.py", line 1392, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/usr/lib64/python3.6/urllib/request.py", line 1349, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/usr/lib64/python3.6/http/client.py", line 1254, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib64/python3.6/http/client.py", line 1265, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib64/python3.6/http/client.py", line 1132, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\u221a' in position 29: ordinal not in range(128)
You need to quote Unicode chars in URL. You have file which contains list of urls you need to open, so you need to split each url (using urllib.parse.urlsplit()), quote (with urllib.parse.quote()) host and every part of path (to split paths you can use pathlib.PurePosixPath.parts) and then form URL back (using urllib.parse.urlunsplit()).
from pathlib import PurePosixPath
from urllib.parse import urlsplit, urlunsplit, quote, urlencode, parse_qsl
def normalize_url(url):
    """Return *url* with non-ASCII characters made safe for an HTTP request.

    The host is converted to its IDNA (punycode) form, every path segment
    is percent-encoded and the query string is re-encoded.  The scheme and
    the fragment are passed through unchanged.  Existing ``%XX`` escapes
    in the path are preserved, so calling the function on its own output
    returns the same URL.
    """
    splitted = urlsplit(url)  # split link
    # Split on "/" directly rather than going through PurePosixPath, which
    # has no parts at all for an empty path, drops trailing slashes and
    # collapses "//" and "." segments -- each of which changes the URL.
    # "%" is kept safe so already-escaped segments are not double-encoded.
    quoted_path = "/".join(
        quote(part, safe="%") for part in splitted.path.split("/")
    )
    return urlunsplit((
        splitted.scheme,
        splitted.netloc.encode("idna").decode("ascii"),  # idna
        quoted_path,
        # keep_blank_values so that "?a=&b=1" does not silently lose "a"
        urlencode(parse_qsl(splitted.query, keep_blank_values=True)),
        splitted.fragment,
    ))
Usage:
links = (
"https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A",
"https://stackoverflow.com/",
"https://www.google.com/search?q=√2&client=firefox-b-d",
"http://pfarmerü.com/"
)
print(*(normalize_url(link) for link in links), sep="\n")
Output:
https://myanimelist.net/anime/27899/Tokyo_Ghoul_%E2%88%9AA
https://stackoverflow.com/
https://www.google.com/search?q=%E2%88%9A2&client=firefox-b-d
http://xn--pfarmer-t2a.com/
instead of getting python to read √ as itself, I would have to translate the √ to %E2%88%9A in order to get python to output √
credit
#Olvin Roght

Python download images with alternating variables

I was trying to download images with url's that change but got an error.
url_image="http://www.joblo.com/timthumb.php?src=/posters/images/full/"+str(title_2)+"-poster1.jpg&h=333&w=225"
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
headers = {'User-Agent': user_agent}
req = urllib.request.Request(url_image, None, headers)
print(url_image)
#image, h = urllib.request.urlretrieve(url_image)
with urllib.request.urlopen(req) as response:
the_page = response.read()
#print (the_page)
with open('poster.jpg', 'wb') as f:
f.write(the_page)
Traceback (most recent call last):
File "C:\Users\luke\Desktop\scraper\imager finder.py", line 97, in
with urllib.request.urlopen(req) as response:
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
r = h.getresponse()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine:
My advice is to use urllib2. In addition, I've written a nice function (I think) that will also allow gzip encoding (to reduce bandwidth) if the server supports it. I use this for downloading social media files, but it should work for anything.
I would try to debug your code, but since it's just a snippet (and the error messages are formatted badly), it's hard to know exactly where your error is occurring (it's certainly not line 97 in your code snippet).
This isn't as short as it could be, but it's clear and reusable. This is Python 2.7; it looks like you're using Python 3, in which case you can look up other questions that address how to use the Python 3 equivalent of urllib2 (urllib.request).
import urllib2
import gzip
from StringIO import StringIO
def download(url):
    """
    Download and return the file specified in the URL; attempt to use
    gzip encoding if possible.

    Returns the response body as a byte string; a body sent with
    ``Content-Encoding: gzip`` is decompressed before being returned.
    Raises IOError if the URL cannot be opened.
    """
    request = urllib2.Request(url)
    request.add_header('Accept-Encoding', 'gzip')
    try:
        response = urllib2.urlopen(request)
    except Exception as e:
        # Build the message inline: no _ERRORS table is defined in this
        # snippet, so looking one up here would raise NameError and hide
        # the real failure.
        raise IOError("Failed to download (%s): %s" % (url, e))
    try:
        payload = response.read()
        content_encoding = response.info().get('Content-Encoding')
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    if content_encoding == 'gzip':
        buf = StringIO(payload)
        payload = gzip.GzipFile(fileobj=buf).read()
    return payload
def save_media(filename, media):
    """Write the byte string *media* to *filename*, replacing any existing file."""
    # The with-block closes the handle even if write() raises.
    with open(filename, "wb") as file_handle:
        file_handle.write(media)
title_2 = "10-cloverfield-lane"
media = download("http://www.joblo.com/timthumb.php?src=/posters/images/full/{}-poster1.jpg&h=333&w=225".format(title_2))
save_media("poster.jpg", media)

Freebase python

I tried this python example from freebase and run it in my windows and ubuntu machine.
http://mql.freebaseapps.com/ch04.html
import sys # Command-line arguments, etc.
import simplejson # JSON encoding.
import urllib # URI encoding.
import urllib2 # High-level URL content fetching.
# These are some constants we'll use.
SERVER = 'api.freebase.com' # Metaweb server
SERVICE = '/api/service/mqlread' # Metaweb service
# Compose our MQL query as a Python data structure.
# The query is an array in case multiple bands share the same name.
band = sys.argv[1] # The desired band, from command line.
query = [{'type': '/music/artist', # Our MQL query in Python.
'name': band, # Place the band in the query.
'album': [{ 'name': None, # None is Python's null.
'release_date': None,
'sort': 'release_date' }]}]
# Put the query in an envelope
envelope = {
'query': query, # The query property specifies the query.
'escape': False # Turns off HTML escaping.
}
# These five lines are the key code for using mqlread
encoded = simplejson.dumps(envelope) # JSON encode the envelope.
params = urllib.urlencode({'query':encoded}) # Escape request parameters.
url ='http://%s%s?%s' % (SERVER,SERVICE,params) # The URL to request.
f = urllib2.urlopen(url) # Open the URL as a file.
response = simplejson.load(f) # Read and JSON parse response.
# Check for errors and exit with a message if the query failed.
if response['code'] != '/api/status/ok': # If not okay...
error = response['messages'][0] # First msg object.
sys.exit('%s: %s' % (error['code'], error['message'])) # Display code,msg.
# No errors, so handle the result
result = response['result'] # Open the response envelope, get result.
# Check the number of matching bands
if len(result) == 0:
sys.exit('Unknown band')
elif len(result) > 1:
print "Warning: multiple bands named " + band + ". Listing first only."
result = result[0] # Get first band from array of matches.
if not result['album']: # Exit if band has no albums
sys.exit(band + ' has no known albums.')
for album in result['album']: # Loop through the result albums.
name = album['name'] # Album name.
date = album['release_date'] # Release date timestamp or null.
if not date: date = ''
else: date = ' [%s]' % date[0:4] # Just the 4-digit year in brackets.
print "%s%s" % (name, date) # Print name and date.
However, it did not work. Could someone explain why I got this error?
Traceback (most recent call last):
File "mql.py", line 29, in <module>
f = urllib2.urlopen(url) # Open the URL as a file.
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 394, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 412, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 372, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1199, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "C:\Python27\lib\urllib2.py", line 1168, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "C:\Python27\lib\httplib.py", line 955, in request
self._send_request(method, url, body, headers)
File "C:\Python27\lib\httplib.py", line 989, in _send_request
self.endheaders(body)
File "C:\Python27\lib\httplib.py", line 951, in endheaders
self._send_output(message_body)
File "C:\Python27\lib\httplib.py", line 811, in _send_output
self.send(msg)
File "C:\Python27\lib\httplib.py", line 773, in send
self.connect()
File "C:\Python27\lib\httplib.py", line 754, in connect
self.timeout, self.source_address)
File "C:\Python27\lib\socket.py", line 562, in create_connection
sock.connect(sa)
One more thing, I am using python 2.7 and i don't have any special proxy configuration for the internet setting.
api.freebase.com has been retired and the Python client library was never updated to work with the new endpoint.
It would be better if you can enclose your code within a try-except block as below:
try:
#all the logic to create a URL and store in a variable say url for MQLReader service
invoke urllib2.urlopen(url)
#perform other processing.
except urllib2.URLError, e:
print str(e)
This will give you an idea about what is going wrong.
I tried your code sample and I get an error saying that:
A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
Hence catching the exception and examining it will give you the root cause of the problem.

How do I get rid of spaces in the 'message' when sending SMS via Kannel

I've setup Kannel in Ubuntu using a USB Modem and I can send SMS via the browser using the URL as seen below
localhost:13013/cgi-bin/sendsms?username=kannel&password=kannel&to=+254781923855&text='Kid got swag'
In python, I have the following script which works only if the message to be sent does not have spaces.
import urllib.request
def send_sms(mobile_no, message):
    """Request the local Kannel ``sendsms`` endpoint to send *message* to *mobile_no*.

    NOTE: both values are interpolated into the query string exactly as
    given; no URL-encoding is applied before the request is made.
    """
    url="http://%s:%d/cgi-bin/sendsms?username=%s&password=%s&to=%s&text=%s" \
        % ('localhost', 13013, 'kannel', 'kannel', str(mobile_no), message)
    # The response object is not inspected; reaching the print below only
    # means urlopen() did not raise.
    f = urllib.request.urlopen(url)
    print("sms sent")
If I call the function with NO spaces in the message, it works and the message is sent.
sms.send_sms('+254781923855', 'kid_got_swag')
If I have spaces in the message, it fails with the error below
sms.send_sms('+254781923855', 'kid got swag')
Traceback (most recent call last):
File "/home/lukik/workspace/projx/src/short_message.py", line 24, in <module>
sms.send_sms('+254781923855', 'kid got swag')
File "/home/lukik/workspace/projx/src/short_message.py", line 18, in send_sms
f = urllib.request.urlopen(url)
File "/usr/lib/python3.2/urllib/request.py", line 139, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.2/urllib/request.py", line 376, in open
response = meth(req, response)
File "/usr/lib/python3.2/urllib/request.py", line 488, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.2/urllib/request.py", line 414, in error
return self._call_chain(*args)
File "/usr/lib/python3.2/urllib/request.py", line 348, in _call_chain
result = func(*args)
File "/usr/lib/python3.2/urllib/request.py", line 496, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: Bad Request
I've tried other variants of calling urllib but they all fail coz of the spaces in the message....
In your request you send via browser, the message is inside quotes -
&text='Kid got swag'
Try that in your request -
url="http://%s:%d/cgi-bin/sendsms?username=%s&password=%s&to=%s&text='%s'" \
% ('localhost', 13013, 'kannel', 'kannel', str(mobile_no), message)
Notice the single quotes at &text='%s'.
PS: I'd recommend using requests for requests like this. You could construct your urls better that way, like this -
>>> payload = {'key1': 'value1', 'key2': 'value2'}
>>> r = requests.get("http://httpbin.org/get", params=payload)
URLs are not permitted to contain spaces. When you tried in your browser, the browser took care of correctly encoding the URL before issuing the request. In your program you need to encode the URL. Fortunately urllib has built-in functions to take care of the details.
http://docs.python.org/3.3/library/urllib.parse.html#url-quoting
You need to URL-encode the values you pass as parameters, otherwise a broken URL gets constructed, that's why the request fails. I believe urllib.parse.urlencode does what you need.

python unhashable type - posting xml data

First, I'm not a python programmer. I'm an old C dog that's learned new Java and PHP tricks, but python looks like a pretty cool language.
I'm getting an error that I can't quite follow. The error follows the code below.
import httplib, urllib
url = "pdb-services-beta.nipr.com"
xml = '<?xml version="1.0"?><!DOCTYPE SCB_Request SYSTEM "http://www.nipr.com/html/SCB_XML_Request.dtd"><SCB_Request Request_Type="Create_Report"><SCB_Login_Data CustomerID="someuser" Passwd="somepass" /><SCB_Create_Report_Request Title=""><Producer_List><NIPR_Num_List_XML><NIPR_Num NIPR_Num="8980608" /><NIPR_Num NIPR_Num="7597855" /><NIPR_Num NIPR_Num="10166016" /></NIPR_Num_List_XML></Producer_List></SCB_Create_Report_Request></SCB_Request>'
params = {}
params['xmldata'] = xml
headers = {}
headers['Content-type'] = 'text/xml'
headers['Accept'] = '*/*'
headers['Content-Length'] = "%d" % len(xml)
connection = httplib.HTTPSConnection(url)
connection.set_debuglevel(1)
connection.request("POST", "/pdb-xml-reports/scb_xmlclient.cgi", params, headers)
response = connection.getresponse()
print response.status, response.reason
data = response.read()
print data
connection.close
Here's the error:
Traceback (most recent call last):
File "C:\Python27\tutorial.py", line 14, in connection.request("POST", "/pdb-xml-reports/scb_xmlclient.cgi", params, headers)
File "C:\Python27\lib\httplib.py", line 958, in request self._send_request(method, url, body, headers)
File "C:\Python27\lib\httplib.py", line 992, in _send_request self.endheaders(body)
File "C:\Python27\lib\httplib.py", line 954, in endheaders self._send_output(message_body)
File "C:\Python27\lib\httplib.py", line 818, in _send_output self.send(message_body)
File "C:\Python27\lib\httplib.py", line 790, in send self.sock.sendall(data)
File "C:\Python27\lib\ssl.py", line 229, in sendall v = self.send(data[count:])
TypeError: unhashable type
My log file says that the xmldata parameter is empty.
Any ideas?
I guess params has to be a string when passed to .request; this would explain the error, since a dict is not hashable.
Try to encode your params first with
params = urllib.urlencode(params)
You can find another code example too at the bottom of:
http://docs.python.org/release/3.1.5/library/http.client.html
Thanks for the feedback.
I guess I was making this too hard. I went a different route and it seems to work.
import urllib2
URL = "https://pdb-services-beta.nipr.com/pdb-xml-reports/scb_xmlclient.cgi"
DATA = 'xmldata=<?xml version="1.0"?><!DOCTYPE SCB_Request SYSTEM "http://www.nipr.com/html/SCB_XML_Request.dtd"><SCB_Request Request_Type="Create_Report"><SCB_Login_Data CustomerID="someuser" Passwd="somepass" /><SCB_Create_Report_Request Title=""><Producer_List><NIPR_Num_List_XML><NIPR_Num NIPR_Num="8980608" /></NIPR_Num_List_XML></Producer_List></SCB_Create_Report_Request></SCB_Request>'
req = urllib2.Request(url=URL, data=DATA)
f = urllib2.urlopen(req)
print f.read()

Categories

Resources