is it possible to pass the √ through this untouched or am i asking too much
import urllib.request
path = 'html'
links = 'links'
with open(links, 'r', encoding='UTF-8') as links:
for link in links: #for each link in the file
print(link)
with urllib.request.urlopen(link) as linker: #get the html
print(linker)
with open(path, 'ab') as f: #append the html to html
f.write(linker.read())
links
https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A
output
File "PYdown.py", line 7, in <module>
with urllib.request.urlopen(link) as linker:
File "/usr/lib64/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib64/python3.6/urllib/request.py", line 526, in open
response = self._open(req, data)
File "/usr/lib64/python3.6/urllib/request.py", line 544, in _open
'_open', req)
File "/usr/lib64/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/usr/lib64/python3.6/urllib/request.py", line 1392, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/usr/lib64/python3.6/urllib/request.py", line 1349, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/usr/lib64/python3.6/http/client.py", line 1254, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib64/python3.6/http/client.py", line 1265, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib64/python3.6/http/client.py", line 1132, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\u221a' in position 29: ordinal not in range(128)
You need to quote Unicode chars in URL. You have file which contains list of urls you need to open, so you need to split each url (using urllib.parse.urlsplit()), quote (with urllib.parse.quote()) host and every part of path (to split paths you can use pathlib.PurePosixPath.parts) and then form URL back (using urllib.parse.urlunsplit()).
from pathlib import PurePosixPath
from urllib.parse import urlsplit, urlunsplit, quote, urlencode, parse_qsl
def normalize_url(url):
splitted = urlsplit(url) # split link
path = PurePosixPath(splitted.path) # initialize path
parts = iter(path.parts) # first element always "/"
quoted_path = PurePosixPath(next(parts)) # "/"
for part in parts:
quoted_path /= quote(part) # quote each part
return urlunsplit((
splitted.scheme,
splitted.netloc.encode("idna").decode(), # idna
str(quoted_path),
urlencode(parse_qsl(splitted.query)), # force encode query
splitted.fragment
))
Usage:
links = (
"https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A",
"https://stackoverflow.com/",
"https://www.google.com/search?q=√2&client=firefox-b-d",
"http://pfarmerü.com/"
)
print(*(normalize_url(link) for link in links), sep="\n")
Output:
https://myanimelist.net/anime/27899/Tokyo_Ghoul_%E2%88%9AA
https://stackoverflow.com/
https://www.google.com/search?q=%E2%88%9A2&client=firefox-b-d,
http://xn--pfarmer-t2a.com/
instead of getting python to read √ as itself, I would have to translate the √ to %E2%88%9A in order to get python to output √
credit
#Olvin Roght
Related
I have searched a lot of similar question on SO, but did not find an exact match to my case.
I am trying to download a video using python 2.7
Here is my code for downloading the video
import urllib2
from bs4 import BeautifulSoup as bs
with open('video.txt','r') as f:
last_downloaded_video = f.read()
webpage = urllib2.urlopen('http://*.net/watch/**-'+last_downloaded_video)
soup = bs(webpage)
a = []
for link in soup.find_all('a'):
if link.has_attr('data-video-id'):
a.append(link)
#try just with first data-video-id
id = a[0]['data-video-id']
webpage2 = urllib2.urlopen('http://*/video/play/'+id)
soup = bs(webpage2)
string = str(soup.find_all('script')[2])
print string
url = string.split(': ')[1].split(',')[0]
url = url.replace('"','')
print url
print type(url)
video = urllib2.urlopen(url).read()
filename = "video.mp4"
with open(filename,'wb') as f:
f.write(video)
This code gives an unknown url type error. The traceback is
Traceback (most recent call last):
File "naruto.py", line 26, in <module>
video = urllib2.urlopen(url).read()
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 427, in _open
'unknown_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1247, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
However, when i store the same url in a variable and attempt to download it from terminal, no error is shown.
I am confused as to what the problem is.
I got a similar question in python mailing list
It's hard to tell without seeing the HTML from the page that you are scraping, however, a stray ' (single quote) character at the beginning of the URL might be the cause - this causes the same exception:
>>> import urllib2
>>> urllib2.urlopen("'http://blah.com")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "urllib2.py", line 404, in open
response = self._open(req, data)
File "urllib2.py", line 427, in _open
'unknown_open', req)
File "urllib2.py", line 382, in _call_chain
result = func(*args)
File "urllib2.py", line 1249, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
So, try cleaning up your URL and remove any stray quotes.
Update after OP feedback:
The results of the print statement indicate that the URL has a single quote character at the beginning and end of the URL string. There should not any quotes of any type surrounding the URL when it is passed to urlopen(). You can remove leading and trailing quotes (both single and double) from the URL string with this:
url = url.strip('\'"')
This question already has answers here:
How to fetch a non-ascii url with urlopen?
(10 answers)
UnicodeEncodeError: 'ascii' codec can't encode character '\xe9' - -when using urlib.request python3
(2 answers)
Closed 6 years ago.
I wrote a Wikipedia scraper in Python last week.
It scrapes French pages, so I must manage UTF-8 encoding to avoid errors. I did this with these lines at the beginning of my script:
#!/usr/bin/python
# -*- coding: utf-8 -*-
I also encode the scraped string like this:
adresse = monuments[1].get_text().encode('utf-8')
My first script worked perfectly fine with Python 2.7, but I rewrote it for Python 3 (especially to use urllib.request) and UTF-8 doesn't work anymore.
I got these errors after scraping the first few elements:
File "scraper_monu_historiques_ge_py3.py", line 19, in <module>
url = urllib.request.urlopen(url_ville).read() # et on ouvre chacune d'entre elles
File "/usr/lib/python3.4/urllib/request.py", line 153, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 455, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 473, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 433, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1217, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/usr/lib/python3.4/urllib/request.py", line 1174, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1090, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1118, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib/python3.4/http/client.py", line 975, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\xe9' in position 58: ordinal not in range(128)
I don't understand why, because it worked fine in Python 2.7... I published a version of this WIP on Github.
You are passing a string which contain non-ASCII characters to urllib.urlopen, which isn't a valid URI (it is a valid IRI or International Resource Identifier, though).
You need to make the IRI a valid URI before passing it to urlopen. The specifics of this
depend on which part of the IRI contain non-ASCII characters: the domain part should be encoded using Punycode, while the path should use percent-encoding.
Since your problem is exclusively due to the path containing Unicode characters, assuming your IRI is stored in the variable iri, you can fix it using the following:
import urllib.parse
import urllib.request
split_url = list(urllib.parse.urlsplit(iri))
split_url[2] = urllib.parse.quote(split_url[2]) # the third component is the path of the URL/IRI
url = urllib.parse.urlunsplit(split_url)
urllib.request.urlopen(url).read()
However, if you can avoid urllib and have the option of using the requests library instead, I would recommend doing so. The library is easier to use and has automatic IRI handling.
I learned how to download a picture from a certain URL with python as:
import urllib
imgurl="http://www.digimouth.com/news/media/2011/09/google-logo.jpg"
resource = urllib.urlopen(imgurl)
output = open("test.jpg","wb")
output.write(resource.read())
output.close()
and it worked well, but when i changed the URL to
imgurl="http://farm1.static.flickr.com/96/242125326_607a826afe_o.jpg"
it did not work, and gave the information
File "face_down.py", line 3, in <module>
resource = urllib2.urlopen(imgurl)
File "D:\Python27\another\Lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "D:\Python27\another\Lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "D:\Python27\another\Lib\urllib2.py", line 449, in _open
'_open', req)
File "D:\Python27\another\Lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "D:\Python27\another\Lib\urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "D:\Python27\another\Lib\urllib2.py", line 1197, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 10060] >
and I tried to open the latter image URL, and it could be shown as the former, I have no idea to solve it~~ help~~~~
You can try using requests module. The response will be some bytes. So, you can iterate over those byte chunks and write to the file.
import requests
url = "http://farm1.static.flickr.com/96/242125326_607a826afe_o.jpg"
r = requests.get(url)
path = "filename.jpg"
with open(path, 'wb') as f:
for chunk in r:
f.write(chunk)
I looked up both of the addresses and the second one does not lead anywhere. That is probably the problem.
import urllib
imgurl="webpage url"
openimg = urllib.urlopen(imgurl) #opens image (prepares it)
img = open("test.jpg","wb") #opens the img to read it
img.write(openimg.read()) #prints it to the console
img.close() #closes img
Try the link again in your webpage and if it turns up with "webpage not available" that is probably the problem.
This question already has answers here:
UnicodeEncodeError: 'ascii' codec can't encode character '\xe9' - -when using urlib.request python3
(2 answers)
Closed 3 years ago.
The short version: I have a variable s = 'bär'. I need to convert s to ASCII so that s = 'b%C3%A4r'.
Long version:
I'm using urllib.request.urlopen() to read an mp3 pronunciation file from URL. This has worked very well, except I ran into a problem because the URLs often contain unicode characters. For example, the German "Bär". The full URL is https://d7mj4aqfscim2.cloudfront.net/tts/de/token/bär. Indeed, typing this into Chrome as a URL works, and navigates me to the mp3 file without problems. However, feeding this same URL to urllib creates a problem.
I determined this was a unicode problem because the stack-trace reads:
Traceback (most recent call last):
File "importer.py", line 145, in <module>
download_file(tuple[1], tuple[0], ".mp3")
File "importer.py", line 81, in download_file
with urllib.request.urlopen(url) as in_stream, open(to_fname+ext, 'wb') as out_file: #`with object as name:` safely __enter__() and __exit__() the runtime of object. `as` assigns `name` as referring to the object `object`.
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1283, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1240, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1083, in request
self._send_request(method, url, body, headers)
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1118, in _send_request
self.putrequest(method, url, **skips)
File "C:\Users\quesm\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 960, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\xfc' in position 19: ordinal not in range(128)
... and other than the obvious UnicodeEncodeError, I can see it's trying to encode() to ASCII.
Interestingly, when I copied the URL from Chrome (instead of simply typing it into the Python interpreter), it translated the bär to b%C3%A4r. When I feed this to urllib.request.urlopen(), it processes fine, because all of these characters are ASCII. So my goal is to make this conversion within my program. I tried to get my original string to the unicode equivalent, but unicodedata.normalize() in all of its variants didn't work; further, I'm not sure how to store the Unicode as ASCII, given that Python 3 stores all strings as Unicode and thus makes no attempt to convert the text.
Use urllib.parse.quote:
>>> urllib.parse.quote('bär')
'b%C3%A4r'
>>> urllib.parse.urljoin('https://d7mj4aqfscim2.cloudfront.net/tts/de/token/',
... urllib.parse.quote('bär'))
'https://d7mj4aqfscim2.cloudfront.net/tts/de/token/b%C3%A4r'
EDIT: I've majorly edited the content of this post since the original to specify my problem:
I am writing a program to download webcomics, and I'm getting this weird error when downloading a page of the comic. The code I am running essentially boils down to the following line followed by the error. I do not know what is causing this error, and it is confusing me greatly.
>>> urllib.request.urlopen("http://abominable.cc/post/47699281401")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 470, in open
response = meth(req, response)
File "/usr/lib/python3.4/urllib/request.py", line 580, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.4/urllib/request.py", line 502, in error
result = self._call_chain(*args)
File "/usr/lib/python3.4/urllib/request.py", line 442, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 685, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.4/urllib/request.py", line 464, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 482, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 442, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1211, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.4/urllib/request.py", line 1183, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1137, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1172, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib/python3.4/http/client.py", line 1014, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 37-38: ordinal not in range(128)
The entirety of my program can be found here: https://github.com/nstephenh/pycomic
I was having the same problem. The root cause is that the remote server isn't playing by the rules. HTTP Headers are supposed to be US-ASCII only but apparently the leading http webservers (apache2, nginx) doesn't care and send direct UTF-8 encoded string.
However in http.client the parse_header function fetch the headers as iso-8859, and the default HTTPRedirectHandler in urllib doesn't care to quote the location or URI header, resulting in the aformentioned error.
I was able to 'work around' both thing by overriding the default HTTPRedirectHandler and adding three line to counter the latin1 decoding and add a path quote:
import urllib.request
from urllib.error import HTTPError
from urllib.parse import (
urlparse, quote, urljoin, urlunparse)
class UniRedirectHandler(urllib.request.HTTPRedirectHandler):
# Implementation note: To avoid the server sending us into an
# infinite loop, the request object needs to track what URLs we
# have already seen. Do this by adding a handler-specific
# attribute to the Request object.
def http_error_302(self, req, fp, code, msg, headers):
# Some servers (incorrectly) return multiple Location headers
# (so probably same goes for URI). Use first header.
if "location" in headers:
newurl = headers["location"]
elif "uri" in headers:
newurl = headers["uri"]
else:
return
# fix a possible malformed URL
urlparts = urlparse(newurl)
# For security reasons we don't allow redirection to anything other
# than http, https or ftp.
if urlparts.scheme not in ('http', 'https', 'ftp', ''):
raise HTTPError(
newurl, code,
"%s - Redirection to url '%s' is not allowed" % (msg, newurl),
headers, fp)
if not urlparts.path:
urlparts = list(urlparts)
urlparts[2] = "/"
else:
urlparts = list(urlparts)
# Header should only contain US-ASCII chars, but some servers do send unicode data
# that should be quoted back before reused
# Need to re-encode the string as iso-8859-1 before use of ""quote"" to cancel the effet of parse_header() in http/client.py
urlparts[2] = quote(urlparts[2].encode('iso-8859-1'))
newurl = urlunparse(urlparts)
newurl = urljoin(req.full_url, newurl)
# XXX Probably want to forget about the state of the current
# request, although that might interact poorly with other
# handlers that also use handler-specific request attributes
new = self.redirect_request(req, fp, code, msg, headers, newurl)
if new is None:
return
# loop detection
# .redirect_dict has a key url if url was previously visited.
if hasattr(req, 'redirect_dict'):
visited = new.redirect_dict = req.redirect_dict
if (visited.get(newurl, 0) >= self.max_repeats or
len(visited) >= self.max_redirections):
raise HTTPError(req.full_url, code,
self.inf_msg + msg, headers, fp)
else:
visited = new.redirect_dict = req.redirect_dict = {}
visited[newurl] = visited.get(newurl, 0) + 1
# Don't close the fp until we are sure that we won't use it
# with HTTPError.
fp.read()
fp.close()
return self.parent.open(new, timeout=req.timeout)
http_error_301 = http_error_303 = http_error_307 = http_error_302
[...]
# Change default Redirect Handler in urllib, should be done once at the beginning of the program
opener = urllib.request.build_opener(UniRedirectHandler())
urllib.request.install_opener(opener)
This is python3 code but should be easily adapted for python2 if need be.