How to properly encode string inside urlopen? - python

Problem: I have a text file with names written in Russian. I take each name from the text file and form a request to Wikipedia, using the line from the text file as the page title. Then I want to get information about all the images that exist on that page.
Program:
with open('names-video.txt', "r", encoding='Windows-1251') as file:
for line in file.readlines():
print(line)
name = "_".join(line.split())
print(name)
html = urlopen(f'https://ru.wikipedia.org/wiki/{name}')
bs = BeautifulSoup(html, 'html.parser')
images = bs.findAll('img', {'src': re.compile('.jpg')})
print(images[0])
names-video.txt:
Алимпиев, Виктор Гелиевич
Андреев, Алексей Викторович (художник)
Баевер, Антонина
Булдаков, Алексей Александрович
Жестков, Максим Евгеньевич
Канис, Полина Владимировна
Мустафин, Денис Рафаилович
Преображенский, Кирилл Александрович
Селезнёв, Владимир Викторович
Сяйлев, Андрей Фёдорович
Шерстюк, Татьяна Александровна
Error message:
error from callback <bound method SocketHandler.handle_message of <amino.socket.SocketHandler object at 0x0000018B92600FA0>>: 'ascii' codec can't encode characters in position 10-17: ordinal not in range(128)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\websocket\_app.py", line 344, in _callback
callback(*args)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 80, in handle_message
self.client.handle_socket_message(data)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\client.py", line 345, in handle_socket_message
return self.callbacks.resolve(data)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 204, in resolve
return self.methods.get(data["t"], self.default)(data)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 192, in _resolve_chat_message
return self.chat_methods.get(key, self.default)(data)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 221, in on_text_message
def on_text_message(self, data): self.call(getframe(0).f_code.co_name, objects.Event(data["o"]).Event)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 209, in call
handler(data)
File "C:\Users\1\Desktop\python-bots\music_bot\bot.py", line 56, in on_text_message
html = urlopen(f'https://ru.wikipedia.org/wiki/{name}')
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 517, in open
response = self._open(req, data)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 1385, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 1342, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1266, in _send_request
self.putrequest(method, url, **skips)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1104, in putrequest
self._output(self._encode_request(request))
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1184, in _encode_request
return request.encode('ascii')
Question: For some reason the code breaks on urlopen(). print(line) and print(name) work just fine. What can be the problem here? I've been trying to tackle this issue for quite a while and I will appreciate any solution, thanks in advance.

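The HTTP client encodes the request line as ASCII (see request.encode('ascii') at the bottom of the traceback), so you'll need to percent-encode the non-ASCII characters to make it a proper URI: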
from urllib.parse import quote
...
name = "_".join(line.split())
# Percent encode the UTF-8 characters
name = quote(name)
print(name)
...

Related

Why does urllib.request.urlretrieve fail when there is a special character ("ä") in the URL?

I want to save an image from reddit to a local file:
import urllib.request
urllib.request.urlretrieve(post.url, "local-filename.jpg")
To reproduce, simply replace post.url with a URL containing "ä".
But I stumbled over an error one day: the reddit post URL had an "ä" in the title.
I understand that this has to do with a basic Python unicode issue, and I read UnicodeEncodeError: 'ascii' codec can't encode character u'\xa0' in position 20: ordinal not in range(128) , but I do not understand how to apply the fix to my problem.
This is the error i got:
https://www.reddit.com/r/OkBrudiMongo/comments/cogkye/hä_was_ist_los_petersans/ Hä, was ist los Peter-Sans?!
Traceback (most recent call last):
File "C:/Users/Administrator/Documents/DefaultProject/Main.py", line 55, in <module>
Mainbot()
File "C:/Users/Administrator/Documents/DefaultProject/Main.py", line 48, in Mainbot
Mainbot()
File "C:/Users/Administrator/Documents/DefaultProject/Main.py", line 38, in Mainbot
urllib.request.urlretrieve(post.url, "local-filename.jpg")
File "C:\Program Files\Python37\lib\urllib\request.py", line 247, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Program Files\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\Python37\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Program Files\Python37\lib\urllib\request.py", line 543, in _open
'_open', req)
File "C:\Program Files\Python37\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Program Files\Python37\lib\urllib\request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Program Files\Python37\lib\urllib\request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "C:\Program Files\Python37\lib\http\client.py", line 1229, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Program Files\Python37\lib\http\client.py", line 1240, in _send_request
self.putrequest(method, url, **skips)
File "C:\Program Files\Python37\lib\http\client.py", line 1107, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\xe4' in position 37: ordinal not in range(128)

Invalid URL error is raised when running urllib review request

My code is as follows:
import urllib.request as urllib
def read_text():
quotes = open(r"C:\Users\hjayasinghe2\Desktop\Hasara Work\Learn\demofile.txt")
contents_of_file = quotes.read()
print(contents_of_file)
quotes.close()
check_pofanity(contents_of_file)
def check_pofanity(text_to_check):
connection = urllib.urlopen("http://www.wdyl.com/profanity?q= " + text_to_check)
output = connection.read()
print(output)
connection.close()
read_text()
The error I get is this:
Traceback (most recent call last):
File "C:/Users/hjayasinghe2/Desktop/Hasara Work/Learn/check_profanity.py", line 16, in <module>
read_text()
File "C:/Users/hjayasinghe2/Desktop/Hasara Work/Learn/check_profanity.py", line 8, in read_text
check_pofanity(contents_of_file)
File "C:/Users/hjayasinghe2/Desktop/Hasara Work/Learn/check_profanity.py", line 11, in check_pofanity
connection = urllib.urlopen("http://www.wdyl.com/profanity?q= " + text_to_check)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 543, in _open
'_open', req)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1345, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 1244, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 1255, in _send_request
self.putrequest(method, url, **skips)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 1117, in putrequest
raise InvalidURL(f"URL can't contain control characters. {url!r} "
http.client.InvalidURL: URL can't contain control characters. '/profanity?q= Video provides a powerful way to help you prove your point. When you click Online Video, you can paste in the embed code for the video you want to add.' (found at least ' ')
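You'll need to URL-encode the query so that spaces and other special characters in the text are escaped (and drop the stray space after "q="). Try something like: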
import urllib, urllib.parse
url = "http://www.wdyl.com/profanity?q=" + urllib.parse.quote(text_to_check)
connection = urllib.urlopen(url)
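See https://docs.python.org/3/library/urllib.parse.html#urllib.parse.quote
Note that in Python 3 urlopen lives in urllib.request, so with a plain "import urllib" the call has to be written as urllib.request.urlopen(url) (after "import urllib.request").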

Having trouble reading from a URL with non-ascii characters despite using quote_plus

Note that the link listed in the comment is for Python 2.7, but this question pertains to Python 3.7.
I'm using Python 3.7 and Django. I want to read from a URL that has special characters in its string, but get errors when I try the traditional way ...
>>> url = "https://www.supergaming.com/f/gaming/article/pvmqe/was_browsing_the_steam_app_reviews_and_ಠ_ಠ/"
...
>>> html = urllib2.urlopen(req, 5000).read()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 543, in _open
'_open', req)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1229, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1240, in _send_request
self.putrequest(method, url, **skips)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1107, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\u0ca0' in position 69: ordinal not in range(128)
So I tried the solution recommended here -- How to convert a url string to safe characters with python? , but I'm still unable to read the URL
>>> urllib.parse.quote_plus(url)
'https%3A%2F%2Fwww.supergaming.com%2Ff%2Fgaming%2Farticle%2Fpvmqe%2Fwas_browsing_the_steam_app_reviews_and_%E0%B2%A0_%E0%B2%A0%2F'
>>> req = urllib2.Request(urllib.parse.quote_plus(url))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 328, in __init__
self.full_url = url
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 354, in full_url
self._parse()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: 'https%3A%2F%2Fwww.supergaming.com%2Fr%2Fgaming%2Farticle%2Fpvmqe%2Fwas_browsing_the_steam_app_reviews_and_%E0%B2%A0_%E0%B2%A0%2F'
What's the proper way to read from a URL if it contains special characters?

python mechanize file upload UnicodeDecode error

So I have a little script that I would like to use to upload some PDFs to my citation-site-of-choice (citeulike.org)
The thing is, it's not working. It does this:
so want to upload /Users/willwade/Dropbox/Papers/price_promoting_643127.pdf to 12589610
Traceback (most recent call last):
File "citeuupload.py", line 167, in <module>
cureader.parseUserBibTex()
File "citeuupload.py", line 160, in parseUserBibTex
self.uploadFileToCitation(b['citeulike-article-id'],self.localpapers+fileorfalse)
File "citeuupload.py", line 138, in uploadFileToCitation
resp = self.browser.submit()
File "build/bdist.macosx-10.8-intel/egg/mechanize/_mechanize.py", line 541, in submit
File "build/bdist.macosx-10.8-intel/egg/mechanize/_mechanize.py", line 203, in open
File "build/bdist.macosx-10.8-intel/egg/mechanize/_mechanize.py", line 230, in _mech_open
File "build/bdist.macosx-10.8-intel/egg/mechanize/_opener.py", line 193, in open
File "build/bdist.macosx-10.8-intel/egg/mechanize/_urllib2_fork.py", line 344, in _open
File "build/bdist.macosx-10.8-intel/egg/mechanize/_urllib2_fork.py", line 332, in _call_chain
File "build/bdist.macosx-10.8-intel/egg/mechanize/_urllib2_fork.py", line 1142, in http_open
File "build/bdist.macosx-10.8-intel/egg/mechanize/_urllib2_fork.py", line 1115, in do_open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 955, in request
self._send_request(method, url, body, headers)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 989, in _send_request
self.endheaders(body)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 951, in endheaders
self._send_output(message_body)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 809, in _send_output
msg += message_body
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 544: ordinal not in range(128)
and the code:
def uploadFileToCitation(self,artid,file):
print 'so want to upload', file, ' to ', artid
self.browser.open('http://www.citeulike.org/user/'+cUser+'/article/'+artid)
self.browser.select_form(name="fileupload_frm")
self.browser.form.add_file(open(file, 'rb'), 'application/pdf', file, name='file')
try:
resp = self.browser.submit()
self.wait_for_api_limit()
except mechanize.HTTPError, e:
print 'error'
print e.getcode()
print resp.read()
exit()
NB: I can see it's reading in the file correctly (and it does exist). Also note that I'm doing this elsewhere
self.browser = mechanize.Browser()
self.browser.set_handle_robots(False)
self.browser.addheaders = [
("User-agent", 'me#me.com citeusyncpy/1.0'),
]
Full code is here
Try to check this similar question.
To clarify, the message is constructed in httplib from the method, URL, headers, etc. If any of these is Unicode, the whole string gets converted to Unicode (I presume this is normal Python behavior). Then if you try to append a UTF-8 string you get the error I described in the original question...
From the looks of it, it's a problem with encoding that a proper header can fix.
Also you can check this issue.

How do I fetch htmlcode from a url with a colon ":" in the url?

I've been successful in retrieving the html code in regular webpages using python and the urllib2 module.
But when I try using it with a webpage that has a colon it doesn't work.
This code:
f = urllib2.urlopen("http://http://gulasidorna.eniro.se/hitta:svenska+kyrkan/")
htmlcode = f.read()
print htmlcode
generates the following error message:
File "/Users/jonathan/Documents/Dropbox/Python/eniro.py", line 137, in <module>
f = urllib2.urlopen("http://http://gulasidorna.eniro.se/hitta:svenska+kyrkan/")
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 394, in open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 412, in _open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 372, in _call_chain
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1199, in http_open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1140, in do_open
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 693, in _init_
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 718, in _set_hostport
httplib.InvalidURL: nonnumeric port: ''
This should work; you have an extra http:// at the start of the URL:
f = urllib2.urlopen("http://gulasidorna.eniro.se/hitta:svenska+kyrkan/")
htmlcode = f.read()
print htmlcode

Categories

Resources