Invalid URL error is raised when running urllib review request - python

My code is as follows:
import urllib.request as urllib
def read_text():
quotes = open(r"C:\Users\hjayasinghe2\Desktop\Hasara Work\Learn\demofile.txt")
contents_of_file = quotes.read()
print(contents_of_file)
quotes.close()
check_pofanity(contents_of_file)
def check_pofanity(text_to_check):
connection = urllib.urlopen("http://www.wdyl.com/profanity?q= " + text_to_check)
output = connection.read()
print(output)
connection.close()
read_text()
The error I get is this:
Traceback (most recent call last):
File "C:/Users/hjayasinghe2/Desktop/Hasara Work/Learn/check_profanity.py", line 16, in <module>
read_text()
File "C:/Users/hjayasinghe2/Desktop/Hasara Work/Learn/check_profanity.py", line 8, in read_text
check_pofanity(contents_of_file)
File "C:/Users/hjayasinghe2/Desktop/Hasara Work/Learn/check_profanity.py", line 11, in check_pofanity
connection = urllib.urlopen("http://www.wdyl.com/profanity?q= " + text_to_check)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 543, in _open
'_open', req)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1345, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 1244, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 1255, in _send_request
self.putrequest(method, url, **skips)
File "C:\Users\hjayasinghe2\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 1117, in putrequest
raise InvalidURL(f"URL can't contain control characters. {url!r} "
http.client.InvalidURL: URL can't contain control characters. '/profanity?q= Video provides a powerful way to help you prove your point. When you click Online Video, you can paste in the embed code for the video you want to add.' (found at least ' ')

You'll need to URL encode the query. Try something like:
import urllib.parse
import urllib.request

# Percent-encode the text so that spaces and other characters that are not
# allowed in a URL are escaped before the request is built.
url = "http://www.wdyl.com/profanity?q=" + urllib.parse.quote(text_to_check)
# Call urlopen through urllib.request explicitly: the bare `urllib` package
# has no `urlopen` attribute in Python 3.
connection = urllib.request.urlopen(url)
See https://docs.python.org/3/library/urllib.parse.html#urllib.parse.quote

Related

How to properly encode string inside urlopen?

Problem: I have a text file with names written in Russian. I take each name from the text file and form a request to Wikipedia with the line from the text file as the page title. Then I want to get information about all existing images on that page.
Program:
with open('names-video.txt', "r", encoding='Windows-1251') as file:
for line in file.readlines():
print(line)
name = "_".join(line.split())
print(name)
html = urlopen(f'https://ru.wikipedia.org/wiki/{name}')
bs = BeautifulSoup(html, 'html.parser')
images = bs.findAll('img', {'src': re.compile('.jpg')})
print(images[0])
names-video.txt:
Алимпиев, Виктор Гелиевич
Андреев, Алексей Викторович (художник)
Баевер, Антонина
Булдаков, Алексей Александрович
Жестков, Максим Евгеньевич
Канис, Полина Владимировна
Мустафин, Денис Рафаилович
Преображенский, Кирилл Александрович
Селезнёв, Владимир Викторович
Сяйлев, Андрей Фёдорович
Шерстюк, Татьяна Александровна
Error message:
error from callback <bound method SocketHandler.handle_message of <amino.socket.SocketHandler object at 0x0000018B92600FA0>>: 'ascii' codec can't encode characters in position 10-17: ordinal not in range(128)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\websocket\_app.py", line 344, in _callback
callback(*args)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 80, in handle_message
self.client.handle_socket_message(data)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\client.py", line 345, in handle_socket_message
return self.callbacks.resolve(data)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 204, in resolve
return self.methods.get(data["t"], self.default)(data)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 192, in _resolve_chat_message
return self.chat_methods.get(key, self.default)(data)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 221, in on_text_message
def on_text_message(self, data): self.call(getframe(0).f_code.co_name, objects.Event(data["o"]).Event)
File "C:\Users\1\Desktop\ИНФА\pycharm\venv\lib\site-packages\amino\socket.py", line 209, in call
handler(data)
File "C:\Users\1\Desktop\python-bots\music_bot\bot.py", line 56, in on_text_message
html = urlopen(f'https://ru.wikipedia.org/wiki/{name}')
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 517, in open
response = self._open(req, data)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 1385, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 1342, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1255, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1266, in _send_request
self.putrequest(method, url, **skips)
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1104, in putrequest
self._output(self._encode_request(request))
File "C:\Users\1\AppData\Local\Programs\Python\Python39\lib\http\client.py", line 1184, in _encode_request
return request.encode('ascii')
Question: For some reason the code breaks on urlopen(). print(line) and print(name) work just fine. What can be the problem here? I've been trying to tackle this issue for quite a while and I will appreciate any solution, thanks in advance.
You'll need to percent encode the non-ASCII characters to make it a proper URI:
from urllib.parse import quote
...
name = "_".join(line.split())
# Percent encode the UTF-8 characters
name = quote(name)
print(name)
...

Having trouble reading from a URL with non-ascii characters despite using quote_plus

Note the link listed in the comment is for Python 2.7, but this question pertains to Python 3.7.
I'm using Python 3.7 and Django. I want to read from a URL that has special characters in its string, but get errors when I try the traditional way ...
>>> url = "https://www.supergaming.com/f/gaming/article/pvmqe/was_browsing_the_steam_app_reviews_and_ಠ_ಠ/"
...
>>> html = urllib2.urlopen(req, 5000).read()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 543, in _open
'_open', req)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1229, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1240, in _send_request
self.putrequest(method, url, **skips)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1107, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\u0ca0' in position 69: ordinal not in range(128)
So I tried the solution recommended in "How to convert a url string to safe characters with python?", but I'm still unable to read the URL:
>>> urllib.parse.quote_plus(url)
'https%3A%2F%2Fwww.supergaming.com%2Ff%2Fgaming%2Farticle%2Fpvmqe%2Fwas_browsing_the_steam_app_reviews_and_%E0%B2%A0_%E0%B2%A0%2F'
>>> req = urllib2.Request(urllib.parse.quote_plus(url))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 328, in __init__
self.full_url = url
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 354, in full_url
self._parse()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: 'https%3A%2F%2Fwww.supergaming.com%2Fr%2Fgaming%2Farticle%2Fpvmqe%2Fwas_browsing_the_steam_app_reviews_and_%E0%B2%A0_%E0%B2%A0%2F'
What's the proper way to read from a URL if it contains special characters?

How to download .torrent file using urllib in python?

I want to download a .torrent file from the download link. I want the file to be saved in .torrent format in the project folder.
I've tried the following code:
import urllib.request
url = 'https://torcache.net/torrent/92B4D5EA2D21BC2692A2CB1E5B9FBECD489863EC.torrent?title=[kat.cr]avengers.age.of.ultron.2015.1080p.brrip.x264.yify'
def download_torrent(url):
name= "movie"
full_name = str(name) + ".torrent"
urllib.request.urlretrieve(url, full_name)
download_torrent(url)
It shows the following error:
Traceback (most recent call last):
File "/home/taarush/PycharmProjects/untitled/fkn.py", line 10, in <module>
download_torrent(url)
File "/home/taarush/PycharmProjects/untitled/fkn.py", line 7, in download_torrent
urllib.request.urlretrieve(url, full_name)
File "/usr/lib/python3.5/urllib/request.py", line 187, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/usr/lib/python3.5/urllib/request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 465, in open
response = self._open(req, data)
File "/usr/lib/python3.5/urllib/request.py", line 483, in _open
'_open', req)
File "/usr/lib/python3.5/urllib/request.py", line 443, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 1286, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/usr/lib/python3.5/urllib/request.py", line 1246, in do_open
r = h.getresponse()
File "/usr/lib/python3.5/http/client.py", line 1197, in getresponse
response.begin()
File "/usr/lib/python3.5/http/client.py", line 297, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.5/http/client.py", line 266, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
Where did it go wrong? Is there a way to use magnet links? (urllib library only)
Add a User-Agent HTTP header, as @Alik suggested. The question you've linked shows how. Here's an adaptation for Python 3:
#!/usr/bin/env python3
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'CERN-LineMode/2.15 libwww/2.17b3')]
urllib.request.install_opener(opener) #NOTE: global for the process
urllib.request.urlretrieve(url, filename)
See the specification for User-Agent. If the site rejects your custom User-Agent, you could send a User-Agent like those produced by ordinary browsers, for example one generated by fake_useragent.UserAgent().random.

Getting json data from imgur.com

I was trying to get JSON data from imgur.com.
To get it, one has to request this URL:
http://imgur.com/user/{Username}/index/newest/page/{pagecount}/hit.json?scrolling
Here Username and pagecount may change, so I did something like this:
import urllib2, json
Username="Tighe"
count = 0
url = "http://imgur.com/user/"+arg+"/index/newest/page/"+str(count)+"/hit.json?scrolling"
print("URL " +url)
response = urllib2.urlopen(url)
data = response.read()
I get the data, but to convert it to JSON I then did something like this:
jsonData = json.loads(data)
Now it gives an error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "imgur_battle.py", line 8, in battle
response = urllib2.urlopen(url)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1187, in do_open
r = h.getresponse(buffering=True)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1045, in getresponse
response.begin()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 409, in begin
version, status, reason = self._read_status()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 373, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
import urllib2, json
username = "Tighe"
count = 0
url = "http://imgur.com/user/"+username+"/index/newest/page/"+str(count)+"/hit.json?scrolling"
response = urllib2.urlopen(url)
data = response.read()
jsonData = json.loads(data)
print jsonData
This works without any problem.
The only issue seems to be that you are using the arg variable instead of Username when you build the URL. I got a NameError; if you didn't, presumably you have arg set to some extraneous value.

open persian url domains with urllib2

I'm trying to open the URL http://الاعلي-للاتصالات.قطر/ar/news-events/event/future-internet-privacy
with urllib2.urlopen, but it always reports an error.
The same occurs with http://الاعلي-للاتصالات.قطر/ar ... other pages (Chinese ones) open fine.
Any ideas to point me to the right way to open those urls?
urllib2.urlopen("http://الاعلي-للاتصالات.قطر/ar/news-events/event/future-internet-privacy").read()
urllib2.urlopen('http://الاعلي-للاتصالات.قطر').read()
[Edited]
the error is :
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python2.6/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.6/urllib2.py", line 391, in open
response = self._open(req, data)
File "/usr/lib/python2.6/urllib2.py", line 409, in _open
'_open', req)
File "/usr/lib/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/usr/lib/python2.6/urllib2.py", line 1170, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.6/urllib2.py", line 1142, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "/usr/lib/python2.6/httplib.py", line 914, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.6/httplib.py", line 951, in _send_request
self.endheaders()
File "/usr/lib/python2.6/httplib.py", line 908, in endheaders
self._send_output()
File "/usr/lib/python2.6/httplib.py", line 780, in _send_output
self.send(msg)
File "/usr/lib/python2.6/httplib.py", line 759, in send
self.sock.sendall(str)
I also tried u'http://الاعلي-للاتصالات.قطر'.encode('utf-8'), but the resulting URL can't be opened either.
As @Donal says, the domain name has to be Punycode-encoded. Luckily, Python includes this already. Here is some sample Python code:
domain = "الاعلي-للاتصالات.قطر"
domain_unicode = unicode(domain, "utf8")
domain_idna = domain_unicode.encode("idna")
urllib2.urlopen("http://" + domain_idna).read()
Hope this helps.

Categories

Resources